View Raw SPL
/*****************************************************************************
* *
* PERCENTILE.SPL Copyright (C) 2007 DSP Development Corporation *
* All Rights Reserved *
* *
* Author: Randy Race *
* *
* Synopsis: Estimates the percentile of a series *
* *
* Revisions: 3 Apr 2007 RRR Creation *
* *
*****************************************************************************/
#include
#if @HELP_PERCENTILE
PERCENTILE
Purpose: Estimates the percentile of a series.
Syntax: PERCENTILE(s, p, method)
(y, p) = PERCENTILE(s, p, method)
s - A series, the input population
p - A real or series. The percentile as a decimal
fraction where 0 <= p <= 1.
method - Optional, an integer. The percentile estimation
method. Defaults to 5, averaged empirical
distribution function. For an input series of
length N, the following methods are available:
1: Weighted Average at N*p
2: Closest Observation
3: Empirical Distribution Function
4: Weighted Average at (N+1)*p
5: Empirical Distribution Function with Averaging
6: Empirical Distribution Function with Interpolation
Returns: An XY series or scalar, the estimated percentiles.
(y, p) = percentile(s, p, method) returns the percentile
values and decimal percentages as separate values.
Example:
p1 = percentile({1, 2, 3, 4, 5}, 0.9)
p2 = percentile({1, 2, 3, 4, 5}, 0.9, 6)
p1 == 5
p2 == 4.6
The 90% percentile is calculated for the series using two
methods.
p1 is calculated using the default Empirical Distribution
Function with Averaging method. This is the default
method used by SAS and other statistical packages.
p2 is calculated using the Empirical Distribution
Function with Interpolation method, the method used by Excel.
Example:
W1: gnorm(1000,1)
W2: percentile(w1, 0.01..0.01..0.99)
W2 is a 99 point XY series that estimates the percentile
values of W1 from 1% to 99% in steps of 1%.
Remarks:
The percentile function provides estimates of proportions
of the data that should fall above and below a given
value. Given p as a decimal fraction between 0 and 1, the
pth percentile is a value such that at most (100*p)% of
the observations are less than this value and that at most
100*(1 - p)% are greater.
Thus:
The 1st percentile cuts off lowest 1% of data.
The 98th percentile cuts off lowest 98% of data.
The 25th percentile is the first quartile.
The 50th percentile is the median.
The METHOD parameter specifies the procedure to compute
percentiles. Let N be the number of non-missing values for
a variable, and let x1, x2, ..., xN represent the ordered
values of the variable such that x1 is the smallest value
and xN is the largest value. For p the percentile as a
decimal fraction between 0 and 1, the result y is computed
for each method as follows:
------------------------------------------------------------------
Method 1, Weighted Average at x
Np
y = (1 - g) * x + g * x
j j+1
where j is the integer part of N*p and g the fractional part
------------------------------------------------------------------
Method 2, Closest Value to (N-1)*p+1
y = x for g < 0.5
j
y = x for g >= 0.5
j+1
where j is the integer part of (N-1)*p+1 and g the fractional part
------------------------------------------------------------------
Method 3, Empirical Distribution Function
y = x for g == 0
j
y = x for g > 0
j+1
where j is the integer part of N*p and g the fractional part
------------------------------------------------------------------
Method 4, Weighted Average at x
(N+1)p
y = (1 - g) * x + g * x
j j+1
where j is the integer part of (N+1)*p and g the fractional part
------------------------------------------------------------------
Method 5, Empirical Distribution Function with Averaging (default)
y = 0.5 * (x + x ) for g == 0
j j+1
y = x for g > 0
j+1
where j is the integer part of N*p and g the fractional part
------------------------------------------------------------------
Method 6, Empirical Distribution Function with Interpolation
y = x + g * (x - x )
j+1 j+2 j+1
where j is the integer part of (N-1)*p+1 and g the fractional part
------------------------------------------------------------------
The default method is 5, Empirical Distribution Function
with Averaging.
The method parameters follow the format available with SAS.
Excel and S-Plus use method 6.
Minitab and SPSS use method 4.
The percentile p is specified as a decimal fraction such that
0 <= p <= 1.0 where 1.0 represents 100 percent.
If p is a series, PERCENTILE returns an XY series where
the X values are the input decimal percentages. Use
(y, p) = percentile(s, p, method)
to return the percentile estimate y as a separate interval
series.
See Also:
Sort
Xylookup
#endif
/* find pth percentile of a series */
ITERATE percentile(s, p, method, tol)
{
local nn;
if (argc < 4)
{
if (argc < 3)
{
if (argc < 2) error("percentile - input series and percentage between 0.0 and 1.0 required");
method = 5;
}
tol = 1e-9;
}
/* remove missing values if necessary */
if (max(nn = isnavalue(s)) == 1)
{
s = delete(s, nn);
}
if (isarray(p))
{
if (max(nn = isnavalue(p)) == 1)
{
p = delete(p, nn);
}
}
switch (method)
{
case 1: /* weighted average at n*p */
v = percent_wn(s, p);
break;
case 2: /* closest observation */
v = percent_closest(s, p);
break;
case 3: /* empirical distribution function */
v = percent_edf(s, p, tol);
break;
case 4: /* weighted average at (n+1)*p */
v = percent_wn1(s, p);
break;
case 5: /* /* empirical distribution function with averaging */
v = percent_edfa(s, p, tol);
break;
case 6: /* empirical distribution function with interpolation */
v = percent_edfi(s, p);
break;
default:
error(sprintf("percentile - unknown method %d", method));
break;
}
if (isarray(v))
{
v = yvals(v);
}
if (outargc > 1)
{
return(v, p);
}
else
{
if (isarray(v))
{
return(xy(p, v));
}
else
{
return(v);
}
}
}
/* empirical distribution function */
percent_edf(s, p, tol)
{
local t, v;
if (argc < 3)
{
tol = 1e-9;
}
t = sort(s, 1);
setxoffset(t, 1.0);
setdeltax(t, 1.0);
p = ceil(p * length(t) - tol);
v = xylookup(t, p, "none", 1);
return(v);
}
/* empirical distribution function with averaging */
percent_edfa(s, p, tol)
{
local t, v, N, g, k, scalarval;
if (argc < 3)
{
tol = 1e-9;
}
t = sort(s, 1);
N = length(t);
setxoffset(t, 1.0);
setdeltax(t, 1);
if ((scalarval = isscalar(p)))
{
/* cast to series */
p = castseries(p);
}
p2 = p * N;
g = p2 - int(p2);
p2 = ceil(p2);
k = find(g <= tol);
if (length(k) > 0)
{
p2[k] = floor(p[k] * N) + 0.5;
}
if (scalarval)
{
p2 = castreal(p2);
}
v = xylookup(t, p2, "linear", 1);
return(v);
}
/* weighted average at n+1 - Minitab and SPSS */
percent_wn1(s, p)
{
local t, v;
t = sort(s, 1);
setxoffset(t, 1);
setdeltax(t, 1);
p = p * (length(t) + 1);
v = xylookup(t, p, "linear", 1);
return(v);
}
/* weighted average at n */
percent_wn(s, p)
{
local t, v;
t = sort(s, 1);
setxoffset(t, 1);
setdeltax(t, 1);
p = p * length(t);
v = xylookup(t, p, "linear", 1);
return(v);
}
/* closest observation */
percent_closest(s, p)
{
local t, v;
t = sort(s, 1);
setxoffset(t, 1.0);
setdeltax(t, 1);
p = round(p * length(t));
v = xylookup(t, p, "none", 1);
return(v);
}
/* empirical distribution function with interpolation - Excel and S-Plus */
percent_edfi(s, p)
{
local t, v;
t = sort(s, 1);
setxoffset(t, 0);
setdeltax(t, 1 / (length(t) - 1));
v = xylookup(t, p, "linear", 1);
return(v);
}