View Raw SPL
/*****************************************************************************
*                                                                            *
*   PERCENTILE.SPL Copyright (C) 2007 DSP Development Corporation            *
*                               All Rights Reserved                          *
*                                                                            *
*   Author:      Randy Race                                                  *
*                                                                            *
*   Synopsis:    Estimates the percentile of a series                        *
*                                                                            *
*   Revisions:    3 Apr 2007  RRR  Creation                                  *
*                                                                            *
*****************************************************************************/

#include 


#if @HELP_PERCENTILE

    PERCENTILE

    Purpose: Estimates the percentile of a series.

    Syntax:  PERCENTILE(s, p, method)

             (y, p) = PERCENTILE(s, p, method)

                     s - A series, the input population

                     p - A real or series. The percentile as a decimal
                         fraction where 0 <= p <= 1.

                method - Optional, an integer. The percentile estimation
                         method. Defaults to 5, averaged empirical
                         distribution function. For an input series of
                         length N, the following methods are available:

                          1: Weighted Average at N*p
                          2: Closest Observation
                          3: Empirical Distribution Function
                          4: Weighted Average at (N+1)*p
                          5: Empirical Distribution Function with Averaging
                          6: Empirical Distribution Function with Interpolation


    Returns: An XY series or scalar, the estimated percentiles.

             (y, p) = percentile(s, p, method) returns the percentile
             values and decimal percentages as separate values.

    Example:
             p1 = percentile({1, 2, 3, 4, 5}, 0.9)
             p2 = percentile({1, 2, 3, 4, 5}, 0.9, 6)

             p1 == 5
             p2 == 4.6

             The 90% percentile is calculated for the series using two
             methods.

             p1 is calculated using the default Empirical Distribution
             Function with Averaging method. This is the default
             method used by SAS and other statistical packages.

             p2 is calculated using the Empirical Distribution
             Function with Interpolation method, the method used by Excel.

    Example:
             W1: gnorm(1000,1)
             W2: percentile(w1, 0.01..0.01..0.99)

             W2 is a 99 point XY series that estimates the percentile
             values of W1 from 1% to 99% in steps of 1%.

    Remarks:
             The percentile function provides estimates of proportions
             of the data that should fall above and below a given
             value.  Given p as a decimal fraction between 0 and 1, the
             pth percentile is a value such that at most (100*p)% of
             the observations are less than this value and that at most
             100*(1 - p)% are greater.

             Thus:

               The 1st percentile cuts off lowest 1% of data.

               The 98th percentile cuts off lowest 98% of data.

               The 25th percentile is the first quartile.

               The 50th percentile is the median.


             The METHOD parameter specifies the procedure to compute
             percentiles. Let N be the number of non-missing values for
             a variable, and let x1, x2, ..., xN represent the ordered
             values of the variable such that x1 is the smallest value
             and xN is the largest value. For p the percentile as a
             decimal fraction between 0 and 1, the result y is computed
             for each method as follows:

             ------------------------------------------------------------------

             Method 1, Weighted Average at x
                                            Np

             y = (1 - g) * x + g * x
                            j       j+1

             where j is the integer part of N*p and g the fractional part

             ------------------------------------------------------------------

             Method 2, Closest Value to (N-1)*p+1

              y = x    for g < 0.5
                   j

              y = x    for g >= 0.5
                   j+1

             where j is the integer part of (N-1)*p+1 and g the fractional part

             ------------------------------------------------------------------

             Method 3, Empirical Distribution Function

             y = x    for g == 0
                  j

             y = x    for g > 0
                  j+1

             where j is the integer part of N*p and g the fractional part

             ------------------------------------------------------------------

             Method 4, Weighted Average at x
                                            (N+1)p
             y = (1 - g) * x + g * x
                            j       j+1

             where j is the integer part of (N+1)*p and g the fractional part

             ------------------------------------------------------------------

             Method 5, Empirical Distribution Function with Averaging (default)

             y = 0.5 * (x  + x    ) for g == 0
                         j     j+1
             y = x                  for g > 0
                  j+1

             where j is the integer part of N*p and g the fractional part

             ------------------------------------------------------------------

             Method 6, Empirical Distribution Function with Interpolation

             y = x    + g * (x   - x   )
                  j+1         j+2   j+1

             where j is the integer part of (N-1)*p+1 and g the fractional part

             ------------------------------------------------------------------

             The default method is 5, Empirical Distribution Function
             with Averaging.

             The method parameters follow the format available with SAS.

             Excel and S-Plus use method 6.

             Minitab and SPSS use method 4.

             The percentile p is specified as a decimal fraction such that
             0 <= p <= 1.0 where 1.0 represents 100 percent.

             If p is a series, PERCENTILE returns an XY series where
             the X values are the input decimal percentages. Use

                 (y, p) = percentile(s, p, method)

             to return the percentile estimate y as a separate interval
             series.

    See Also:
             Sort
             Xylookup
#endif


/* find pth percentile of a series */
ITERATE percentile(s, p, method, tol)
{
        local nn;

        if (argc < 4)
        {
                if (argc < 3)
                {
                        if (argc < 2) error("percentile - input series and percentage between 0.0 and 1.0 required");
                        
                        method = 5;
                }
                
                tol = 1e-9;
        }

        /* remove missing values if necessary */
        if (max(nn = isnavalue(s)) == 1)
        {
                s = delete(s, nn);
        }

        if (isarray(p))
        {
                if (max(nn = isnavalue(p)) == 1)
                {
                        p = delete(p, nn);
                }
        }

        switch (method)
        {
                case 1: /* weighted average at n*p */
                        v = percent_wn(s, p);
                        break;

                case 2: /* closest observation */
                        v = percent_closest(s, p);
                        break;

                case 3: /* empirical distribution function */
                        v = percent_edf(s, p, tol);
                        break;

                case 4: /* weighted average at (n+1)*p */
                        v = percent_wn1(s, p);
                        break;

                case 5: /* /* empirical distribution function with averaging */
                        v = percent_edfa(s, p, tol);
                        break;

                case 6: /* empirical distribution function with interpolation */
                        v = percent_edfi(s, p);
                        break;

                default:
                        error(sprintf("percentile - unknown method %d", method));
                        break;
        }

        if (isarray(v))
        {
                v = yvals(v);
        }

        if (outargc > 1)
        {
                return(v, p);
        }
        else
        {
                if (isarray(v))
                {
                        return(xy(p, v));
                }
                else
                {
                        return(v);
                }
        }
}


/* empirical distribution function */
percent_edf(s, p, tol)
{
        local t, v;

        if (argc < 3)
        {
                tol = 1e-9;
        }

        t = sort(s, 1);
        setxoffset(t, 1.0);
        setdeltax(t, 1.0);

        p = ceil(p * length(t) - tol);

        v = xylookup(t, p, "none", 1);
        return(v);
}


/* empirical distribution function with averaging */
percent_edfa(s, p, tol)
{
        local t, v, N, g, k, scalarval;

        if (argc < 3)
        {
                tol = 1e-9;
        }

        t = sort(s, 1);
        N = length(t);
        setxoffset(t, 1.0);
        setdeltax(t, 1);

        if ((scalarval = isscalar(p)))
        {
                /* cast to series */
                p = castseries(p);
        }
        
        p2 = p * N;
        g = p2 - int(p2);

        p2 = ceil(p2);

        k = find(g <= tol);
        
        if (length(k) > 0)
        {
                p2[k] = floor(p[k] * N) + 0.5;
        }

        if (scalarval)
        {
                p2 = castreal(p2);
        }
        
        v = xylookup(t, p2, "linear", 1);
        
        return(v);
}


/* weighted average at n+1 - Minitab and SPSS */
percent_wn1(s, p)
{
        local t, v;

        t = sort(s, 1);
        setxoffset(t, 1);
        setdeltax(t, 1);

        p = p * (length(t) + 1);

        v = xylookup(t, p, "linear", 1);
        
        return(v);
}


/* weighted average at n */
percent_wn(s, p)
{
        local t, v;

        t = sort(s, 1);
        setxoffset(t, 1);
        setdeltax(t, 1);

        p = p * length(t);

        v = xylookup(t, p, "linear", 1);
        
        return(v);
}


/* closest observation */
percent_closest(s, p)
{
        local t, v;

        t = sort(s, 1);
        setxoffset(t, 1.0);
        setdeltax(t, 1);

        p = round(p * length(t));

        v = xylookup(t, p, "none", 1);
        
        return(v);
}


/* empirical distribution function with interpolation - Excel and S-Plus */
percent_edfi(s, p)
{
        local t, v;

        t = sort(s, 1);
        setxoffset(t, 0);
        setdeltax(t, 1 / (length(t) - 1));

        v = xylookup(t, p, "linear", 1);
        
        return(v);
}