qqskew.f<-function(data,...)
{
# Q-Q plots for skewed distribution functions (lognormal, Pareto, or
# exponential).
#
# AUTHOR: Chris Hlavka
# NASA/Ames Research Center
# Moffett Field, CA 94035-1000
# chlavka@mail.arc.nasa.gov or chlavka@pacbell.net
#
# PERMISSION is hereby given to Statlib to redistribute this software.
# It can freely be copied,redistributed and used for non-commercial purposes.
#
# This function was developed for use by the U.S. Government as represented
# by the Administrator of the National Aeronautics and Space Administration.
# The software is provided "as is" without warranty. No copyright is claimed in
# the United States on behalf of the U.S. government.
#
# SHORT DESCRIPTION: Generate plots to determine whether skewed
# univariate data fits either a lognormal, Pareto (aka "fractal"
# or "Power"), or exponential distribution.
#
# REQUIRED ARGUMENTS:
#
# data - a vector of positive values.
#
# DETAILED DESCRIPTION: Generate quantile-quantile (QQ) plots for testing
# the fit of the "data" to three common types of skewed distributions: the
# lognormal, Pareto (aka power or fractal), and exponential (aka negative
# exponential) distribution models with the following forms for the
# probability density functions:
#
# lognormal: f(x) = exp{-log[(x/m)^2)/(2*s^2)]}/[x*s*sqrt(2*pi)] for x >=0
# (this is equivalent to saying that log(x) has a Gaussian normal
# distribution with mean m and standard deviation s)
#
# Pareto: f(x) = s*(k^s)/[x^(s+1)] for x >=k, with shape s > 0 and k > 0
#
# exponential: f(x) = exp(-x/m)/b for x >=0, with mean m > 0
#
# Plots of the histogram (linear, semilog, and log-log scale) are also
# generated, and the binning, that is, the data classes, are different
# for the semilog and log-log plots than for the linear plot.
#
# If the "data" fits the specified distribution type, the values on the QQ
# plot will be approximately on a straight line. The highest values will have
# more variation around the line than smaller values due to smaller
# probability density. If "data" within only a certain range comes from a
# Pareto or exponential distribution, then only data in that range will be on
# a straight line. The characteristics of the lognormal QQ-plot
# are more complicated, but if "data" fits the model within a certain range,
# for example if the "data" is from a truncated sample of lognormally
# distributed observations, then the values will be approximately on a
# straight line in the middle of the range. If lognormal data is truncated
# near or above the mode so that most or all values exceed the mode, it
# may resemble and be difficult to distinguish from exponentially
# distributed data.
#
#
# The user selects the distribution type from an interactive menu if in
# interactive mode; quantile-quantile plots for all distribution types
# will be generated in batch mode.
#
# EXAMPLE USAGE:
#
# postscript(horiz=F)
# par(mfrow=c(2,1))
#
# temp<-exp(rnorm(1000)) # lognormal with m = 0 and s = 1
# qqskew.f(temp,main="pseudo-lognormal data")
# qqskew.f(sort(temp)[201:1000],main="truncated pseudo-lognormal")
#
# temp<--log(runif(1000)) # exponential with m = 1
# qqskew.f(temp,main="exponential data")
# qqskew.f(temp[temp>1],main="truncated exponential data")
#
# temp<-(1/runif(1000))^2 # Pareto with shape = 0.5 and k = 1
# qqskew.f(temp,main="Pareto data")
# dev.off()
#
# Note that the semi-log histogram of lognormal data looks Gaussian (normal)
# and the semi-log histogram of Pareto data looks exponential. The log-log
# histogram of the Pareto data looks like a triangle with right angle at
# the origin.
#
# RELATED S FUNCTIONS: qqnorm,qqplot
#
# REFERENCES:
# Chambers, J. M., Cleveland, W. S., Kleiner, B. and Tukey, P.,1983, Graphical
# Methods for Data Analysis, (Belmont, California: Wadsworth International Group
# and Boston: Duxbury Press).
#
# Hastings, N. A. J. and Peacock, J. B., 1975, Statistical Distributions:
# A Handbook for Students and Practitioners, (Toronto: John Wiley & Sons),
# pp. 102-105.
#
# Johnson, N. L. and Kotz, S., 1970, Continuous Univariate Distributions-1
# (New York: # John Wiley & Sons), pp. 112-132 and 233-247.
#
# Hlavka, C. A. and J. L. Dungan, 2002. Areal estimates of fragmented land
# cover - effects of pixel size and model-based corrections. International
# Journal of Remote Sensing 23(4): 711-724
#
#############################################################################
# test input
if (length(data) < 100) stop("insufficient amount of data")
if (length(data[data <=0])> 0) stop("invalid (0 or negative) data")
#
# set plot parameters to save white space
#
par(mgp=c(2,0,0),tck=0.02)
#
# plot histograms
#
vals<-unique(data)
N<-round(min(length(vals),10*log(length(vals)),200)) # number of bars in plots
MIN<-min(vals);MAX<-max(vals)
W<-(MAX - MIN)/N
breaks<-seq(MIN,MAX,W);breaks[1]<-breaks[1] - W/100
Data<-(breaks[1:N] + breaks[2:(N+1)])/2
MIN<-min(log(vals));MAX<-max(log(vals))
W<-(MAX - MIN)/N
frequency<-rep(0,N);data.sum<-rep(0,N) # intialize vectors
lbreaks<-exp(seq(MIN,MAX,W));lbreaks[1]<-0.9999*lbreaks[1]
logData<-(lbreaks[1:N] + lbreaks[2:(N+1)])/2
lfrequency<-rep(0,N);ldata.sum<-rep(0,N)
for (i in 1:N)
{
frequency[i]<-length(data[data > breaks[i] & data <= breaks[i+1]])
lfrequency[i]<-length(data[data > lbreaks[i] & data <= lbreaks[i+1]])
}
plot(Data,frequency,type="n",...)
mtext("histogram",side=3,line=1)
points(Data,frequency,type="h",lwd=5)
Data<-logData;frequency<-lfrequency
Data<-Data[frequency > 0];frequency<-frequency[frequency > 0]
plot(Data,frequency,type="n",log="x",...)
mtext("semi-log histogram",side=3,line=1)
points(Data,frequency,type="h",lwd=5)
plot(Data,frequency,type="n",log="xy",...)
mtext("log-log histogram",side=3,line=1)
points(Data,frequency,type="h",lwd=5)
#
# Q-Q plots
#
N <- length(data)
probs <- seq(0.5/N, (N - 0.5)/N, by = 1/N)
mmenu<-1
if(interactive()) mmenu<-menu(c("all","lognormal","Pareto","exponential"),
title="DISTRIBUTION TYPE")
if (mmenu == 2 | mmenu ==1)
{ qqnorm(log10(data),...)
mtext("QQ plot for lognormal distribution",side=3,line=1)}
if (mmenu == 3 | mmenu == 1)
{Q <- - log(1 - probs)
plot(Q, log10(sort(data)), xlab = "Quantiles of log-Pareto", ...)
mtext("QQ plot for Pareto distribution",side=3,line=1) }
if (mmenu == 4 | mmenu == 1)
{Q <- - log(1 - probs)
plot(Q, sort(data), xlab = "Quantiles of exponential", ...)
mtext("QQ plot for exponential distribution",side=3,line=1)}
}