# UNIVARIATE DATA
# Categorical data
# Page 8
# A survey asks people whether they smoke or not. The responses are:
#   Yes, No, No, Yes, Yes
x <- c("Yes", "No", "No", "Yes", "Yes")
table(x) # tally how many times each response occurs

# Factors
# Page 9
x <- c("Yes", "No", "No", "Yes", "Yes")
x         # a character vector prints as quoted strings
factor(x) # a factor prints its values followed by its levels
# Note the extra "Levels:" line in the factor output.
# Bar charts
# The responses are coded 1-4 and are supplied through scan()'s `text`
# argument. A bare scan() followed by a line of raw numbers only works when
# pasted into an interactive console; under source() the raw numbers are a
# parse error.
beer <- scan(text = "3 4 1 1 3 4 3 3 1 3 2 1 2 1 2 3 2 3 1 1 1 1 4 3 1")
barplot(beer) # deliberately wrong: draws one bar per raw observation
barplot(table(beer)) # correct: call with the summarized (tabulated) data
barplot(table(beer) / length(beer)) # divide by n to plot proportions
table(beer) / length(beer) # the proportions themselves

# Pie charts
# Page 10
beer.counts <- table(beer) # store the table result
pie(beer.counts) # first pie -- slices are labelled only by the codes 1-4
# Replace the numeric codes with descriptive labels; names are assigned in
# the order of the table entries (1, 2, 3, 4).
names(beer.counts) <- c(
  "domestic\n can", "Domestic\n bottle", "Microbrew", "Import"
)
pie(beer.counts) # slices now carry the descriptive names
pie(beer.counts, col = c("purple", "green2", "cyan", "white")) # with colors
# Numerical data
# Page 11
# Read the values with scan(); passing them via `text` keeps the script
# runnable with source(), where a raw data line after a bare scan() would be
# a parse error.
sals <- scan(text = "12 .4 5 2 50 8 3 1 4 0.25")
mean(sals)    # the average
var(sals)     # the variance
sd(sals)      # the standard deviation
median(sals)  # the median
fivenum(sals) # min, lower hinge, median, upper hinge, max
summary(sals) # min, quartiles, mean, max
# Quantiles
# A small sample used to show how quantile() accepts one or several
# probabilities.
data <- c(10, 17, 18, 25, 28, 28)
summary(data)
quantile(data, probs = 0.25)          # a single value of p
quantile(data, probs = c(0.25, 0.75)) # two values of p at once
# Hinges
# PAGE 12
# Uses the `sals` vector read in earlier in this script.
sort(sals)
fivenum(sals) # lower hinge 1 is the 3rd sorted value, upper hinge 8 the 8th
summary(sals) # 1st quartile sits at position 3.25: 1/4 of the way from 1 to 2

# Resistant measures of center and spread
mean(sals, trim = 0.1) # trim 1/10 of the data off the top and the bottom
mean(sals, trim = 0.2) # trim 2/10 off each end
IQR(sals)              # interquartile range
mad(sals)              # median absolute deviation (with normalizing constant)
median(abs(sals - median(sals)))          # the same, without the constant
1.4826 * median(abs(sals - median(sals))) # with the constant applied by hand
# Stem-and-leaf Charts
# Page 13
# The scores are supplied through scan()'s `text` argument; a raw data line
# after a bare scan() only works when pasted into an interactive console.
scores <- scan(text = "2 3 16 23 14 12 4 13 2 0 0 0 6 28 31 14 4 8 2 5")
apropos("stem")          # look up the exact name of the function
stem(scores)             # default stem-and-leaf display
stem(scores, scale = 2)  # a larger scale gives a longer plot
# Numeric to Categorical
# Bin the values into three ranges with cut(), then relabel the bins.
sals <- c(12, 0.4, 5, 2, 50, 8, 3, 1, 4, 0.25)       # enter data
cats <- cut(sals, breaks = c(0, 1, 5, max(sals)))    # bins (0,1], (1,5], (5,max]
cats                                                 # view the binned values
table(cats)                                          # count per bin
cats
levels(cats) <- c("poor", "rich", "rolling in it")   # replace interval labels
table(cats)
# Histograms
# The two lines of data are joined and handed to scan() through `text`, so
# the script does not depend on being pasted into an interactive console
# (raw data lines after a bare scan() are a parse error under source()).
x <- scan(text = paste(
  "29.6 28.2 19.6 13.7 13.0 7.8 3.4 2.0 1.9 1.0 0.7 0.4 0.4 0.3 0.3",
  "0.3 0.3 0.3 0.2 0.2 0.2 0.1 0.1 0.1 0.1 0.1"
))
hist(x)                     # frequencies
hist(x, probability = TRUE) # proportions (or probabilities)
rug(jitter(x))              # add tick marks, jittered so ties stay visible
hist(x, breaks = 10)        # ask for about 10 breaks, or just hist(x, 10)
hist(x, breaks = c(0, 1, 2, 3, 4, 5, 10, 20, max(x))) # explicit break points
# Boxplots
# Page 16
library("UsingR") # read in library for these notes
data(movies)      # read in the data set holding the receipts
names(movies)     # list the available columns
# Columns are referenced explicitly as movies$col instead of attach()/detach():
# attached columns are silently masked by any workspace object of the same
# name, and an error between the two calls would leave the data attached.
boxplot(movies$current, main = "current receipts", horizontal = TRUE)
boxplot(movies$gross, main = "gross receipts", horizontal = TRUE)
## library("ts") stays disabled: no such library is available, and the calls
## below work without it.
data(lynx)    # load the data
summary(lynx) # first look at what lynx contains
# Poor man's curve fitting
# Page 18
# Draw a histogram, then overlay a frequency polygon through the bin
# midpoints, anchored at zero on the outermost breaks.
x <- c(
  0.314, 0.289, 0.282, 0.279, 0.275, 0.267, 0.266,
  0.265, 0.256, 0.250, 0.249, 0.211, 0.161
)
tmp <- hist(x) # keep the returned breaks, midpoints and counts
lines(
  x = c(min(tmp$breaks), tmp$mids, max(tmp$breaks)),
  y = c(0, tmp$counts, 0),
  type = "l"
)
# Densities
# Page 18
data(faithful)
# Attach only once: re-running the script would otherwise stack another copy
# of the data frame on the search path each time.
if (!"faithful" %in% search()) {
  attach(faithful) # make eruptions visible
}
# Proportions, not frequencies, so the density curves share the y scale.
hist(eruptions, breaks = 15, probability = TRUE)
lines(density(eruptions))                          # default bandwidth
lines(density(eruptions, bw = "SJ"), col = "red")  # SJ bandwidth, in red
lines(density(eruptions, bw = 0.01), col = "blue") # fine bandwidth, in blue
lines(density(eruptions, bw = 1), col = "green")   # coarse bandwidth, in green