#### Quant 1 #### Week 2 Code library(foreign) # this library needed to load the Stata file dail2002 <- read.dta("dail2002.dta") table(dail2002$party) # beware of datatypes table(dail2002$incumb) class(dail2002$incumb) class(dail2002$party) dail2002$incumbf <- factor(dail2002$incumb, labels=c("Challenger", "Incumbent")) class(dail2002$incumbf) table(dail2002$incumbf) # barplots country <- c("Canada","China","England","Germany","Greece","Other") f <- c(5,7,2,5,3,4) sum(f) barplot(f, names=country) #horizontal: barplot(f, names=country, horiz=T, las=1, cex.names=.9) # ordered horizontal cbind(f,country) (ordering <- order(f)) barplot(f[ordering], names=country[ordering], horiz=T, las=1, cex.names=.9) #bad idea: library(UsingR) barplot(central.park$MAX, names.arg=1:31, xlab="Day", ylab="Max Temperature") #better: plot(1:31, central.park$MAX, type="o", ylim=c(40,80), xlab="Day", ylab="Max Temperature") axis(1, at=1:31) # pie charts are problematic pie(table(party)) #dot charts: #library(foreign) #dail2002 <- read.dta("dail2002.dta") (spendmeans <- tapply(dail2002$spend_total,dail2002$party,mean)) dotchart(spendmeans, xlab="Total Spending in euros") #stem and leaf plot stem(dail2002$votes1st) ### the mode in R x <- c(1, 2, 3, 1, 1, 5, 5, 4, 1, 4, 4, 3) mode(x) # this means something very different in R! table(x) # do a frequency distribution # see which values are the maximum values which(table(x)==max(table(x))) # bimodal distribution x <- c(6, 6, 7, 2, 6, 1, 2, 3, 2, 4) which(table(x)==max(table(x))) ### the median x <- c(1, 5, 6, 7, 7, 8) median(x) # manual solution with odd no of cases x <- c(4, 8, 15, 16, 23, 42, 45, 2, 7, 33, 4, 46, 997) sort(x) length(x) # length of vector is N length(x)%%2 # modulus division: is there are remainder? (length(x)+1)/2 # the position of the median by our formula length(x)%%2==0 # no remainder means this number is ODD median(x) # use R to compute median # manual solution with even no of cases x <- c(x,998) # add one more number to our test vector x # look at the vector sort(x) # look at the vector sorted (length(x)+1)/2 # position of new median value length(x)%%2 (length(x)+1)/2 # the position of the median by our formula length(x)%%2==0 # should be TRUE if even median(x) ### The mean # sensitivity to extreme values a <- c(5,6,6,7,8,9,10,10) b <- c(5,6,6,7,8,9,10,95) median(a) median(b) mean(a) mean(b) # Multimodal distributions #load("dail2002.Rdata") #attach(dail2002) summary(spend_total) mean(spend_total) plot(density(spend_total)) abline(v=mean(spend_total), col="red") abline(v=median(spend_total), col="blue", lty="dashed") # geometric mean prod(10, 1, 1000, 1, 10) prod(10, 1, 1000, 1, 10)^(1/5) exp(1/5 * sum(10, 1, 100, 10)) exp(mean(log(c(10, 1, 1000, 1, 10)))) # harmonic mean x <- c(1,2,4,1) 1/mean(1/x) length(x)/sum(1/x) # weighted mean with example from built-in dataset data(state) # load the built-in state dataset state.x77[1:5,1:2] attach(data.frame(state.x77)) # coerce as data frame and attach mean(Income) weighted.mean(Income, Population) # automatically normalizes weight median(Income) ### percentiles illustrated x <- (round(runif(100)*100)) # create 100 random values 0-100 xsorted <- sort(x) # sort the values xsorted[c(1,25,50,75,100)] # min; 25th, 50th, 75th %tile; max summary(xsorted) # ask R for same information # inter-quartile range IQR(dail2002$spend_total) IQR(dail2002$spend_total, na.rm=TRUE) sd(dail2002$spend_total, na.rm=TRUE) summary(dail2002$spend_total) fivenum(dail2002$spend_total) summary(x) # sorting makes no difference quantile(x,0.25) # 25th percentile using quantile() quantile(x,0.75) # 75th percentile quantile(x,0.50) # 50th percentile (median) median(x) # median (50th percentile) # compute position of 25th %tile using formula (p <- (25/100)*(length(xsorted)+1)) x[floor(p)] # the position before p x[ceiling(p)] # the position after p ### Histograms load("dail2002.Rdata") attach(dail2002) hist(spend_total, xlab="Candidate Spending", main="") ### Kernel density plot plot(density(spend_total)) #combined: hist(spend_total) lines(density(spend_total)) # note the use of lines() ### box plot plot(party, spend_total)