# This file contains replication materials for analyses in Kenneth # Benoit, Michael Laver, and Slava Mikhaylov. # 2009. "Treating Words as Data with Error: Estimating Uncertainty in # Text Statements of Policy Positions". American # Journal of Political Science 53 (2, April). # All the analysis in the article can be reproduced by running R script # contained in "blm_ajps_replication.r". In order to produce standard # error estimates for the CMP data you need a CMP dataset. For our # analysis we used a concatenation of datasets published with MPP1 and # MPP2. # Part of our analysis is the replication of two recent AJPS # publications: a) Adams, James, M. Clark, L. Ezrow & # G. Glasgow. 2006. "Are Niche Parties Fundamentally Different from # Mainstream Parties? The Causes and the Electoral Consequences of # Western European Parties' Policy Shifts, 1976-1998." American J of # Political Science 50(3):513-529. b) Hix, Simon, Abdul Noury & Gerard # Roland. 2006. "Dimensions of Politics in the European Parliament." # American Journal of Political Science 50(2, April):494-511. # In order to replicate our replications you also need Adams' et al # (2006) and Hix' et al (2006) replication files. Both datasets are also # available on this website with the kind permission of the authors. # The key outcome of our analysis is to produce standard error estimates # for the CMP data. Our results are contained in # "BLM_CMP_uncertainty.Rdata" and identical Stata dataset # "BLM_CMP_uncertainty.dta". These standard error estimates can and as # we showed should be used in any research that utilizes the CMP # data. Our standard error dataset contains two variables that uniquely # identify the results: "cmp_party" and "cmp_edate". These two unique # identifiers allow merging standard error estimates with any CMP based # dataset. For example, in Stata this is done simply by typing the # following commands: # use " BLM_CMP_uncertainty.dta", clear # merge party edate using "~/Desktop/ YOUR_CMP_DATASET.dta", sort # See Stata manual for further information on -merge- command. ############################################# ### 1. CREATING UNCERTAINTY ESTIMATES FOR ### CMP DATA ############################################# rm(list = ls()) # clear require(foreign) require(Hmisc) require(combinat) require(simex) cmp <- read.dta("mds2005f.dta", convert.factors=FALSE) #cmp <- read.dta("YOUR_CMP_DATASET.dta", convert.factors=FALSE) # sort country edate party cmp <- cmp[order(cmp$country, cmp$edate, cmp$party),] # convert missing data code to NA cmp$peruncod[round(cmp$peruncod,2)==99.99] <- NA cmp$total[round(cmp$total,0)==9999] <- NA # select out cases which are extended cats extendedcats <- apply(cmp[,86:139],1,sum) sum(extendedcats>0) cmp <- cmp[!extendedcats,] # generate frequency matrix cmpN = round(cmp[,30:140]/100*cmp$total,0) dimnames(cmpN)=list(1:nrow(cmpN),sub('per', 'n', names(cmpN))) #We thank Tim Veen for the solution to cross platform compatibility of Hmisc's "translate" function. dimnames(cmpN)[[2]][ncol(cmpN)] <- c("n0") # DROP EXTENDED CATEGORIES cmpN <- cmpN[,c(ncol(cmpN), seq(1,56))] # Put actual dataset into a data frame called "dataWide" dataWide <- cbind(1:nrow(cmp), cmp$country, cmp$party, cmp$edate, cmp$total, cmpN) dimnames(dataWide)[[2]][1] <- c("manifID") dataWide <- data.frame(dataWide) # drop any cases that have missing peruncod dataWide <- dataWide[!is.na(dataWide$n0),] ########################################################## ## Try Add-One Smoothing (Jurafsky and Martin Ch 6.3) ########################################################## N <- apply(dataWide[,6:ncol(dataWide)], 1, sum) V <- 57 dataWide.smooth <- dataWide dataWide.smooth[,6:ncol(dataWide.smooth)] <- (dataWide.smooth[,6:ncol(dataWide.smooth)]+1) * N / (N + V) rile.index.left <- names(dataWide.smooth) %in% paste("n", c(103,105:107,202,403,404,406,412,413,504,506,701), sep="") rile.index.right <- names(dataWide.smooth) %in% paste("n", c(104,201,203,305,401,402,407,414,505,601,603,605,606), sep="") rile.raw <- (apply(dataWide[,rile.index.right],1,sum) - apply(dataWide[,rile.index.left],1,sum)) / N * 100 rile.smooth <- (apply(dataWide.smooth[,rile.index.right],1,sum) - apply(dataWide.smooth[,rile.index.left],1,sum)) / N * 100 plot(dataWide[,"n501"]/N*100, dataWide.smooth[,"n501"]/N*100+1, log="xy", cex=.8, xlim=c(1,70), ylim=c(1,70), xlab="Unsmoothed Environment PER501", ylab="Add-One Smoothed Environment PER501") points(0:70,0:70,type="l", lty=2, col="gray10") # set or obtain important parameters for simulation draws nman <- nrow(dataWide) # number of manifestos in dataset nrepl <- 1000 # number of replications required ## create the simulation matrixes to be filled in ## note: first row "0" of nrepl simulations is ACTUAL CMP value manifBSn <- manifBSnRand <- array(as.matrix(dataWide[,6:62]), c(nman, 57, nrepl+1), dimnames=list(1:nman, names(dataWide[,6:62]), 0:nrepl)) ########################################################## ## bootstrap the manifesto counts and compute percentages ## (uses a multinomial draw) ########################################################## require(combinat) # take totals from replication "0" which is actual manifesto n <- apply(manifBSn[1:nrow(manifBSn),,1],1,sum) # take given probabilities from actual manifesto p <- manifBSn[,,1]/n # MAIN RESAMPLING LOOP - Fixed N cat("Resample number: ") for (i in 1:nrepl) { cat(i," ") manifBSn[,,i] <- rmultinomial(n, p) } cat("\n") ########################################################## ## Compute quantities from simulations ########################################################## # RILE rileBS <- ((manifBSn[,"n104",] + # right categories of Rile manifBSn[,"n201",] + manifBSn[,"n203",] + manifBSn[,"n305",] + manifBSn[,"n401",] + manifBSn[,"n402",] + manifBSn[,"n407",] + manifBSn[,"n414",] + manifBSn[,"n505",] + manifBSn[,"n601",] + manifBSn[,"n603",] + manifBSn[,"n605",] + manifBSn[,"n606",]) - (manifBSn[,"n103",] + # left categories of Rile manifBSn[,"n105",] + manifBSn[,"n106",] + manifBSn[,"n107",] + manifBSn[,"n403",] + manifBSn[,"n404",] + manifBSn[,"n406",] + manifBSn[,"n412",] + manifBSn[,"n413",] + manifBSn[,"n504",] + manifBSn[,"n506",] + manifBSn[,"n701",] + manifBSn[,"n202",])) /n*100 # convert to percentage rileSE <- apply(rileBS,1,sd) rileMean <- apply(rileBS,1,mean) # env (PER501) envMean <- apply(manifBSn[,"n501",]/n*100, 1, mean) #### per101:per110 per101SE <- apply(manifBSn[,"n101",]/n*100, 1, sd) per102SE <- apply(manifBSn[,"n102",]/n*100, 1, sd) per103SE <- apply(manifBSn[,"n103",]/n*100, 1, sd) per104SE <- apply(manifBSn[,"n104",]/n*100, 1, sd) per105SE <- apply(manifBSn[,"n105",]/n*100, 1, sd) per106SE <- apply(manifBSn[,"n106",]/n*100, 1, sd) per107SE <- apply(manifBSn[,"n107",]/n*100, 1, sd) per108SE <- apply(manifBSn[,"n108",]/n*100, 1, sd) per109SE <- apply(manifBSn[,"n109",]/n*100, 1, sd) per110SE <- apply(manifBSn[,"n110",]/n*100, 1, sd) #### per201:per204 per201SE <- apply(manifBSn[,"n201",]/n*100, 1, sd) per202SE <- apply(manifBSn[,"n202",]/n*100, 1, sd) per203SE <- apply(manifBSn[,"n203",]/n*100, 1, sd) per204SE <- apply(manifBSn[,"n204",]/n*100, 1, sd) #### per301:per305 per301SE <- apply(manifBSn[,"n301",]/n*100, 1, sd) per302SE <- apply(manifBSn[,"n302",]/n*100, 1, sd) per303SE <- apply(manifBSn[,"n303",]/n*100, 1, sd) per304SE <- apply(manifBSn[,"n304",]/n*100, 1, sd) per305SE <- apply(manifBSn[,"n305",]/n*100, 1, sd) #### per401:per416 per401SE <- apply(manifBSn[,"n401",]/n*100, 1, sd) per402SE <- apply(manifBSn[,"n402",]/n*100, 1, sd) per403SE <- apply(manifBSn[,"n403",]/n*100, 1, sd) per404SE <- apply(manifBSn[,"n404",]/n*100, 1, sd) per405SE <- apply(manifBSn[,"n405",]/n*100, 1, sd) per406SE <- apply(manifBSn[,"n406",]/n*100, 1, sd) per407SE <- apply(manifBSn[,"n407",]/n*100, 1, sd) per408SE <- apply(manifBSn[,"n408",]/n*100, 1, sd) per409SE <- apply(manifBSn[,"n409",]/n*100, 1, sd) per410SE <- apply(manifBSn[,"n410",]/n*100, 1, sd) per411SE <- apply(manifBSn[,"n411",]/n*100, 1, sd) per412SE <- apply(manifBSn[,"n412",]/n*100, 1, sd) per413SE <- apply(manifBSn[,"n413",]/n*100, 1, sd) per414SE <- apply(manifBSn[,"n414",]/n*100, 1, sd) per415SE <- apply(manifBSn[,"n415",]/n*100, 1, sd) per416SE <- apply(manifBSn[,"n416",]/n*100, 1, sd) #### per501:per507 per501SE <- apply(manifBSn[,"n501",]/n*100, 1, sd) per502SE <- apply(manifBSn[,"n502",]/n*100, 1, sd) per503SE <- apply(manifBSn[,"n503",]/n*100, 1, sd) per504SE <- apply(manifBSn[,"n504",]/n*100, 1, sd) per505SE <- apply(manifBSn[,"n505",]/n*100, 1, sd) per506SE <- apply(manifBSn[,"n506",]/n*100, 1, sd) per507SE <- apply(manifBSn[,"n507",]/n*100, 1, sd) #### per601:per608 per601SE <- apply(manifBSn[,"n601",]/n*100, 1, sd) per602SE <- apply(manifBSn[,"n602",]/n*100, 1, sd) per603SE <- apply(manifBSn[,"n603",]/n*100, 1, sd) per604SE <- apply(manifBSn[,"n604",]/n*100, 1, sd) per605SE <- apply(manifBSn[,"n605",]/n*100, 1, sd) per606SE <- apply(manifBSn[,"n606",]/n*100, 1, sd) per607SE <- apply(manifBSn[,"n607",]/n*100, 1, sd) per608SE <- apply(manifBSn[,"n608",]/n*100, 1, sd) #### per701:per706 per701SE <- apply(manifBSn[,"n701",]/n*100, 1, sd) per702SE <- apply(manifBSn[,"n702",]/n*100, 1, sd) per703SE <- apply(manifBSn[,"n703",]/n*100, 1, sd) per704SE <- apply(manifBSn[,"n704",]/n*100, 1, sd) per705SE <- apply(manifBSn[,"n705",]/n*100, 1, sd) per706SE <- apply(manifBSn[,"n706",]/n*100, 1, sd) ######################################## ## SAVE DATA, WRITE OUT STATA DATASET ######################################## save.image(file="BLM_CMP_uncertainty.Rdata", safe=FALSE) dataBSrile <- data.frame(cbind(dataWide[,1:5], per101SE, per102SE, per103SE, per104SE, per105SE, per106SE, per107SE, per108SE, per109SE, per110SE, per201SE, per202SE, per203SE, per204SE, per301SE, per302SE, per303SE, per304SE, per305SE, per401SE, per402SE, per403SE, per404SE, per405SE, per406SE, per407SE, per408SE, per409SE, per410SE, per411SE, per412SE, per413SE, per414SE, per415SE, per416SE, per501SE, per502SE, per503SE, per504SE, per505SE, per506SE, per507SE, per601SE, per602SE, per603SE, per604SE, per605SE, per606SE, per607SE, per608SE, per701SE, per702SE, per703SE, per704SE, per705SE, per706SE, rileSE, rileMean, envMean)) write.dta(dataBSrile, "BLM_CMP_uncertainty.dta", version=7) ############################################# ### ### 2. REPLICATING RESULTS IN THE ARTICLE ### ############################################# ############## ## GRAPHICS ## ############## ########## Figure 2: Movement on environmental policy of German CDU-CSU over time attach(dataBSrile) quartz() envCIlo = envMean - 2*per501SE envCIhi = envMean + 2*per501SE df <- data.frame(cmp.party, envMean, envCIlo, envCIhi, cmp.edate) df <- df[cmp.party==41521,] par(xaxt="n") plot(df$cmp.edate, df$envMean, type="b", lty="dashed", bty="n", ylab="CDU-CSU Position on CMP Environment Dimension (PER501)", ylim=c(-5,25)) errbar(df$cmp.edate, df$envMean, df$envCIlo, df$envCIhi, add=TRUE, cap=.01) par(xaxt="s") axis(1, at=df$cmp.edate, substring(as.Date(df$cmp.edate),1,4), las=2) ########## Figure 3: Left-Right placement of the major French parties in 2002. quartz() par(mfrow=c(1,1),cex.axis=1.2,cex.lab=1.4,cex=1,cex.main=1) rileCIlo = rileMean - 2*rileSE rileCIhi = rileMean + 2*rileSE df <- data.frame(cmp.party, rileMean, rileCIlo, rileCIhi, cmp.edate) df2 <- df[cmp.country==31 & cmp.edate=="2002-06-09",] df2 <- df2[order(df2$rileMean),] par(yaxt="n", mar=c(5, 3, 1, 2) + 0.1) errbar(rep("",6), df2$rileMean, df2$rileCIlo, df2$rileCIhi, ylab="CMP Right-Left Dimension") text(df2$rileMean, (1:6)+.2, c("PCF","PS","Verts","UMP","UDF","FN"), cex=1.2) ########## TABLE 1 CHANGE OVER TIME ## ## /****** WITHOUT RANDOM QUASI-SENTENCES *************/ ## // by party (edate): gen rilePrev = rile[_n-1] ## // by party (edate): gen rilePrevSE = rileSE[_n-1] ## by party (edate): gen rileChange = rile - rile[_n-1] ## by party (edate): gen rileChangeSE = sqrt(rileSE^2 + rileSE[_n-1]^2) ## gen sigChange = abs(rileChange) > 1.96*rileChangeSE ## replace sigChange=. if rileChange==. ## by party (edate): gen edatePrev = edate[_n-1] ## format edatePrev %dD_m_Y ## set more off ## log using changesOverTime.log, text replace ## tab sigChange, missing ## tab sigChange ########## TABLE 2. Results of SIMEX error correction ### in Adams, Clark, Ezrow & Glasgow (2006) ### require(simex) #load replication dataset adamsdata <- read.dta("Adams_etal_replication.dta") attach(adamsdata) ###### Adams et al (2006): Model 1 # OLS estimate of Adams et al (2006): Model 1 adams.m1.ols <- lm(pshift2~ vshift + idparty + idvshift + votec1 + pshiftt12 + pvoteshift + Italy + Britain + Greece + Luxembourg + Denmark + Netherlands + Spain, data=adamsdata, x=TRUE) summary(adams.m1.ols) #specify estimation sample complete<-complete.cases(pshift2,vshift,idparty,idvshift, votec1,pshiftt12,pvoteshift, Italy,Britain,Greece, Luxembourg,Denmark,Netherlands, Spain) #Error correction in three variables: "pshift2", # "pshiftt12", # "pvoteshift". #Estimates of standard error around the CMP #derived variables are the in-sample mean of bootstrapped #standard errors. #SE for previous policy shift and its interaction with #previous change in vote share is assumed to be the same. DVerror <- mean(pshift2SE[complete]) LDVerror <- mean(pshiftt12SE[complete]) LDVINTerror<- mean(pshiftt12SE[complete]) # SIMEX error correction of linear model (adams.m1.ols) # with three error contaminated variables adams.m1.simex <- simex(model = adams.m1.ols, SIMEXvariable = c("pshift2", "pshiftt12","pvoteshift"), B=500, measurement.error = c(DVerror, LDVerror,LDVINTerror)) summary(adams.m1.simex) ######################################### # "manual" calculation of GOF statistic ######################################### #yhat SIMEX yhat.adams.m1.simex<-predict.SIMEX(adams.m1.simex, newdata=adamsdata) k<-length(coef(adams.m1.simex)) n <- length(fitted(adams.m1.simex)) #total sum of squares SIMEX tss.adams.m1.simex<- sum((pshift2[complete]- mean(pshift2[complete]))^2) #sum of squared errors SIMEX sse.adams.m1.simex<-sum((pshift2[complete]- yhat.adams.m1.simex[complete])^2) #R-squared SIMEX r2.adams.m1.simex<-1-sse.adams.m1.simex/tss.adams.m1.simex #yhat OLS yhat.adams.m1.ols<-predict.lm(adams.m1.ols, newdata=adamsdata) #sum of squared errors OLS sse.adams.m1.ols<-sum((pshift2[complete]- yhat.adams.m1.ols[complete])^2) #RMSE OLS rootMSE.adams.m1.ols<-sqrt(sse.adams.m1.ols/n) #RMSE SIMEX rootMSE.adams.m1.simex<-sqrt(sse.adams.m1.simex/n) ###### Adams et al (2006): Model 2 adamsdata2 <- adamsdata[adamsdata$dirpshift!=0,] attach(adamsdata2) # OLS estimate of Adams et al (2006): Model 2 adams.m2.ols <- lm(votec~ votec1+ modshft+ extshft+ idparty+idmod +idext+ dirvshift+ ingovnow+coalgov+ changeunemp3+ changegdp3+ cgovunemp+ cgovgdp+ converge+ wing+ wingconv+ Italy+ Britain+ Greece+ Luxembourg+ Denmark+ Netherlands+ Spain, x=TRUE) summary(adams.m2.ols) #specify estimation sample nonmissing <- complete.cases(votec,votec1, modshft, extshft, idparty, idmod, idext, dirvshift, ingovnow, coalgov, changeunemp3, changegdp3, cgovunemp, cgovgdp, converge, wing, wingconv,Italy, Britain, Greece, Luxembourg, Denmark, Netherlands, Spain) #Error correction in six variables: "modshft", # "extshft", # "converge", # "idmod", # "idext", # "wingconv". #Estimates of standard error around the CMP #derived variables are the in-sample mean of bootstrapped #standard errors. MODSHFTerror <- mean(modshftSE[nonmissing]) EXTSHFTerror <- mean(extshftSE[nonmissing]) CONVERGEerror <- mean(convergeSE[nonmissing]) IDMODerror <- mean(idmodSE[nonmissing]) IDEXTerror <- mean(idextSE[nonmissing]) WINGCONVerror <- mean(wingconvSE[nonmissing]) # SIMEX error correction of linear model (adams.m2.ols) # with six error contaminated variables adams.m2.simex <- simex(model = adams.m2.ols, SIMEXvariable = c("modshft", "extshft", "converge","idmod", "idext", "wingconv"), B=500, measurement.error=c(MODSHFTerror, EXTSHFTerror,CONVERGEerror, IDMODerror,IDEXTerror, WINGCONVerror)) summary(adams.m2.simex) ######################################### # "manual" calculation of GOF statistic ######################################### #yhat SIMEX yhat.adams.m2.simex<-predict.SIMEX(adams.m2.simex, newdata=adamsdata2) k<-length(coef(adams.m2.simex)) n <- length(fitted(adams.m2.simex)) #total sum of squares SIMEX tss.adams.m2.simex<- sum((votec[nonmissing]- mean(votec[nonmissing]))^2) #sum of squared errors SIMEX sse.adams.m2.simex <- sum((votec[nonmissing]- yhat.adams.m2.simex[nonmissing] )^2) #R-squared SIMEX r2.adams.m2.simex<-1-sse.adams.m2.simex/tss.adams.m2.simex #yhat OLS yhat.adams.m2.ols<-predict.lm(adams.m2.ols, newdata=adamsdata2) #sum of squared errors OLS sse.adams.m2.ols<-sum((votec[nonmissing]- yhat.adams.m2.ols[nonmissing])^2) #RMSE OLS rootMSE.adams.m2.ols<-sqrt(sse.adams.m2.ols/n) #RMSE SIMEX rootMSE.adams.m2.simex<-sqrt(sse.adams.m2.simex/n) detach(adamsdata2) ########## TABLE 3. Results of SIMEX error correction ### in Hix, Noury, & Roland (2006) ### require(simex) #load replication dataset hixdata <- read.dta("Hix_etal_replication.dta") attach(hixdata) ###### OLS estimate Hix et al (2006):Model 3 Table 4 hnr.m3dim1.ols <- lm(dim1 ~ cmp_soclr + cmp_econlr + cmp_eu + comm + govt + ep2 + ep3 + ep4 + ep5, data=hixdata, x=TRUE) summary(hnr.m3dim1.ols) #specify estimation sample sample3<-complete.cases(dim1,cmp_soclr,cmp_econlr, cmp_eu,comm,govt,ep2,ep3,ep4,ep5) #Error correction in three variables: "cmp_soclr", # "cmp_econlr", # "cmp_eu". #Estimates of standard error around the CMP #derived variables are the in-sample mean of bootstrapped #standard errors. #SE for EU Integration dimension is doubled to reflect #extremely low frequency of the CMP scores. EUerror <- 2*mean(euSE[sample3]) SOCerror <- mean(SocSE[sample3]) ECONerror <- mean(EconSE[sample3]) # SIMEX error correction of linear model (hnr.m3dim1.ols) # with three error contaminated variables hnr.m3dim1.simex <- simex(model = hnr.m3dim1.ols, SIMEXvariable = c("cmp_soclr", "cmp_econlr", "cmp_eu"), B=500, measurement.error = c(SOCerror, ECONerror, EUerror)) summary(hnr.m3dim1.simex) ######################################### # "manual" calculation of GOF statistic ######################################### #yhat SIMEX yhat.hnr.m3dim1.simex<-predict.SIMEX(hnr.m3dim1.simex, newdata=hixdata) k<-length(coef(hnr.m3dim1.simex)) n <- length(fitted(hnr.m3dim1.simex)) #total sum of squares SIMEX tss.hnr.m3dim1.simex<- sum((dim1[sample3]- mean(dim1[sample3]))^2) #sum of squared errors SIMEX sse.hnr.m3dim1.simex<-sum((dim1[sample3]- yhat.hnr.m3dim1.simex[sample3] )^2) #R-squared SIMEX r2.hnr.m3dim1.simex<-1-sse.hnr.m3dim1.simex/tss.hnr.m3dim1.simex #yhat OLS yhat.hnr.m3dim1.ols<-predict.lm(hnr.m3dim1.ols, newdata=hixdata) #sum of squared errors OLS sse.hnr.m3dim1.ols<-sum((dim1[sample3]- yhat.hnr.m3dim1.ols[sample3])^2) #RMSE OLS rootMSE.hnr.m3dim1.ols<-sqrt(sse.hnr.m3dim1.ols/n) #RMSE SIMEX rootMSE.hnr.m3dim1.simex<-sqrt(sse.hnr.m3dim1.simex/n) ###### OLS estimate Hix et al (2006):Model 6 Table 4 hnr.m6dim1.ols <- lm(dim1 ~ cmp_soclr + cmp_econlr + cmp_eu +comm + govt + ep2 + ep3+ ep4 + ep5+soc + lsoc + lib + grn+ con + left +gaul + na + reg+ right, data=hixdata, x=TRUE) summary(hnr.m6dim1.ols) #specify estimation sample sample6<-complete.cases(dim1,cmp_soclr,cmp_econlr,cmp_eu, comm,govt,ep2,ep3,ep4,ep5,soc, lsoc,lib,grn,con,left,gaul,na,reg, right) #Error correction in three variables: "cmp_soclr", # "cmp_econlr", # "cmp_eu". #Estimates of standard error around the CMP #derived variables are the in-sample mean of bootstrapped #standard errors. #SE for EU Integration dimension is doubled to reflect #extremely low frequency of the CMP scores. EUerror <- 2*mean(euSE[sample6]) SOCerror <- mean(SocSE[sample6]) ECONerror <- mean(EconSE[sample6]) # SIMEX error correction of linear model (hnr.m6dim1.ols) # with three error contaminated variables hnr.m6dim1.simex <- simex(model = hnr.m6dim1.ols, SIMEXvariable = c("cmp_soclr", "cmp_econlr", "cmp_eu"), B=500, measurement.error = c(SOCerror, ECONerror, EUerror)) summary(hnr.m6dim1.simex) ######################################### # "manual" calculation of GOF statistic ######################################### #yhat SIMEX yhat.hnr.m6dim1.simex<-predict.SIMEX(hnr.m6dim1.simex, newdata=hixdata) k<-length(coef(hnr.m6dim1.simex)) n <- length(fitted(hnr.m6dim1.simex)) #total sum of squares SIMEX tss.hnr.m6dim1.simex<- sum((dim1[sample6]- mean(dim1[sample6]))^2) #sum of squared errors SIMEX sse.hnr.m6dim1.simex<-sum((dim1[sample6]- yhat.hnr.m6dim1.simex[sample6]) ^2) #R-squared SIMEX r2.hnr.m6dim1.simex<-1-sse.hnr.m6dim1.simex/tss.hnr.m6dim1.simex #yhat OLS yhat.hnr.m6dim1.ols<-predict.lm(hnr.m6dim1.ols) #sum of squared errors OLS sse.hnr.m6dim1.ols<-sum((dim1[sample6]- yhat.hnr.m6dim1.ols[sample6])^2, na.rm=TRUE) #RMSE OLS rootMSE.hnr.m6dim1.ols<-sqrt(sse.hnr.m6dim1.ols/n) #RMSE SIMEX rootMSE.hnr.m6dim1.simex<-sqrt(sse.hnr.m6dim1.simex/n) ########## Figure 4: SIMEX error correction in EU Integration with quadratic and nonlinear extrapolant functions, fromHix,Noury,andRoland(2006) plot.SIMEX2 <- function (x, xlab = expression((1 + lambda)), ylab = colnames(b[, -1]), ask = FALSE, show = rep(TRUE, NCOL(b) - 1), ...) { old.par <- par(no.readonly = TRUE) on.exit(par(old.par)) par(...) if (ask) par(ask = TRUE) p.names <- names(coef(x)) b <- x$SIMEX.estimates a <- seq(-1, max(b[, 1]), by = 0.01) d <- matrix(data = NA, nrow = length(a), ncol = NCOL(b) - 1) switch(x$fitting.method, quad = d <- predict(x$extrapolation, newdata = data.frame(lambda = a)), line = d <- predict(x$extrapolation, newdata = data.frame(lambda = a)), nonl = for (i in 1:length(p.names)) d[, i] <- predict(x$extrapolation[[p.names[i]]], newdata = data.frame(lambda = a)), log2 = for (i in 1:length(p.names)) d[, i] <- predict(x$extrapolation[[p.names[i]]], newdata = data.frame(lambda = a)) - ((abs(apply(x$SIMEX.estimates[-1, -1], 2, min)) + 1) * (apply(x$SIMEX.estimates[-1, -1], 2, min) <= 0))[i], logl = d <- t(t(exp(predict(x$extrapolation, newdata = data.frame(lambda = a)))) - ((abs(apply(x$SIMEX.estimates[-1, -1], 2, min)) + 1) * (apply(x$SIMEX.estimates[-1, -1], 2, min) <= 0)))) for (i in 2:NCOL(b)) { if (show[i - 1]) { points(b[-1, 1] + 1, b[-1, i], pch = 19) points(b[1, 1] + 1, b[1, i]) lines(a[a > 0] + 1, d[a > 0, (i - 1)]) lines(a[a < 0] + 1, d[a < 0, (i - 1)], lty = 2) } } } quartz(width=8,height=8) x <- hnr.m6dim1.simex par( mar=c(5, 5, 2, 2)+0.3) plot(NA,NA, xlim=c(0,3), ylim=c(0,0.014), cex.lab=1.3, cex.axis=1.2, ylab="EU Integration", xlab=expression((1 + zeta))) plot.SIMEX2(x, show=c(F,F,F,T, rep(F,ncol(x[[2]])-1))) plot.SIMEX2(refit(x,"nonl"), show=c(F,F,F,T, rep(F,ncol(x[[2]])-1))) text(.05, .009, "Quadratic Correction", pos=4, cex=1.2) text(.05, .013, "Non-linear Correction", pos=4, cex=1.2) text(1.05, 0.0041, "Uncorrected Estimate", pos=4, cex=1.2) detach(hixdata) ############################### #### Commands for merging ############################### CMP.original <- read.dta("mds2005f.dta", convert.factors=FALSE) CMP.BLMerror <- read.dta("BLM_CMP_uncertainty.dta") # convert CMP's edate to numeric so can be matched to BLM file # and harmonize the names of the election date variable library(date) CMP.BLMerror$edate <- as.date(CMP.BLMerror$cmp_edate) # harmonize name of the party variable names(CMP.original)[which(names(CMP.original)=="party")] <- "cmp_party" # merge the files CMP.merged <- merge(CMP.original, CMP.BLMerror, c("cmp_party","edate"))