In this post, we will look at two different ways to compute the cross-validation error and two different ways to compute the test error in R linear regression. The data set we will use is the gala set in package faraway.
> library(faraway)
> data(gala)
> index<-1:nrow(gala)
> testindex<-sample(index,trunc(length(index)/3))
testset<-gala[testindex,]
trainset<-gala[-testindex,]
c1<-c(1:nn)
> trainset2<-cbind(trainset,c1)
> diff1<-numeric(nn)
> foo1<-numeric(nn)
> foo2<-numeric(nn)
> diff2<-numeric(nn)
> #Cross validation using predict in two different ways:(a) the data points are explicitly defined using a list() (b) the data points are implicitly given by deleting the output column (column 1).
> newdata<-trainset[,-5:-7]
> for(i in 1:nn){model1<-lm(Species~Endemics+Area+Elevation,subset=(c1!=i),data=trainset2)
+ foo1[i]<-predict(model1,list(Endemics=trainset2[i,2],Area=trainset2[i,3],Elevation=trainset2[i,
+ 4]))
+ foo2[i]<-predict(model1,newdata[i,-1])
+ diff1[i]<-trainset2[i,1]-foo1[i]
+ diff2[i]<-trainset2[i,1]-foo2[i]
+ }
#As we see below, the two ways are equivalent.
> diff1
[1] 9.9654289 -0.4143694 -14.5205954 47.9751592 8.1244225
[6] 14.0009744 -2.5409240 30.3767094 -5.6358873 12.5939696
[11] -34.3419570 -428.6932984 9.7597007 -25.9936027 -13.6063039
[16] -12.0757886 86.7751835 -27.2387452 -10.6432398 -9.8346438
> diff2
[1] 9.9654289 -0.4143694 -14.5205954 47.9751592 8.1244225
[6] 14.0009744 -2.5409240 30.3767094 -5.6358873 12.5939696
[11] -34.3419570 -428.6932984 9.7597007 -25.9936027 -13.6063039
[16] -12.0757886 86.7751835 -27.2387452 -10.6432398 -9.8346438
> sum11=0
> for(i in 1:nn) {
+ sum11=sum11+diff1[i]^2
+ }
> sum11<-sum11/nn
#The number below is the Mean Square Error.
> sum11
[1] 9926.731
#Computation of the test error in 2 different ways: (a) Using the predict() function to compute the predicted values (b) Use the coefficients of the regression equation to compute the predicted values.
> tt<-nrow(testset)
> diff3<-numeric(tt)
> model12<-lm(Species~Endemics+Area+Elevation,data=trainset)
> difftest<-numeric(tt)
> newdata2<-testset[,-5:-7]
#Using the predict() function for the computation of the predicted values.
> lmpred<-predict(model2,newdata2[,-1])
> for(i in 1:tt) {
+ difftest[i]=testset[i,1]-lmpred[i]}
> summ2=0
> for(i in 1:tt) {
+ summ2=summ2+difftest[i]^2}
> summ2<-summ2/tt
> summ2
#This is the mean squared test error.
[1] 1091.469
#Now let’s proceed to the second way of computing the test error.
> cc<-as.matrix(cbind(rep(1,10),testset[,c(2,3,4)]))
> pred.value<-cc%*% model2$coef
> for(i in 1:tt) {
+ difftest[i]<-testset[i,1]-pred.value[i]}
> sum3=0
> for(i in 1:tt) {
+ sum3=sum3+difftest[i]^2}
> sum3<-sum3/tt
> sum3
[1] 1091.469
#As expected the two different methods of computing the test error, yield the same numbers.