Big n methods

As we discussed in class, when the sample size is large we run into a couple of new problems.

What are we going to do?

Let's start by looking at a big data set on house prices from King County (Seattle, WA).

We have 20000+ samples and 21 dimensions. I will limit the study to a few of the features here to keep things simple. Explore more of them at home.

I start by eliminating indices, dates, and zip codes. Note that these can be important predictors of house prices (location, and time of sale if there are trends in the data). In fact, with this much data you could use the zip code as a high-level categorical feature, for example.
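A minimal sketch of this clean-up step in Python; the file name and the column names ('id', 'date', 'zipcode') are assumptions about how your local copy of the King County data is stored, so adjust them as needed.

```python
import pandas as pd

# Assumed file and column names for the King County sales data;
# adjust these to match your local copy.
housing = pd.read_csv("kc_house_data.csv")
housing = housing.drop(columns=["id", "date", "zipcode"])
print(housing.shape)
```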

Let's run a simple visual exploration of the data: scatter plots. You will notice this takes a while because you have a lot of data. Also, the scatter plots are really dense, which makes them difficult to explore.
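One way to produce those scatter plots, reusing the `housing` frame from the sketch above; the column list is an assumed subset, and a small alpha (or a random subsample) helps with the overplotting.

```python
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

# Pairwise scatter plots of a few (assumed) columns against price.
# alpha < 1 makes the dense point clouds easier to read.
cols = ["price", "sqft_living", "bedrooms", "grade", "condition"]
scatter_matrix(housing[cols], alpha=0.1, figsize=(10, 10), diagonal="hist")
plt.show()
```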

After some (very simple) data exploration steps we are ready to run a linear model fit.
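A sketch of the fit using statsmodels, which is one convenient choice because its summary prints coefficients and p-values together; the feature list is an assumed subset of the columns.

```python
import statsmodels.api as sm

# Assumed subset of predictors; price is the response.
features = ["sqft_living", "sqft_lot", "bedrooms", "bathrooms", "condition", "grade"]
X = sm.add_constant(housing[features])
y = housing["price"]

fit = sm.OLS(y, X).fit()
print(fit.summary())
```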

With 20000+ observations, all coefficient estimates are highly significant! Compare this to the patterns you see in the scatter plots above...

There is some indication that you might want to turn, e.g., condition into a categorical/ordinal feature, and for low grade the linear trend appears broken. Note also that lot size might be better truncated.

Explore this at home but for now let's move on to explore the big n problems.

First - random projections were introduced in class. If the true data matrix is (approximately) low rank, you can estimate its low-rank summary quite fast via random projections.

For data of this size we don't have a big problem running the SVD as is. We will push the limits a bit further down.
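A sketch of a randomized SVD with scikit-learn, which implements the random-projection idea (project onto a few random directions, orthonormalize, then do the small SVD); the number of components is an arbitrary choice here.

```python
import numpy as np
from sklearn.utils.extmath import randomized_svd

# Randomized SVD of the (assumed) feature matrix from the fit above.
Xmat = housing[features].to_numpy(dtype=float)
U, S, Vt = randomized_svd(Xmat, n_components=5, random_state=0)

# Compare against the exact singular values for this moderate-size matrix.
S_exact = np.linalg.svd(Xmat, full_matrices=False)[1][:5]
print(S, S_exact)
```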

In class we talked about leveraging. Here, we reduce the size of the data based on leverage (the diagonal entries of the hat matrix) to retain the observations that most drive the fit of the model.

As we saw in class, we don't need to fit the model to get the leverages. We only need the SVD (or randomized SVD) of the data matrix!

We can now use leverage (without computing the fit) to reduce the data set to a set of informative observations with some effective sample size that's more reasonable to analyze/explore.
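A sketch of leverage-based subsampling: the leverages come straight from the thin SVD of the design matrix (the i-th leverage is the squared norm of the i-th row of U), and the subsample size of 2000 is an arbitrary choice.

```python
import numpy as np
import statsmodels.api as sm

# Leverages without fitting the model: for X = U S V^T, h_i = ||U_i||^2,
# i.e. the i-th diagonal entry of the hat matrix.
Xd = sm.add_constant(housing[features]).to_numpy(dtype=float)
U, _, _ = np.linalg.svd(Xd, full_matrices=False)
leverage = (U ** 2).sum(axis=1)

# Keep ~2000 observations, sampled with probability proportional to leverage.
rng = np.random.default_rng(0)
idx = rng.choice(len(leverage), size=2000, replace=False,
                 p=leverage / leverage.sum())
housing_sub = housing.iloc[idx]

# Refit on the reduced sample (used in the comparison below).
fit_sub = sm.OLS(housing_sub["price"],
                 sm.add_constant(housing_sub[features])).fit()
print(fit_sub.rsquared)
```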

I run the regression on the reduced sample. You get a better fit with leverage sampling by design - you asked for the observations at the extremes of x, which of course increases the R-squared.

Leverage can be used to subsample the data for fast model exploration.

Leverage was a way to subsample the data for exploration and modeling. Still, when you subsample a large data set it is difficult to interpret the resulting p-values.

How about we focus on effect size or explained variance instead?

Effect size just means looking at the magnitudes of the coefficient estimates (easier to compare if you standardize the data where appropriate - perhaps not for binary or categorical features...).
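A sketch of standardized coefficients as an effect-size measure, reusing the variables from the earlier fit; whether to standardize an ordinal feature like 'condition' is exactly the judgment call mentioned above.

```python
import statsmodels.api as sm

# Z-score the (assumed) continuous features and the response so that the
# coefficient magnitudes are directly comparable.
Xz = (housing[features] - housing[features].mean()) / housing[features].std()
yz = (housing["price"] - housing["price"].mean()) / housing["price"].std()

fit_std = sm.OLS(yz, sm.add_constant(Xz)).fit()
print(fit_std.params.drop("const").abs().sort_values(ascending=False))
```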

R-squared is another way of looking at the "usefulness" of the model. However, if we want to translate this to individual coefficients we have to think about R-squared per feature.

The package 'relaimpo' in R and 'pingouin' for Python (https://pingouin-stats.org/generated/pingouin.linear_regression.html) have several metrics to do this. The 'relaimpo' paper: https://www.jstatsoft.org/article/view/v017i01
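relaimpo's LMG metric averages each feature's R-squared contribution over all orderings of the features. As a cruder, do-it-yourself stand-in, you can look at the drop in R-squared when a single feature is removed; a sketch, reusing `features` and `y` from the earlier fit.

```python
import statsmodels.api as sm

# Drop-one R-squared: how much does the fit degrade without each feature?
# (relaimpo's LMG metric is a more careful decomposition; this is a crude proxy.)
full_r2 = sm.OLS(y, sm.add_constant(housing[features])).fit().rsquared
for feat in features:
    rest = [f for f in features if f != feat]
    r2 = sm.OLS(y, sm.add_constant(housing[rest])).fit().rsquared
    print(f"{feat:>12s}  delta R^2 = {full_r2 - r2:.4f}")
```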

You can use the bootstrap to assess the stability of the feature importance measures.
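A sketch of that bootstrap: resample the rows with replacement, recompute the standardized coefficients each time, and look at their spread (200 resamples is an arbitrary choice).

```python
import pandas as pd
import statsmodels.api as sm

boot = []
for b in range(200):  # number of bootstrap resamples; increase for smoother estimates
    s = housing.sample(frac=1.0, replace=True, random_state=b)
    Xb = (s[features] - s[features].mean()) / s[features].std()
    yb = (s["price"] - s["price"].mean()) / s["price"].std()
    boot.append(sm.OLS(yb, sm.add_constant(Xb)).fit().params)

boot = pd.DataFrame(boot)
print(boot.describe().loc[["mean", "std"]].T)  # bootstrap mean and spread per coefficient
```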

Let's switch focus to p-values.

So, when the sample size is large, p-values become essentially meaningless: even tiny effects come out statistically significant. We can either use the feature importance metrics from above or investigate how the p-values depend on the sample size. See the paper I posted on Canvas.

Let's visualize how the p-values depend on the sample size and the effect size (the true mean of the simulated data). Notice how rapidly the p-values approach 0 as the sample size grows.
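A minimal simulation of this effect: draw n observations from a normal with a small true mean mu, test H0: mean = 0, and plot the p-value against n for a few choices of mu (the specific effect sizes and sample-size grid are arbitrary).

```python
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

rng = np.random.default_rng(0)
sample_sizes = np.unique(np.logspace(1, 4, 30).astype(int))

for mu in [0.02, 0.05, 0.1]:  # small "true" effect sizes
    # One draw per n; averaging over repetitions would smooth the curves.
    pvals = [stats.ttest_1samp(rng.normal(mu, 1.0, size=n), 0.0).pvalue
             for n in sample_sizes]
    plt.plot(sample_sizes, pvals, marker="o", label=f"mu = {mu}")

plt.xscale("log")
plt.xlabel("sample size n")
plt.ylabel("p-value (H0: mean = 0)")
plt.legend()
plt.show()
```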

Let's investigate the pricing data using the same technique. We will chart how both the p-values and the coefficient estimates evolve as a function of sample size.
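A sketch of that chart for the housing regression: fit the same model on growing random subsamples and record the p-values and coefficients at each sample size (the grid of sizes is arbitrary).

```python
import pandas as pd
import statsmodels.api as sm

sizes = [50, 100, 200, 500, 1000, 2000, 5000, 10000, len(housing)]
pvals, coefs = [], []
for n in sizes:
    s = housing.sample(n=n, random_state=0)
    fit_n = sm.OLS(s["price"], sm.add_constant(s[features])).fit()
    pvals.append(fit_n.pvalues)
    coefs.append(fit_n.params)

pvals = pd.DataFrame(pvals, index=sizes)   # rows: sample size, columns: features
coefs = pd.DataFrame(coefs, index=sizes)
print(pvals.round(4))                      # trace when each feature "becomes" significant
```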

Now you can trace when coefficients "become" significant - here you see that 'bedrooms' needs 5000+ observations and 'condition' around 1000, whereas 'grade' and 'sqft_living' enter the model as significant after about 100 observations!