将具有 n 个类别的分类变量输入 glmnet 时,我应该将其编码为 n 个还是 n-1 个虚拟变量?
例如,如果使用星期几作为自变量,我应该使用 6 个虚拟变量还是 7 个?
如果答案是 6,我应如何解释被丢弃类别(即参照类别)的系数等?
编辑:这是一些示例代码:
library(glmnet)
library(caret)
# Approach 1: encode the weekday with caret::dummyVars(). With its default
# settings this produces one indicator column per level (7 dummies, no
# reference category dropped).
set.seed(42) # make the simulated data and the CV fold assignment reproducible
weekdays_abbr <- c("Mon", "Tues", "Wed", "Thurs", "Fri", "Sat", "Sun")
df1 <- data.frame(id = 1:210, var1 = rep(weekdays_abbr, 30))
df1$targetVar <- runif(210)
df1$mktVol <- round(runif(210) * 1000000, 0)
# No market volume on weekends.
df1$mktVol <- ifelse(df1$var1 %in% c("Sat", "Sun"), 0, df1$mktVol)
df1
# Predictor columns ("variables to use"); targetVar is the response.
vtu <- c("mktVol", "var1")
dv1 <- dummyVars(~ ., data = df1[, vtu])
# df2 holds mktVol plus one indicator column per weekday.
df2 <- data.frame(predict(dv1, df1))
# cv.glmnet() takes the predictor matrix first and the response second.
glmnet1 <- cv.glmnet(
  x = data.matrix(df2), y = df1$targetVar,
  family = "gaussian", alpha = 0.95, nfolds = 5, standardize = FALSE,
  type.measure = "mse"
)
Coefficients1 <- coef(glmnet1, s = glmnet1$lambda.min)
# coef() returns a one-column sparse matrix; pull it out as a named vector so
# the non-zero (active) terms can be selected by position and reported by name.
coef_values <- Coefficients1[, 1]
Active.Index <- which(coef_values != 0)
Active.Coefficients <- coef_values[Active.Index]
names(Active.Coefficients)
##############################
# Approach 2: n - 1 (treatment) coding via model.matrix() with an intercept.
# var1 is expanded into 6 indicator columns; the remaining level acts as the
# reference category (the alphabetically first one, "Fri", under R's default
# factor ordering).
set.seed(42) # make the simulated data and the CV fold assignment reproducible
weekdays_abbr <- c("Mon", "Tues", "Wed", "Thurs", "Fri", "Sat", "Sun")
df1 <- data.frame(id = 1:210, var1 = rep(weekdays_abbr, 30))
df1$targetVar <- runif(210)
df1$mktVol <- round(runif(210) * 1000000, 0)
# No market volume on weekends.
df1$mktVol <- ifelse(df1$var1 %in% c("Sat", "Sun"), 0, df1$mktVol)
df1
# Predictor columns ("variables to use"); targetVar is the response.
vtu <- c("mktVol", "var1")
dv1 <- model.matrix(~ ., data = df1[, vtu])
# Fit on the design matrix built above (not on an object left over from an
# earlier section). The "(Intercept)" column is removed because glmnet fits
# its own intercept.
x2 <- dv1[, colnames(dv1) != "(Intercept)", drop = FALSE]
glmnet1 <- cv.glmnet(
  x = x2, y = df1$targetVar,
  family = "gaussian", alpha = 0.95, nfolds = 5, standardize = FALSE,
  type.measure = "mse"
)
Coefficients2 <- coef(glmnet1, s = glmnet1$lambda.min)
##############################
# Approach 3: full indicator coding via model.matrix() without an intercept
# term (~ 0 + .). With no intercept in the formula, var1 (the only factor) is
# expanded into one column per level, giving 7 weekday dummies.
set.seed(42) # make the simulated data and the CV fold assignment reproducible
weekdays_abbr <- c("Mon", "Tues", "Wed", "Thurs", "Fri", "Sat", "Sun")
df1 <- data.frame(id = 1:210, var1 = rep(weekdays_abbr, 30))
df1$targetVar <- runif(210)
df1$mktVol <- round(runif(210) * 1000000, 0)
# No market volume on weekends.
df1$mktVol <- ifelse(df1$var1 %in% c("Sat", "Sun"), 0, df1$mktVol)
df1
# Predictor columns ("variables to use"); targetVar is the response.
vtu <- c("mktVol", "var1")
dv1 <- model.matrix(~ 0 + ., data = df1[, vtu])
# Fit on the design matrix built above (not on an object left over from an
# earlier section). There is no intercept column to remove here.
glmnet1 <- cv.glmnet(
  x = dv1, y = df1$targetVar,
  family = "gaussian", alpha = 0.95, nfolds = 5, standardize = FALSE,
  type.measure = "mse"
)
Coefficients3 <- coef(glmnet1, s = glmnet1$lambda.min)
# Show the coefficient objects produced by the three sections above so the
# encodings can be compared side by side. These are bare expressions, so they
# rely on top-level auto-printing.
Coefficients1
Coefficients2
Coefficients3