## ---------------------------------------------------------------------------- ## R scripts for Haerdle/Mueller/Sperlich/Werwatz: "Nonparametric and ## Semiparametric Modelling", Springer Series in Statistics, 2004 ## ---------------------------------------------------------------------------- ## Script SPMlogitkredit ## ---------------------------------------------------------------------------- ## Description computes various logit and probit fits for the kredit data ## ---------------------------------------------------------------------------- ## Author Marlene Mueller, 2009/01/14 ## ---------------------------------------------------------------------------- file <- read.csv("D:\\kredit.csv",sep=";") y <- 1-file$kredit # default set to 1 prev <- (file$moral >2)+0 # previous loans were OK employ <- (file$beszeit >1)+0 # employed (>=1 year) dura <- (file$laufzeit) # duration d9.12 <- ((file$laufzeit >9)&(file$laufzeit <=12)) +0 # 9 < duration <= 12 d12.18 <- ((file$laufzeit >12)&(file$laufzeit <=18))+0 # 12 < duration <= 18 d18.24 <- ((file$laufzeit >18)&(file$laufzeit <=24))+0 # 18 < duration <= 24 d24 <- (file$laufzeit >24)+0 # 24 < duration amount <- file$hoehe # amount of loan age <- file$alter # age of applicant savings <- (file$sparkont > 4)+0 # savings >= 1000 DM phone <- (file$telef==1)+0 # applicant has telephone foreign <- (file$gastarb==1)+0 # non-german citizen purpose <- ((file$verw==1)|(file$verw==2))+0 # loan is for a car house <- (file$verm==4)+0 # house owner ## Logit: dependence on age g<-glm(y~age,family=binomial) summary(g) ## barplot(table(y,age),xlab="AGE",ylab="Frequency|Y") ## o<-order(age) plot(age[o],g$fitted.values[o],ylim=c(0,1),xlab="AGE",ylab="P(Y=1|AGE)",pch=16) lines(age[o],g$fitted.values[o]) ## Probit: dependence on age (fit using the build-in GLM) gp<-glm(y~age,family=binomial(link="probit")) summary(gp) ## Probit: dependence on age ("hand-made" fit) dx<-(cbind(rep(1,length(age)),age)) b0<-rep(0,2) devi<-function(b,x,y){ # deviance (to minimize) mu<-pnorm(x%*%b) d<- -2*sum(y * log(ifelse(y == 0, 1, mu)) + (1 - y) * log(ifelse(y == 1, 1, (1 - mu)))) return(d)} Ddevi<-function(b,x,y){ # gradient mu<-pnorm(x%*%b) D<- -2*apply(x*as.vector(y-mu),2,sum) return(D)} opt<-optim(b0, devi, Ddevi, method = "BFGS",x=dx,y=y,hessian=TRUE) devi(opt$par,dx,y) devi(gp$coefficients,dx,y) ## Logit: dependence on age incl. quadratic terms g2<-glm(y~ age + I(age^2),family=binomial) summary(g2) anova(g,g2,test="Chisq") plot(age[o],g2$linear.predictors[o],xlab="AGE",ylab="eta(AGE)",pch=16) lines(age[o],g2$linear.predictors[o]) ## show est. PDs plot(age[o],g2$fitted.values[o],ylim=c(0,1),xlab="AGE",ylab="P(Y=1|AGE)",pch=16) lines(age[o],g2$fitted.values[o]) ## Logit: dependence on age incl. quadratic and cubic terms g3<-glm(y~ age + I(age^2)+ I(age^3),family=binomial) summary(g3) anova(g2,g3,test="Chisq") plot(age[o],g3$linear.predictors[o],xlab="AGE",ylab="eta(AGE)",pch=16) lines(age[o],g3$linear.predictors[o]) ## show est. PDs plot(age[o],g3$fitted.values[o],ylim=c(0,1),xlab="AGE",ylab="P(Y=1|AGE)",pch=16) lines(age[o],g3$fitted.values[o]) ## Logit: dependence on categorized age agec<-cut(as.numeric(age), breaks=c(18,28,38,48,58,68,75)) g5<-glm(y~ agec,family=binomial) summary(g5) ## Logit: dependence on categorized age (finer categorization) agec<-cut(as.numeric(age), breaks=c(18,23,28,33,38,43,48,53,58,63,68,75)) barplot(table(y,agec)) g6<-glm(y~ agec,family=binomial) summary(g6) plot(age[o],g6$linear.predictors[o],xlab="AGE",ylab="eta(AGE)",pch=16) lines(age[o],g6$linear.predictors[o]) ## show est. PDs plot(age[o],g6$fitted.values[o],ylim=c(0,1),xlab="AGE",ylab="P(Y=1|AGE)",pch=16) lines(age[o],g6$fitted.values[o]) ## Logit: dependence on age and amount g7<-glm(y~ age + amount,family=binomial) summary(g7) anova(g,g7,test="Chisq") ## Logit: dependence on age and amount (with interaction) g8<-glm(y~ age + amount + age:amount,family=binomial) summary(g8) anova(g7,g8,test="Chisq") ## Logit: dependence on age and amount incl. quadratic terms g9<-glm(y~ age + amount + I(age^2) + I(amount^2),family=binomial) summary(g9) anova(g7,g9,test="Chisq") ## Logit: dependence on age and amount incl. quadratic terms ## (with interaction and model choice) g10<-glm(y~ age + amount + I(age^2) + I(amount^2)+ age:amount,family=binomial) summary(g10) anova(g9,g10,test="Chisq") library(MASS) s10 <- stepAIC(g10) s10$anova summary(s10) ## plot the 2-dim surface x1<-seq(min(age),max(age),length=12) x2<-seq(min(amount),max(amount),length=12) f10<-function(x1,x2,b){f<-b[1]+b[2]*x1+b[3]*x2+b[4]*x1^2+b[5]*x2^2+b[6]*x1*x2; return(f)} e10<-outer(x1,x2,f10,g10$coefficients) persp(x1,x2,e10,theta = 30, phi = 30, expand = 0.9, col = "lightblue",xlab="AGE",ylab="AMOUNT",zlab="eta") contour(x1,x2,e10,xlab="AGE",ylab="AMOUNT") ## Logit: dependence on several rating factors (with model choice) g11<-glm(y~ age + amount + I(age^2) + I(amount^2)+ age:amount + prev+ employ +d9.12 + d12.18 + d18.24 +d24 + savings +purpose +house,family=binomial) summary(g11) ## model choice s11<-stepAIC(g11) s11$anova summary(s11)