/***********************************************************/ /* This macro performs k-fold cross validation */ /* with proc mixed. */ /* */ /* K-fold cross validation divides the dataset randomly */ /* into K parts. K-1 parts are used for training and */ /* 1 part is used for validation. This is repeated K */ /* times so that each part is used for calibration once. */ /* It allows the whole dataset to be used for calibration */ /* and validation. */ /* */ /* */ /* The dataset can be found in the .txt file cv_example. */ /* */ /* */ /* The macro requires a variable called ID that uniquely */ /* identifies the observations. */ /* */ /* It produces a Pearson's correlation coefficient for the */ /* observed vs predicted from the "left out" folds, and */ /* the predicted values can be found in the dataset "final"*/ /* */ /* mydata = dataset with original data */ /* class = list of variables for class statment */ /* model = list of terms in model statement */ /* stmts = %str(other mixed statements, incl. random) */ /* y = name of response variable in dataset */ /* k = number of folds */ /* */ /***********************************************************/ %macro cv(mydata=, model=, class=, stmts=, y=, k=); data whatever; i=1; set &mydata point=i nobs=n; put 'n = ' n; call symput('size',trim(left(n))); stop; run; /*create column with n observations, for the fold numbers*/ data folds; check1=&k; check2=&size; fold_size=ceil(&size/&k); /*calculate the size of the folds by dividing sample size through fold number*/ /*if n/k is not an integer (eg. 502/5), we will have two fold sizes (2 folds of 101 (larger), 3 folds of 100 (smaller)*/ larger=&size-&k*(fold_size-1); /*larger=number of folds we need of the larger size=foldsize*/ smaller=&k - larger; /*this many folds of the smaller size = foldsize-1*/ do fold=1 to larger; /*first generate variable fold for the larger folds*/ do i=1 to fold_size; output; end; end; do fold=larger+1 to &k; /*now do the smaller folds*/ do i=1 to fold_size-1; output; end; end; run; data folds; set folds; keep fold; run; /*randomize fold labels*/ data folds_rand; set folds; r=ranuni(-1); proc sort data=folds_rand out=folds_rand; by r; run; data cv; merge &mydata folds_rand; run; proc print data=cv; run; %do f=1 %to &k; data cv&f; set cv; ff=&f; if fold=&f then &y=.; run; %end; /*concatenate cv1 up to cv&k*/ data cv_all; run; %do f=1 %to &k; data cv_all; set cv_all cv&f; run; %end; data cv_all; set cv_all; if fold=. then delete; run; proc mixed data=cv_all; class &class; model &y=&model /outp=pred ddfm=kr; &stmts; by ff; run; data pred; set pred; if &y=.; /*only keep predictons from "left out" observations for that fold */ keep id pred; run; proc sort data=pred out=pred; by id; run; data final; merge &mydata pred; /*merge "left out" predictions from all k folds*/ by id; run; proc corr data=final; var &y pred; run; %mend;