/***********************************************************/
/* This macro performs k-fold cross validation 			   */
/* with proc mixed.                                        */
/* 														   */
/* K-fold cross validation divides the dataset randomly    */
/* into K parts. K-1 parts are used for training and       */
/* 1 part is used for validation. This is repeated K       */
/* times so that each part is used for calibration once.   */
/* It allows the whole dataset to be used for calibration  */
/* and validation.  									   */
/* 														   */
/* 														   */
/* The dataset can be found in the .txt file cv_example.   */
/* 														   */
/* 														   */
/* The macro requires a variable called ID that uniquely   */
/* identifies the observations.							   */
/* 														   */
/* It produces a Pearson's correlation coefficient for the */
/* observed vs predicted from the "left out" folds, and    */
/* the predicted values can be found in the dataset "final"*/
/* 														   */
/*   mydata   = dataset with original data                 */
/*   class    = list of variables for class statment       */
/*   model    = list of terms in model statement           */
/*   stmts    = %str(other mixed statements, incl. random) */
/*   y		  = name of response variable in dataset       */
/*	 k		  = number of folds							   */
/*                                                         */
/***********************************************************/

%macro cv(mydata=, model=, class=, stmts=, y=, k=);
  data whatever;
  i=1;
  set &mydata point=i nobs=n;
  put 'n = ' n;
  call symput('size',trim(left(n)));
  stop;
  run; /*create column with n observations, for the fold numbers*/

  data folds;
  check1=&k;
  check2=&size;
  fold_size=ceil(&size/&k);  /*calculate the size of the folds by dividing sample size through fold number*/
  /*if n/k is not an integer (eg. 502/5), we will have two fold sizes (2 folds of 101 (larger), 3 folds of 100 (smaller)*/
  larger=&size-&k*(fold_size-1); /*larger=number of folds we need of the larger size=foldsize*/
  smaller=&k - larger; /*this many folds of the smaller size = foldsize-1*/
  do fold=1 to larger; /*first generate variable fold for the larger folds*/
    do i=1 to fold_size;
	  output;
	end;
  end;
  do fold=larger+1 to &k; /*now do the smaller folds*/
    do i=1 to fold_size-1;
	  output;
	end;
  end;
  run;

  data folds;
  set folds;
  keep fold;
  run;

  /*randomize fold labels*/

  data folds_rand;
  set folds;
  r=ranuni(-1);

  proc sort data=folds_rand out=folds_rand;
  by r;
  run;

  data cv;
  merge &mydata folds_rand;
  run;

  proc print data=cv;
  run;

  %do f=1 %to &k;
     data cv&f;
	 set cv;
	 ff=&f;
	 if fold=&f then &y=.;
	 run;
  %end;

  /*concatenate cv1 up to cv&k*/

  data cv_all;
  run;

  %do f=1 %to &k;
    data cv_all;
	set cv_all cv&f;
	run;
  %end;

  data cv_all;
  set cv_all;
  if fold=. then delete;
  run;

  proc mixed data=cv_all;
  class &class;
	model &y=&model /outp=pred ddfm=kr;
	&stmts;
  by ff;
  run;



  data pred;
  set pred;
  if &y=.; /*only keep predictons from "left out" observations for that fold */
  keep id pred;
  run;

  proc sort data=pred out=pred;
  by id;
  run;

  data final;
  merge &mydata pred; /*merge "left out" predictions from all k folds*/
  by id;
  run;

  proc corr data=final;
  var &y pred;
  run;

%mend;