/* nag_best_subset_given_size (h05abc) Example Program. * * Copyright 2011, Numerical Algorithms Group. * * Mark 23, 2011. */ /* Pre-processor includes */ #include #include #include #include #include #include #include #define BZ(I,J) bz[(J)*(m - ip) + (I)] typedef struct { Integer n, tdq, tdx; double tol; double *x, *y, *b, *cov, *h, *p, *q, *res, *se, *wt, *com_ar; Integer *sx, cnt; Nag_IncludeMean mean; } calc_subset_data; void read_subset_data(Integer m, calc_subset_data *cs); void tidy_subset_data(calc_subset_data *cs); void calc_subset_score(Integer m, Integer drop, Integer lz, const Integer z[], Integer la, const Integer a[], double score[], Nag_Comm *comm, Integer *info); int main(void) { int exit_status = 0; /* Integer scalar and array declarations */ Integer i, ip, j, la, m, mincnt, mincr, mip, nbest; Integer *a = 0, *bz = 0, *ibz = 0, *id = 0; /* NAG structures */ NagError fail; Nag_Comm comm; /* Other data types */ calc_subset_data cs; /* Double scalar and array declarations */ double gamma; double *bscore = 0; double acc[2]; /* Initialise the error structure */ INIT_FAIL(fail); printf("nag_best_subset_given_size (h05abc) Example Program Results\n\n"); /* Skip heading in data file */ scanf("%*[^\n] "); scanf("%*[^\n] "); /* Read in the problem size */ scanf("%ld%ld%ld%*[^\n] ", &m, &ip, &nbest); /* Read in the control parameters for the subset selection */ scanf("%ld%ld%lf%lf%lf%*[^\n] ", &mincr, &mincnt, &gamma, &acc[0], &acc[1]); /* Read in the data required for the score function */ read_subset_data(m, &cs); /* Associate the data structure with comm.p */ comm.p = (void *) &cs; /* Allocate memory required by the subset selection routine */ mip = m - ip; if (!(a = NAG_ALLOC(MAX(nbest,m), Integer)) || !(bz = NAG_ALLOC(mip*nbest, Integer)) || !(bscore = NAG_ALLOC(MAX(nbest,m),double))) { printf("Allocation failure\n"); exit_status = -1; goto END; } /* Call the forward communication best subset routine (nag_best_subset_given_size (h05abc)) */ nag_best_subset_given_size(mincr, m, ip, nbest, &la, bscore, bz, calc_subset_score, mincnt, gamma, acc, &comm, &fail); if (fail.code != NE_NOERROR) { printf("Error from nag_best_subset_given_size (h05abc).\n%s\n", fail.message); if (fail.code != NE_ARG_TOO_LARGE) { exit_status = 1; goto END; } } /* Titles */ printf(" Score Feature Subset\n"); printf(" ----- --------------\n"); /* Display the best subsets and corresponding scores. nag_best_subset_given_size returns a list of features excluded from the best subsets, so this is inverted to give the set of features included in each subset */ if (!(id = NAG_ALLOC(m, Integer))) { printf("Allocation failure\n"); exit_status = -1; goto END; } for (i = 0; i < la; i++) { printf("%12.5e ", bscore[i]); for (j = 0; j < m; j++) id[j] = 1; for (j = 0; j < mip; j++) id[BZ(j,i) - 1] = 0; for (j = 0; j < m; j++) if (id[j]) printf(" %5"NAG_IFMT, j+1); printf("\n"); } printf("\n"); if (fail.code == NE_TOO_MANY) { printf("%ld subsets of the requested size do not exist, " "only %ld are displayed.\n", nbest,la); } printf("%ld subsets evaluated in total\n", cs.cnt); END: NAG_FREE(a); NAG_FREE(bz); NAG_FREE(ibz); NAG_FREE(bscore); NAG_FREE(id); tidy_subset_data(&cs); return exit_status; } #define CSX(I, J) cs->x[(I) * cs->tdx + J] void read_subset_data(Integer m, calc_subset_data *cs) { /* Read in the data, from stdout, and allocate any arrays that are required by the scoring function calc_subset_score */ Integer i, j; cs->sx = 0; cs->x = cs->y = cs->b = cs->se = cs->cov = cs->res = cs->h = cs->q = cs->p = cs->com_ar = cs->wt = 0; /* Skip heading in data file */ scanf("%*[^\n] "); /* Read in the number of observations for the data used in the linear regression */ scanf("%ld%*[^\n] ", &cs->n); /* Read in the control parameters for the linear regression */ scanf("%lf%*[^\n] ",&cs->tol); /* Read in the data */ cs->tdx = m; if (!(cs->x = NAG_ALLOC(cs->tdx*cs->n, double)) || !(cs->y = NAG_ALLOC(cs->n,double))) { printf("Allocation failure\n"); tidy_subset_data(cs); exit(-1); } /* Read in the data for the linear regression */ for (i = 0; i < cs->n; i++) { for (j = 0; j < m; j++) scanf("%lf", &CSX(i,j)); scanf("%lf", &cs->y[i]); scanf("%*[^\n] "); } /* No intercept term and no weights */ cs->mean = Nag_MeanZero; /* Allocate memory required by the regression routine */ cs->tdq = cs->n; if (!(cs->sx = NAG_ALLOC(m, Integer)) || !(cs->b = NAG_ALLOC(m,double)) || !(cs->se = NAG_ALLOC(m,double)) || !(cs->cov = NAG_ALLOC(m*(m+1)/2,double)) || !(cs->res = NAG_ALLOC(cs->n,double)) || !(cs->h = NAG_ALLOC(cs->n,double)) || !(cs->q = NAG_ALLOC(cs->n*cs->tdq,double)) || !(cs->p = NAG_ALLOC(2*m+m*m,double)) || !(cs->com_ar = NAG_ALLOC(5*(m-1)*m*m,double))) { printf("Allocation failure\n"); tidy_subset_data(cs); exit(-1); } cs->cnt = 0; } void tidy_subset_data(calc_subset_data *cs) { /* Tidy up the data structure used by calc_subset_score */ NAG_FREE(cs->sx); NAG_FREE(cs->x); NAG_FREE(cs->y); NAG_FREE(cs->b); NAG_FREE(cs->se); NAG_FREE(cs->cov); NAG_FREE(cs->res); NAG_FREE(cs->h); NAG_FREE(cs->q); NAG_FREE(cs->p); NAG_FREE(cs->com_ar); NAG_FREE(cs->wt); } void calc_subset_score(Integer m,Integer drop,Integer lz,const Integer z[], Integer la,const Integer a[],double score[], Nag_Comm *comm, Integer *info) { /* Calculate the score associated with a particular set of feature subsets. This particular example finds the set, of a given size, of explanatory variables that best fit a response variable when a linear regression model is used. Therefore the feature set is the set of all the explanatory variables and the best set of features is defined as set of explanatory variables that gives the smallest residual sums of squares. See the documentation for g02dac for details on linear regression models. */ calc_subset_data *cs; /* Integer scalar and array declarations */ Integer i, j, inv_drop, ip, irank; /* NAG structures */ NagError fail; /* Double scalar and array declarations */ double rss, df; /* Other data types */ Nag_Boolean svd; /* Initialise the error structure */ INIT_FAIL(fail); /* Point cs to the data structure stored in comm */ cs = (calc_subset_data *) comm->p; /* Keep track of the number of susbets evaluated */ cs->cnt += MAX(1,la); /* Set up the initial feature set. If drop = 0, this is the Null set (i.e. no features). If drop = 1 then this is the full set (i.e. all features) */ for (i = 0; i < m; i++) cs->sx[i] = drop; /* Add (if drop = 0) or remove (if drop = 1) the all the features specified in Z (note: z refers to its features with values 1 to M, therefore you must subtract 1 when using z as an array reference) */ inv_drop = (drop == 0) ? 1 : 0; for (i = 0; i < lz; i++) cs->sx[z[i] - 1] = inv_drop; /* Loop over all the elements in a, looping at least once */ for (i = 0; i < MAX(la,1); i++) { if (la > 0) { if (i > 0) { /* Reset the feature altered at the last iteration */ cs->sx[a[i-1] - 1] = drop; } /* Add or drop the I'th feature in A */ cs->sx[a[i] - 1] = inv_drop; } /* Calculate number of parameters in the regression model */ for (ip = j = 0; j < m; j++) ip += (cs->sx[j] == 1); /* Fit the regression model (g02dac) */ nag_regsn_mult_linear(cs->mean, cs->n, cs->x, cs->tdx, m, cs->sx, ip, cs->y, cs->wt, &rss, &df, cs->b, cs->se, cs->cov, cs->res, cs->h, cs->q, cs->tdq, &svd, &irank, cs->p, cs->tol, cs->com_ar, &fail); /* Return the score (the residual sums of squares) */ score[i] = rss; } /* If g02dac fails, terminate early */ if (fail.code != NE_NOERROR) *info = 1; }