Real World Health Care Data Analysis. Uwe Siebert
Macro: MP_ASSIGN
Purpose: Find and create pooled missing value patterns
******************************************************************************;
* Input parameters:
* msdata = input data set
* outdata = output data set
* varlist = a blank-separated list of variables to be included in the
* propensity score estimation. All variables must be of the same type.
* N_MP_MIN = minimum number of observations for each missing pattern (default 100).
* Missing patterns with fewer than N_MP_MIN observations will be pooled.
******************************************************************************;
%MACRO MP_ASSIGN(MSDATA = , OUTDATA =, VARLIST =, N_MP_MIN = 100);
/* Determine how many variables to include in the propensity score estimation.
   VARLIST is scanned word by word (blank-delimited). On exit:
     KO, M_MISSING = number of variables found in VARLIST
     VARINT        = one VAR*MP interaction term per variable
   The three values are echoed to the log. */
/* Guard: the DO-UNTIL loop below always executes once, so an empty VARLIST
   would be counted as one variable and produce a bogus interaction term.
   Stop with an explicit error instead. */
%IF %SYSEVALF(%SUPERQ(VARLIST) =, BOOLEAN) %THEN %DO;
    %PUT ERROR: MP_ASSIGN - VARLIST is empty. At least one variable is required.;
    %RETURN;
%END;
%LET N = 1;
%LET VARINT = ;
%DO %UNTIL(%QSCAN(&VARLIST., &N. , %STR( )) EQ %STR( ));
    %LET VAR = %QSCAN(&VARLIST. , &N. , %STR( ));
    %LET VARINT = &VARINT &VAR.*MP;
    %LET N = %EVAL(&N. + 1);
%END;
/* N now points one past the last word, hence the -1 */
%LET KO = %EVAL(&N-1);
%LET M_MISSING = %EVAL(&N-1);
%PUT &VARINT;
%PUT &KO;
%PUT &M_MISSING;
/* Create indicators for missing values and missingness patterns.
   For every record of the input data set:
     M1-Mk = 1 if the k-th variable of VARLIST is missing, 0 otherwise
     MV    = pattern code: the indicators read as a binary number (first
             variable is the most significant bit) plus 1, so the pattern
             with nothing missing has MV = 1 */
DATA MS;
    SET &MSDATA;
    ARRAY MS{&M_MISSING} M1-M&M_MISSING.;
    ARRAY X{&M_MISSING} &VARLIST;
    MV = 0;
    DO I = 1 TO &M_MISSING;
        /* MISSING() returns 1/0 and, unlike a comparison with ".", also
           flags special numeric missing values (.A-.Z, ._) */
        MS{I} = MISSING(X{I});
        MV = 2*MV + MS{I};
    END;
    MV = MV + 1;
    DROP I;
RUN;
/* Only keep one record for each missingness pattern.
   Only the indicator columns and the pattern code are carried over: the later
   steps use nothing else from PATTERN, and leaving the analysis variables
   behind avoids a name clash with the COUNT column merged in from PROC FREQ. */
PROC SORT DATA = MS(KEEP = M1-M&M_MISSING. MV) OUT = PATTERN NODUPKEY;
    BY MV;
RUN;
/* Count how many observations fall into each missingness pattern.
   The one-way table of MV is written to data set M_MP (no printed output),
   keeping only the pattern code MV and its frequency COUNT. */
PROC FREQ DATA = MS NOPRINT;
    TABLE MV / OUT = M_MP(KEEP = MV COUNT);
RUN;
/* Attach the observation count of each pattern (COUNT from M_MP) to its
   representative record in PATTERN, matching on the pattern code MV.
   PATTERN was sorted by MV above; M_MP is presumably in MV order as well,
   as required by the BY-merge - confirm if the PROC FREQ call is changed. */
DATA PATTERN;
MERGE PATTERN M_MP;
BY MV;
RUN;
/* Order the patterns from the most to the least frequent */
PROC SORT DATA = PATTERN;
BY DESCENDING COUNT;
RUN;
/* Re-index the missingness patterns by frequency rank: PATTERN arrives sorted
   by descending COUNT, so MV_S = 1 is the largest pattern and the last row
   is the smallest. */
DATA PATTERN(KEEP = M1-M&M_MISSING. MV COUNT MV_S);
    /* RETAIN ahead of SET fixes the column order (indicators, MV, COUNT,
       MV_S); the PROC IML step that follows reads the columns by position */
    RETAIN M1-M&M_MISSING. MV COUNT MV_S;
    SET PATTERN;
    MV_S = _N_;
RUN;
PROC IML;
USE PATTERN;
READ ALL INTO A;
CLOSE PATTERN;
MS = A[, 1:&M_MISSING];
MV = A[, 1+&M_MISSING];
N_MP = A[, 2+&M_MISSING];
MV_S = A[, 3+&M_MISSING];
M_MP = NROW(MS);
M = NCOL(MS);
/* Calculate the distance between missingness patterns */
DISTANCE = J(M_MP, M_MP, 0);
DO I = 1 TO M_MP;
DO J = 1 TO I-1;
D = 0;
DO L = 1 TO M;
D = D + ( (MS[I,L]-MS[J,L])*(MS[I,L]-MS[J,L]) );
END;
DISTANCE[I,J] = D;
DISTANCE[J,I] = D;
END;
END;
I = 0;
K_MV_POOL = 0;
MV_POOL = J(M_MP, 1, 0);
/*Pooling small missingness patterns according to their similarities to reach a prespecified minimum number of observations (&N_MP_MIN) in each pattern */
DO WHILE( I < M_MP);
I = I + 1;
IF MV_POOL[I] = 0 THEN
DO;
K_MV_POOL = K_MV_POOL + 1;
N_MP_POOL = N_MP[I];
IF N_MP_POOL >= &N_MP_MIN THEN
DO;
MV_POOL[I] = K_MV_POOL;
END;
ELSE
DO;
IF I < M_MP THEN
DO;
A = DISTANCE[(I+1):M_MP, I];
B = MV[(I+1):M_MP];
C = N_MP[(I+1):M_MP];
D = MV_S[(I+1):M_MP];
E = MV_POOL[(I+1):M_MP];
TT = A || B || C || D || E;
CALL SORT( TT, {1 3});
J = 0;
DO WHILE( (N_MP_POOL < &N_MP_MIN) & (I+J < M_MP) );
J = J+1;
IF (TT[J,5] = 0) THEN
DO;
N_MP_POOL = N_MP_POOL + TT[J,3];
TT[J,5] = K_MV_POOL;
END;
END;
END;
IF ( N_MP_POOL >= &N_MP_MIN ) THEN
DO;
MV_POOL[I] = K_MV_POOL;
DO K = 1 TO J;
MV_POOL[TT[K,4]] = K_MV_POOL;
END;
END;
ELSE
DO J = I TO M_MP;
SGN_TMP = 0;
K = 1;
DO WHILE(SGN_TMP = 0 & K <= M_MP);
DO L = 1 TO M_MP;
IF (DISTANCE[J,L] = K) & (MV_POOL[J]=0) &
(MV_POOL[L]>0) THEN
DO;
MV_POOL[J] = MV_POOL[L];
SGN_TMP = 1;
END;
END;