/*------------------------------------------------------------------------------ File: DeleteSuperfluousRecords.sas Author: John Sabel Washington State Education Research and Data Center Creation date: 2013-08-10 Version: 0.941 Purpose Deletes superfluous records from a data set. A superfluous record here means a record that contains information that is a strict subset of another record. A typical use is with name data, where one record has a name with a complete middle name, and other records have just the middle initial, or no initial at all. In this case, the record with the middle name has more complete information than the other records, so this macro will delete the other records. For example, say you have two records with the same person id. These records differ only in that one has a middle initial and one does not. In this case, the record without the middle initial is a subset of the record with an initial, so it is deleted. Along with DsnOut, this macro also produces the data set DsnOut_DeletedRecords, which shows which records were deleted. Parameters DsnIn: The one or two-level dataset name that is inputted. DsnOut: The one or two-level dataset name that is created. DsnOut_DeletedRecords: Superfluous records deleted from DsnIn. StaticColumns: Space delimited list of columns whose values taken together are invariant. InputColumns: Space delimited list of columns. This macro deletes records with superfluous values for each column. Debug: (Optional) Y = Don't delete intermidiate datasets. N = Delete intermidiate datasets. Default is N. Signature: %DeleteSuperfluousRecords(DsnIn=, DsnOut=, DsnOut_DeletedRecords=DeletedRecords, StaticColumns=, InputColumns=, Debug=N) Revisions: Date Version Author Comments 2013-11-25 0.91 John 1) Version 0.9 had a bug where if two records the same up to and including the last column, instead of one of these two records being deleted, another, unrelated record was deleted. This version fixes that bug. 2)Changed macro name from DeleteSuperfluousNames.sas to DeleteSuperfluousRecords.sas 3) When the debug parameter is set to Y added the list of local macro variables and their values are outputted to the log. 2014-05-06 0.92 John Before this version, the NonKeyColumn parameter was embedded in a single ColumnsIn parameter as thePulled out the NonKeyColumnIn parameter from the old ColumnsIn parameter as the last in a list of columns. Now pulled out into own NonKeyColumn parameter. Also added some upfront parameter checking. 2015-10-28 0.93 John Had problem where if the NonKeyColumnIn word was contained as a piece of one of the KeyColumnsIn, then an error resulted. Fixed this, as well as some general cleanup of temporary data set names. Added NonKeyColumnIn column to DeletedSuperfluousRecords. 2017-04-07 0.94 John Major rewrite. No more NonKeyColumnIn parameter. 2017-01-25 0.941 John In data step that creates DsnOut, moved drop operation for _RcrdID from data option to a statement. This frees up the DsnIn parameter so it can use data step options. 2022-04-11 0.942 John Test data in previous versions would not be created properly. Update code. Also, added many more comments to source code to help me figure out how the macro works. ----------------------------------------------------------------------------- */ %macro DeleteSuperfluousRecords(DsnIn=, DsnOut=, DsnOut_DeletedRecords=DeletedRecords, StaticColumns=, InputColumns=, Debug=N); %put --- Start of %upcase(&sysmacroname) macro; %if &Debug.~=N %then %put _local_; %local _IsNullParameter; /* %let ExcludeColumns=%trim(%upcase(&ExcludeColumns.));*/ %if %sysfunc(exist(%scan(&DsnIn.,1,%str(%()))) ne 1 %then %do; %put %str(ERR)OR: (&sysmacroname.) The DsnIn parameter is null; %let _IsNullParameter=DsnIn; %end; %if &_IsNullParameter.~=%then %goto exit; /* Add master record ID for entire inputted data set. */ data _DeleteSuperfluous_01; length _MasterRcrdID 8; set &DsnIn.; _MasterRcrdID = _n_ - 1; run; /* Create an empty data set of just the dynamic columns (InputColumns). */ data _DeleteSuperfluous_DynCols_01; length &InputColumns. $1; run; proc sql noprint; /* Hmmm... Tranposes the column names as field under the single column NAME. Also counts the number of rows (number of dynamic columns. The results will all be "InputColumns" that are in DsnIn. */; CREATE TABLE _DeleteSuperfluous_DynCols_02 AS SELECT DsnInCols.NAME label=' ' FROM DICTIONARY.COLUMNS DynamCols INNER JOIN DICTIONARY.COLUMNS DsnInCols ON DynamCols.Name = DsnInCols.Name WHERE DynamCols.LIBNAME = 'WORK' AND DsnInCols.LIBNAME = 'WORK' AND DynamCols.MEMNAME = "%upcase(_DeleteSuperfluous_DynCols_01)" AND DsnInCols.MEMNAME = "%upcase(_DeleteSuperfluous_01)" ORDER BY DynamCols.varnum ; %let NumDynamicCols=&sqlobs.; /* Creates two sets of macro variables: 1) All dynamic column names, each column in its own variable, 2) All dynamic column names in a single space delimited variable. Here, dynamic columns means all the columns in the InputColumns macro parameter. */ SELECT NAME AS NameIndivualVars, NAME AS NameOneVar INTO :DynamicCol1 - :DynamicCol999, :DynamicAllCols SEPARATED BY " " FROM _DeleteSuperfluous_DynCols_02 ; %let NumDynamicCols=&sqlobs.; quit; %let StaticCols=; %let NumStaticCols=0; /* Create a space-delimited macro variable of the columns listed in the parameter StaticColumns exist in DsnIn. */ %if &StaticColumns.~=%then %do; data _DeleteSuperfluous_StatCols_01; length &StaticColumns. $1; run; proc sql noprint; SELECT Static.Name INTO :StaticCols SEPARATED BY " " FROM DICTIONARY.COLUMNS Static INNER JOIN DICTIONARY.COLUMNS DsnInCols ON Static.Name = DsnInCols.Name WHERE Static.LIBNAME = 'WORK' AND DsnInCols.LIBNAME = 'WORK' AND Static.MEMNAME = "%upcase(_DeleteSuperfluous_StatCols_01)" AND DsnInCols.MEMNAME = "%upcase(_DeleteSuperfluous_01)" ; %let NumStaticCols=&sqlobs.; quit; %end; /* Create a space-delimited macro variable of static column names followed by the dynamic column names. */ %let StaticAndDyamicCols=%sysfunc(COMPBL(&StaticCols. &DynamicAllCols.)); /* %put StaticAndDyamicCols: &StaticAndDyamicCols.; */ /* Sort DsnIn with _MasterRcrdID by the static columns followed by the dynamic columns. */ proc sort data=_DeleteSuperfluous_01; by &StaticAndDyamicCols.; run; /* Assign to macro variables the first column in &StaticAndDyamicCols., which if one or more static columns is present, will be a static column. Then assing the last (dynamic) column of InputColums parameter. */ %let _UltimateColumn = %scan(&StaticAndDyamicCols, -1, %str( )); %let _FirstColumn = %scan(&StaticAndDyamicCols, 1, %str( )); /* %put _UltimateColumn: &_UltimateColumn.; %put _FirstColumn: &_FirstColumn.; */ /* Make the initial delete superfluous records pass based on DISTINCTness of the set of static and dynamic columns. Only keep the last record where the values for the static and dynamic columns are are the same. If any records come before the last one, thus superfluous, output these into _DeleteSuperfluous_deleted_02. In the NonKeyColumnIn (I'm not sure about the suitability of the name), but the reason why, DISTINCT. At this point, I'm not sure why it needs to be in with the "good" records, _DeleteSuperfluous_01. */ data _DeleteSuperfluous_01 /* (drop=NonKeyColumnIn) */ _DeleteSuperfluous_deleted_02; length NonKeyColumnIn $32 _RcrdID 8; set _DeleteSuperfluous_01; by &StaticAndDyamicCols.; NonKeyColumnIn = 'DISTINCT'; _RcrdID = _n_; if last.&_UltimateColumn. then output _DeleteSuperfluous_01; else output _DeleteSuperfluous_deleted_02; run; /* In NOT ( ( the number of dynamic columsn = 0 (i.e. parameter InputColumns has no columns, or the columns it has don't exist in DsnIn ) OR ( There is only a single dynamic column listed in parameter InputColumns that actually exists in DsnIn AND there aren't any static columns in parameter StaticColumns that actually exist in DsnIN, in which case the DISTINCT above would have done all that could be done. ) ) then walk through the dynamic columns and delete the superfluous records found. */ %if ~(&NumDynamicCols.=0 | (&NumDynamicCols.=1 & &NumStaticCols.=0)) %then %do; /* %let NumDynamicCols=1;*/ %do i = 1 %to &NumDynamicCols.; /* %let I=1; * Since the I in &&DynamicCol&I is not set while highlighting code and running during debugging, for debugging purposes, set I=1; */ %put *** Dynamic column &&DynamicCol&I. loop. ***; /* If there NumDynamicCols=1 then the query below will return 0 rows and hence KeyColumns2 would not be set. Therefore, I'm creating it first so at the very least it will exist as an empty string. Create a space delimited list of all the InputColumns except for the current &&DynamicCol&I that the do loop is engaged in processing. */ %let KeyColumns2=;; /* Default value, empty string */ proc sql noprint; SELECT Name INTO :KeyColumns2 SEPARATED BY ' ' FROM _DeleteSuperfluous_DynCols_02 WHERE Name <> "&&DynamicCol&I" ; quit; /* These are the columns for sorting: as always the StaticColumns and all dynamic columns but &&DynamicCol&I. */ %let KeyColumns=&StaticColumns. &KeyColumns2.; /* Hmmm... Seems like _UltimateColumn would always be the same, and it would be the same as the value set it was set to above this loop. */ %let _UltimateColumn = %scan(&KeyColumns, -1, %str( )); /* %put KeyColumns2: &KeyColumns2.; %put KeyColumns: &KeyColumns.; %put _UltimateColumn: &_UltimateColumn.; %put %nrstr(&&DynamicCol&I.): &&DynamicCol&I. where I=&I.; */ /* Sort by yes the KeyColumns (StaticColumns) and in each iteration, one of the DynamicCols, moving from left to right in the list. Naturally when sorted by &&DynamicCol&I, the shorter string will sort lower than a longer string that begins with the same characters as the shorter string. For example, "abc" will sort lower than "abcde" */ proc sort data=_DeleteSuperfluous_01; by &KeyColumns. &&DynamicCol&I; run; /* Assign a RcrdID based on the sort order of &&DynamicCol&I. Flag records in each */ data _DeleteSuperfluous_02; length _RcrdID 8.; set _DeleteSuperfluous_01; by &KeyColumns. &&DynamicCol&I.; retain _NonKeyColumnCount; _RcrdID = _n_; _NonKeyColumnLag = lag(&&DynamicCol&I); if first.&_UltimateColumn. then do; /* Nothing to compare first record with */ _NonKeyColumnCount = 0; end; else do; /* If lagged field is same or a part of current record, or lagged field is empty then increment _NonKeyColumnCount. BTW, In the next data step, only the sets with the highest _NonKeyColumnCount will survive. */ if strip(&&DynamicCol&I) =: strip(_NonKeyColumnLag) or strip(_NonKeyColumnLag) = '' then _NonKeyColumnCount = _NonKeyColumnCount + 1; else _NonKeyColumnCount = 0; end; run; data IC_01_&&DynamicCol&I; set _DeleteSuperfluous_02; run; /* %put &KeyColumns.; proc sort in=_DeleteSuperfluous_02 out=IC_01_Ascending; by &KeyColumns. _RcrdID; run; proc sort in=_DeleteSuperfluous_02 out=IC_02_Descending; by &KeyColumns. descending _RcrdID; run; */ proc sort data=_DeleteSuperfluous_02; by &KeyColumns. descending _RcrdID; run; data IC_02_&&DynamicCol&I; length NonKeyColumnIn $32; set _DeleteSuperfluous_02; NonKeyColumnIn = "&&DynamicCol&I"; _NonKeyColumnCountLag = lag(_NonKeyColumnCount); if _NonKeyColumnCount < _NonKeyColumnCountLag then Keep='False'; else Keep='True'; /* Comment these out if you want to see how macro works. */ /* drop _NonKeyColumnCount _NonKeyColumnLag _NonKeyColumnCount _NonKeyColumnCountLag;*/ run; data _DeleteSuperfluous_03 _DeleteSuperfluous_deleted_01; length NonKeyColumnIn $32; set _DeleteSuperfluous_02; NonKeyColumnIn = "&&DynamicCol&I"; _NonKeyColumnCountLag = lag(_NonKeyColumnCount); if _NonKeyColumnCount < _NonKeyColumnCountLag then output _DeleteSuperfluous_deleted_01; else output _DeleteSuperfluous_03; /* Comment these out if you want to see how macro works. */ drop _NonKeyColumnCount _NonKeyColumnLag _NonKeyColumnCount _NonKeyColumnCountLag; run; /* %if &&DynamicCol&I=LastName %then %do;*/ /* %put %str(ERR)OR: Terminating condition met.;*/ /* %goto exit;*/ /* %end;*/ /* %if &I.=1 %then %do;*/ /* data _DeleteSuperfluous_deleted_02;*/ /* set _DeleteSuperfluous_deleted_01;*/ /* run;*/ /* %end;*/ /* %else %do;*/ /* Append the deleted records from the &&DynamicCol&I. loop to those previously identified deleted records. */ proc datasets nolist; append base=_DeleteSuperfluous_deleted_02 data=_DeleteSuperfluous_deleted_01 force; run;quit; /* %end;*/ %if &I.<&NumDynamicCols. %then %do; data _DeleteSuperfluous_01; set _DeleteSuperfluous_03 /*(drop=NonKeyColumnIn)*/; run; %end; %end; /* i loop */ %end; /* if */ /* Sort the records so that both what will soon be DsnOut and DsnOut_DeletedRecords are in the same sort order as what came in with DsnIn. This is a niceity, so you can more easily eyeball DsnIn and how it was split up into DsnOut and DsnOut_DeletedRecords. */ proc sort data = _DeleteSuperfluous_03; by _MasterRcrdID; run; proc sort data = _DeleteSuperfluous_deleted_02; by _MasterRcrdID; run; data &DsnOut.; set _DeleteSuperfluous_03; drop _RcrdID NonKeyColumnIn %if &Debug.=N %then _MasterRcrdID;; run; data &DsnOut_DeletedRecords. (drop=_RcrdID); set _DeleteSuperfluous_deleted_02; %if &Debug.=N %then %do; drop _MasterRcrdID; %end; run; %exit: %if &Debug.=N %then %do; proc sql; DROP TABLE _DeleteSuperfluous_01, _DeleteSuperfluous_02, _DeleteSuperfluous_03, _DeleteSuperfluous_deleted_01, _DeleteSuperfluous_deleted_02, _DeleteSuperfluous_DynCols_01, _DeleteSuperfluous_DynCols_02, _Deletesuperfluous_statcols_01; quit; %end; %if &Debug.~=N %then %put _local_; %put --- End of %upcase(&sysmacroname) macro; %mend DeleteSuperfluousRecords; /* * This sample data contains three superfluous records: 1) The Karen White record that does not have a middle name. 2) The Jack Brown record that does not have a middle name. 3) The Jack Brown record that has only the middle initial "B". 4) Two Sal Orange records, one with a shorter middle name than the other two, another where is an duplicate of the other. data SampleDataMiddleName; input PersonID LastName $ FirstName $ MiddleName $ DOB $; datalines; 1 White Karen . 5/5/2005 1 White Karen M 5/5/2005 2 Brown Jack . 6/6/2006 2 Brown Jack B 6/6/2006 2 Brown Jack BJ 6/6/2006 2 Brown Jack ZY 6/6/2006 3 Orange Sal A 6/6/2006 3 Orange Sal AB 6/6/2006 3 Orange Sal AB 6/6/2006 4 Green Lisa May 7/7/2007 ; run; %DeleteSuperfluousRecords(DsnIn=SampleDataMiddleName, DsnOut=SampleDataMiddleName_Out, DsnOut_DeletedRecords=SampleDataMiddleName_Deleted, InputColumns=MiddleName, StaticColumns=PersonID DOB LastName FirstName, Debug=N) data TestData; input @1 ID 1. @5 LastName $5. @13 FirstName $6. @21 MiddleName $6. @29 BirthDate $10.; datalines; 1 Smith John Rob 1922-01-02 1 Smith Jon R 1922-01-02 1 Smith John R 1922-01-02 1 Smith John R 1922-01-02 1 Smith John Robert 1 Smith John Robert 1922-01-02 2 Smith Jill 1944-03-04 2 Jones Stuart Beof 1966-05-06 2 Jones Stua Beof 1966-05-06 2 Jones Stuart B 1966-05-06 ; run; %let DsnIn=TestData; %let DsnOut=TestData_Out; %let InputColumns=MiddleName FirstName LastName BirthDate; *let DsnOut_DeletedRecords=TestData_DeletedRecords; %let StaticColumns=ID; %let Debug=Y; %DeleteSuperfluousRecords(DsnIn=TestData, DsnOut=TestData_Out, DsnOut_DeletedRecords=TestData_DeletedRecords, InputColumns=MiddleName FirstName LastName BirthDate, StaticColumns=ID, Debug=Y) data TestData; input @1 ID 1. @5 FirstName $10.; datalines; 0 Jonathony 1 Jo 1 Jonath 1 J 1 Jhon 1 John 1 Jona 1 Jonathon 1 Jon 2 Jonathony ; run; %DeleteSuperfluousRecords(DsnIn=TestData, DsnOut=TestData_Out, DsnOut_DeletedRecords=TestData_DeletedRecords, InputColumns=FirstName, StaticColumns=ID, Debug=Y) data TestDataSSN; input ID LastName $ FirstName $ DOB $ SSN $; datalines; 1 Walker Melinda . 55555555 1 Walker Melinda 5/4/1965 . 1 Walker Melin . 55555555 1 Walk Melinda 5/4/1985 . ; run; data TestEdgeCase_1; input ID; datalines; 5 ; run; %DeleteSuperfluousRecords(DsnIn=TestEdgeCase_1, DsnOut=TestEdgeCase_1_Out, DsnOut_DeletedRecords=TestEdgeCase_1_Delete, InputColumns=ID, StaticColumns=, Debug=Y) data TestEdgeCase_2; input ID; datalines; 4 4 ; run; %DeleteSuperfluousRecords(DsnIn=TestEdgeCase_2, DsnOut=TestEdgeCase_2_Out, DsnOut_DeletedRecords=TestEdgeCase_2_Delete, InputColumns=ID, StaticColumns=, Debug=Y) %DeleteSuperfluousRecords(DsnIn=TestEdgeCase_2, DsnOut=TestEdgeCase_2a_Out, DsnOut_DeletedRecords=TestEdgeCase_2a_Delete, InputColumns=, StaticColumns=ID, Debug=Y) data TestData; input @1 ID 1. @8 FirstName $9. @17 MiddleName $10. @27 LastName $7. @34 Birthdate $10.; datalines; 1 John Rob Smith 1922-01-02 1 Jon R Smith 1922-01-02 1 John Rybert Smith 1922-01-02 1 John R Smith 1922-01-02 1 John Robert Smith 1 John R Smith 1922-01-02 2 Jill Smith 1944-03-04 3 Stuart Beof Jones 1966-05-06 3 Stua Beof Jones 1966-05-06 3 Stuart B Jones 1966-05-06 ; run; %DeleteSuperfluousRecords(DsnIn=TestData, DsnOut=TestData_Out, DsnOut_DeletedRecords=TestData_Delete, StaticColumns=ID, InputColumns=MiddleName FirstName LastName Birthdate, Debug=Y) */