********************************************************************************************
***DISCERN/ ASHISH ARORA, SHARON BELENZON, LIA SHEER (DUKE UNIVERSITY) / DECEMBER 2020***
********************************************************************************************
********************************************************************************************
*NPL cleaning example code
/*
We begin with a many-to-many match, allowing more than one publication to be matched to each citation.
For each candidate pair of records, we construct a score that captures the degree of textual overlap between the title, journal, authors and publication year.
To exclude mismatches, we use a more detailed matching algorithm that is based on different sources of publication information.
We present the detailed algorithm below. We then proceed with a thorough manual check for false negatives and false positives.
*/

*PART 1: detailed matching algorithm
*Run this for each of the raw output files:
forvalues k=1/20 {
* Load raw match output number k and pair every row with the US patent records
* that share its npl_publn_id (joinby forms all pairwise combinations within the key).
use "Output_`k'_All", clear
joinby npl_publn_id using "npl_cited-us_clean_1980_2015.dta"
drop publn_kind publn_date npl_type
duplicates drop

* Patent screen, described by the original authors as keeping only granted utility patents:
* rows whose publication number contains RE are removed, and only publication numbers
* that are exactly 7 characters long are kept.
drop if regexm(publn_nr,"RE")
keep if length(publn_nr)==7

* Working copy of the raw citation text: runs of blanks collapsed, both ends trimmed.
gen cite=npl_biblio
replace cite=ustrltrim(ustrrtrim(trim(itrim(cite))))

*dropping irrelevant npl by date
* The first two patterns capture a month-style date and always overwrite; the remaining
* patterns only fill in a year when nothing has been captured yet.
gen date=""
replace date=regexs(0) if regexm(cite,"([A-Z][a-z][a-z])(\.?) ([0-9])([0-9]?), ([1-9])([0-9][0-9])([0-9])")
replace date=regexs(0) if regexm(cite,"([A-Z][a-z][a-z]\.)(,?) ([1-9])([0-9][0-9])([0-9])")
replace date=regexs(2) if date=="" & regexm(cite,"(\()([1-2][0-9][0-9][0-9])(\)\.$)")
replace date=regexs(2) if date=="" & regexm(cite,"(, )([1-2][0-9][0-9][0-9])(\.$)")
replace date=regexs(1) if date=="" & regexm(cite,"([1-2][0-9][0-9][0-9])(, pp\.)")
replace date=regexs(2) if date=="" & regexm(cite,"(\()([1-2][0-9][0-9][0-9])(\))(, pp\.)")
replace date=regexs(2) if date=="" & regexm(cite,"(et al\. \()([1-2][0-9][0-9][0-9])(\))")
replace date=regexs(2) if date=="" & regexm(cite,"(et al\., )([1-2][0-9][0-9][0-9])")

* Keep the last four characters (the year), convert to a number and drop citations
* dated 1951 through 1979. Rows without a captured year are kept.
replace date=substr(date,-4,4)
destring date, replace
drop if inrange(date,1951,1979)

* Normalise the citation for keyword screening: upper case, hyphens and commas turned
* into blanks, runs of blanks collapsed, both ends trimmed.
replace cite=ustrupper(cite)
replace cite=subinstr(cite,"-"," ",.)
replace cite=subinstr(cite,","," ",.)
replace cite=ustrltrim(ustrrtrim(trim(itrim(cite))))

*dropping irrelevant npl by doc type
* Each flag below is 1 when the citation contains at least one of the listed patterns.
* Changes relative to the earlier version of this screen:
*   - JP ABSTRACT replaces the hyphenated form, which could never match because hyphens
*     have already been turned into blanks above;
*   - the U.S. TRADEMARK pattern had a misplaced backslash;
*   - RETRIEVED ON is tested in addition to the misspelled RETRIVED ON;
*   - patterns that were listed more than once are listed once.

* p: patent office documents, applications and patent abstracts
gen p=( regexm(cite,"U\.S\. APPL NO") ///
	| regexm(cite,"US PATENT") ///
	| regexm(cite,"PROVISIONAL APPLICATION") ///
	| regexm(cite,"U\.S\. SERIAL NO") ///
	| regexm(cite,"U\.S\. TRADEMARK") ///
	| regexm(cite,"USPTO ") ///
	| regexm(cite,"(WITHDRAWN)") ///
	| regexm(cite,"USSN") ///
	| regexm(cite,"U\.S\. PATENT") ///
	| regexm(cite,"PENDING PUBLICATION") ///
	| regexm(cite,"APPLICATION NO") ///
	| regexm(cite,"U\.S\. SER\. NO") ///
	| regexm(cite,"JAPANESE PATENT") ///
	| regexm(cite,"U\.S\. APP\. NO") ///
	| regexm(cite,"U\.S\. OFFICE ACTION MAILED") ///
	| regexm(cite,"OFFICIAL GAZETTE") ///
	| regexm(cite,"U\.S\. APPLICATION") ///
	| regexm(cite,"US PUB\. NO\.") ///
	| regexm(cite,"ABSTRACT OF") ///
	| regexm(cite,"PATENT AND TRADEMARK") ///
	| regexm(cite,"ABSTRACT NO") ///
	| regexm(cite,"SEARCH REPORT PCT") ///
	| regexm(cite,"APPL\. NO\.") ///
	| regexm(cite,"PATENT ABSTRACT") ///
	| regexm(cite,"PATENTS ABSTRACT") ///
	| regexm(cite,"JP ABSTRACT") ///
	| regexm(cite,"PATENT APPLICATION") ///
	| regexm(cite,"PCT PUBLICATION") )

* w: online material and web addresses
gen w=( regexm(cite,"ONLINE MATERIAL") ///
	| regexm(cite,"\((ONLINE)\)") ///
	| regexm(cite,"\[(ONLINE)\]") ///
	| regexm(cite,"PUBLISHED ONLINE") ///
	| regexm(cite,"PUBLISHED ON LINE") ///
	| regexm(cite,"WEB PAGE") ///
	| regexm(cite,"(\.)COM") ///
	| regexm(cite,"RETRIVED ON") ///
	| regexm(cite,"RETRIEVED ON") ///
	| regexm(cite,"HTTP://") ///
	| regexm(cite,"WWW\.") ///
	| regexm(cite,"WEBSITE") ///
	| regexm(cite,"WEBPAGE") )

* d: news, trade press, product literature, databases, reports and catalogues
gen d=( regexm(cite," NEWS ") ///
	| regexm(cite," IN PRESS ") ///
	| regexm(cite,"BUSINESS WIRE") ///
	| regexm(cite,"NEWYORK TIMES") ///
	| regexm(cite,"NEW YORK TIMES") ///
	| regexm(cite,"DATA SHEET") ///
	| regexm(cite,"PRODUCT SHEET") ///
	| regexm(cite,"DATASHEET") ///
	| regexm(cite," NEWS\.") ///
	| regexm(cite,"BUSINESS WEEK") ///
	| regexm(cite,"CHEMICAL ABSTRACT") ///
	| regexm(cite,"CHEM\. ABS") ///
	| regexm(cite,"WALL STREET") ///
	| regexm(cite,"US COURT ") ///
	| regexm(cite,"WHITE PAPER") ///
	| regexm(cite,"AGILENT PUBLICATION") ///
	| regexm(cite,"DATABASE") ///
	| regexm(cite,"PRESS RELEASE") ///
	| regexm(cite,"SERVICE MEMO") ///
	| regexm(cite,"KATALOG") ///
	| regexm(cite,"ADVERTISEMENT") ///
	| regexm(cite," DIAGRAM") ///
	| regexm(cite,"MAGAZINE") ///
	| regexm(cite,"GENBANK ACCESSION") ///
	| regexm(cite,"PRODUCT PROFILE") ///
	| regexm(cite,"PROGRAM INTERFACE") ///
	| regexm(cite,"PRODUCT INFORMATION") ///
	| regexm(cite,"FACT SHEET") ///
	| regexm(cite,"PUBLICITY") ///
	| regexm(cite,"NEWSLETTER") ///
	| regexm(cite,"PRODUCT LITERATURE") ///
	| regexm(cite,"BROCHURE") ///
	| regexm(cite," REPORT") ///
	| regexm(cite," CATALOG") )

* d2: legal filings, presentations, guides and manuals
gen d2=( regexm(cite,"EXHIBITS NO") ///
	| regexm(cite,"ACCESSION NO") ///
	| regexm(cite,"COURT") ///
	| regexm(cite,"COMMUNICATION OF") ///
	| regexm(cite,"COMPLAINT FOR PATENT") ///
	| regexm(cite,"COMPLAINT FILED") ///
	| regexm(cite,"POWERPOINT PRESENTATION") ///
	| regexm(cite,"PLEADING IN CIVIL") ///
	| regexm(cite,"PATENT GRANT") ///
	| regexm(cite," ARTICLE") ///
	| regexm(cite," OPINION") ///
	| regexm(cite," GUIDE") ///
	| regexm(cite," MANUAL") )

* w2: office communications, court documents, translations and internet retrievals
gen w2=( regexm(cite,"FOREIGN COMMUNICATION") ///
	| regexm(cite,"WHITEPAPER ") ///
	| regexm(cite,"TRANSCRIPT ") ///
	| regexm(cite,"REQUEST FOR") ///
	| regexm(cite,"OFFICIAL COMMUNICATION") ///
	| regexm(cite,"OFFICE COMMUNICATION") ///
	| regexm(cite,"BRIEF IN") ///
	| regexm(cite,"BRIEF FOR") ///
	| regexm(cite,"DECLARATION OF") ///
	| regexm(cite,"DEFENDANT") ///
	| regexm(cite,"INTELLECTUAL PROPERTY") ///
	| regexm(cite,"CIVIL ACTION NO") ///
	| regexm(cite,"TRANSLATION") ///
	| regexm(cite,"NOTICE OF") ///
	| regexm(cite,"OFFICIAL ACTION") ///
	| regexm(cite,"OFFICE ACTION") ///
	| regexm(cite,"U\.S\. APPL\. NO") ///
	| regexm(cite," GENBANK") ///
	| regexm(cite,"CERTIFIED ENGLISH TRANSLATION") ///
	| regexm(cite,"WIKIPEDIA") ///
	| regexm(cite,"RETRIEVED FROM THE INTERNET") )

* Remove every flagged citation, then discard the flags and the working copy of the text.
drop if p==1 | w==1 | d==1 | d2==1 | w2==1
drop p w d d2 w2 cite

* Park the raw citation text under the name patref while publication fields are attached.
rename npl_biblio patref

* One row per patent / NPL reference / candidate publication.
duplicates drop publn_nr pat_publn_id npl_publn_id pubid , force
rename pubid wos_rec

*merging pub data
* Rows that exist only in the using file are discarded and no _merge variable is kept.
* NOTE(review): merge m:m pairs rows within a key by position rather than forming all
* combinations; if the using file holds one row per wos_rec, an m:1 merge would state
* the intent and fail loudly on duplicates - confirm against the data.
merge m:m wos_rec using "Liaformat_1980_2015", keepusing (pubyear source vol page issue authors) keep(master match) nogenerate

rename pubyear j_date
destring j_date, replace

* Drop pairs where the patent year is more than two years before the publication year.
* A missing j_date compares as larger than any number, so rows without a publication
* year are dropped here as well whenever publn_year is not missing.
drop if publn_year<(j_date-2)

* Restore the citation text under the name cite, blanks collapsed and ends trimmed.
rename patref cite
replace cite=ustrltrim(ustrrtrim(trim(itrim(cite))))

/* Normalised title (title2) plus its first four and first eight words, used for matching. */
gen title2=ustrupper(title)

* First punctuation pass. Characters in the first loop are deleted, characters in the
* second loop become blanks. Apostrophe, grave accent and backslash are deleted on
* lines of their own because they interact with macro quoting.
foreach ch in "." "+" "*" "^" "/" {
	replace title2=subinstr(title2,"`ch'","",.)
}
replace title2=subinstr(title2,"'","",.)
replace title2=subinstr(title2,"`","",.)
replace title2=subinstr(title2,"\","",.)
foreach ch in "," "-" ":" "(" ")" "?" "]" "[" ";" "%" {
	replace title2=subinstr(title2,"`ch'"," ",.)
}

* The word PERCENT is blanked after the first pass and before the second one.
replace title2=subinstr(title2,"PERCENT"," ",.)

* Second punctuation pass, same convention: delete first, then blank.
foreach ch in "!" "@" "=" {
	replace title2=subinstr(title2,"`ch'","",.)
}
foreach ch in "<" ">" "&" {
	replace title2=subinstr(title2,"`ch'"," ",.)
}
replace title2=subinstr(title2,"}"," ",.)
replace title2=subinstr(title2,"{"," ",.)

* Pad with one blank on each side so whole-word replacements also hit the first and last word.
replace title2=" "+trim(itrim(title2))+" "

* Stop words, in this order.
foreach sw in AND THE AN A {
	replace title2=subinstr(title2," `sw' "," ",.)
}
replace title2=subinstr(title2,"HOT PAPERS"," ",.)

* Number words and roman numerals two to five become digits.
local digit=2
foreach pair in "TWO II" "THREE III" "FOUR IV" "FIVE V" {
	foreach tok of local pair {
		replace title2=subinstr(title2," `tok' "," `digit' ",.)
	}
	local ++digit
}

replace title2=ustrltrim(ustrrtrim(trim(itrim(title2))))

* Word count and the concatenated first 4 / first 8 words, taken before blanks are removed.
gen wct=wordcount(title2)
gen title4=""
gen title8=""
forvalues i=1/8 {
	if `i'<=4 replace title4=title4+word(title2,`i')
	replace title8=title8+word(title2,`i')
}

* Finally squeeze every blank out of the title.
replace title2=subinstr(title2," ","",.)


*cite clean
* cite1: citation text normalised the same way as title2 (for the title comparison).
* cite4: an intermediate copy taken before stop words are removed (for the journal check).
gen cite1=ustrupper(cite)

* Punctuation: characters in the first loop are deleted, characters in the second loop
* become blanks. Apostrophe, grave accent, backslash and the curly braces are handled
* on lines of their own because they interact with macro quoting and loop parsing.
foreach ch in "." "?" "+" "*" "^" "/" "!" "@" "=" {
	replace cite1=subinstr(cite1,"`ch'","",.)
}
replace cite1=subinstr(cite1,"'","",.)
replace cite1=subinstr(cite1,"`","",.)
replace cite1=subinstr(cite1,"\","",.)
foreach ch in "," "-" ":" "(" ")" "]" "[" ";" "%" ">" "<" "&" {
	replace cite1=subinstr(cite1,"`ch'"," ",.)
}
replace cite1=subinstr(cite1,"{"," ",.)
replace cite1=subinstr(cite1,"}"," ",.)
replace cite1=subinstr(cite1," AND "," ",.)

* cite4 is this intermediate text with all blanks removed.
gen cite4=subinstr(trim(itrim(cite1))," ","",.)

* Pad with one blank on each side so whole-word replacements also hit the first and last word.
replace cite1=" "+ustrltrim(ustrrtrim(trim(itrim(cite1))))+" "

foreach sw in THE AN A {
	replace cite1=subinstr(cite1," `sw' "," ",.)
}
replace cite1=subinstr(cite1,"PERCENT"," ",.)

* Number words and roman numerals two to five become digits.
local digit=2
foreach pair in "TWO II" "THREE III" "FOUR IV" "FIVE V" {
	foreach tok of local pair {
		replace cite1=subinstr(cite1," `tok' "," `digit' ",.)
	}
	local ++digit
}

* Trim and squeeze every blank out of the citation.
replace cite1=ustrltrim(ustrrtrim(trim(itrim(cite1))))
replace cite1=subinstr(cite1," ","",.)


*year check
* check_year is 1 when the raw citation text contains the publication year.
* j_date is converted to a string so it can be used as the search pattern. tostring
* writes a missing numeric year as a single period, which as a regular expression
* matches any character, so that value is excluded explicitly; otherwise a row with
* no publication year would always pass the year check.
tostring j_date, replace
gen check_year=(regexm(cite,j_date) & j_date!="" & j_date!=".")

*page check
* Normalise the page field: upper case, two malformed range endings rewritten to -000,
* ends trimmed and runs of blanks collapsed.
replace page=upper(page)
replace page=subinstr(subinstr(page,"-&AMP","-000",.),"-+","-000",.)
replace page=itrim(trim(page))

* Alternative spellings of a page range: blank, the word to, and a spaced hyphen.
gen page2=subinstr(page,"-"," ",.)
gen page3=subinstr(page,"-"," to ",.)
gen page4=subinstr(page,"-"," - ",.)

* page5: first run of digits (with an optional letter on either side), only for page
* strings longer than 6 characters; otherwise a placeholder of seven zeros.
gen page5="0000000"
replace page5=regexs(0) if regexm(page,"([A-Z]?)([0-9]+)([A-Z]?)") & strlen(page)>6

* page6: the page string with up to four capital letters removed.
gen page6=page
forvalues pass=1/4 {
	replace page6=regexr(page6,"[A-Z]","")
}

* page8: for page strings of length 7 or 9, the first hyphen-plus-two-digits is reduced
* to a bare hyphen; otherwise a placeholder of seven zeros. Used by check_page0 later.
gen page8="0000000"
replace page8=regexr(page,"-[0-9][0-9]","-") if inlist(strlen(page),7,9)

* check_page is 1 when the page field is not empty and any of the variants above
* (page, page2, page3, page4, page5, page6) is found in the raw citation text.
gen check_page=(page!="") & ( regexm(cite,page) | regexm(cite,page2) | regexm(cite,page4) ///
	| regexm(cite,page6) | regexm(cite,page5) | regexm(cite,page3) )

*Volume check
* vol1 is the cleaned volume. vol2-vol5 are search patterns built from it:
*   vol2  the issue in literal parentheses
*   vol3  VOL followed by the volume
*   vol4  the volume followed by the issue in parentheses
*   vol5  the volume followed by a colon
gen vol1=upper(vol)
replace vol1=trim(vol1)
replace vol1=itrim(vol1)
gen vol2="\("+issue+"\)"
gen vol3="VOL"+vol1
gen vol4=vol1+vol2
gen vol5=vol1+":"

* page9: volume, colon, first page. The replace below is kept for safety but cannot
* trigger, because page9 always contains at least the colon.
gen page9=vol1+":"+page5
replace page9="0000000" if page9==""

* check_page0: a stricter page check. It is 1 when the page field is not empty and the
* citation contains page9, page8, the page string itself or one of its range spellings.
gen check_page0=(page!="") & ( regexm(cite,page9) | regexm(cite,page8) | regexm(cite,page) ///
	| regexm(cite,page2) | regexm(cite,page4) | regexm(cite,page3) )

* cite3: upper-case citation without periods, with commas, semicolons and slashes
* treated as blanks, and then with all blanks removed.
gen cite3=ustrupper(cite)
replace cite3=subinstr(cite3 ,".","",.)
replace cite3=subinstr(cite3, ","," ",.)
replace cite3=subinstr(cite3, ";"," ",.)
replace cite3=subinstr(cite3, "/"," ",.)
replace cite3= itrim(cite3)
replace cite3= trim(cite3)
replace cite3=subinstr(cite3," ","",.)

* check_vol is 1 when the publication has a volume and the citation contains one of the
* volume patterns, or the bare volume together with a page match.
* The test is on vol1 itself. Testing the built patterns for emptiness does not work:
* with an empty volume vol3 is still VOL and vol5 is still a colon, so any citation
* containing VOL or a colon would pass the volume check.
gen check_vol=(vol1!="") & ( regexm(cite3,vol3) | regexm(cite3,vol4) | regexm(cite3,vol5) ///
	| (regexm(cite3,vol1) & check_page==1) )

*Journal check
* source1: cleaned journal name. j_name1: the same name without blanks, used as the
* key into the journal abbreviation file.
gen source1=upper(source)

* Strip the letters AMP (with and without a trailing semicolon) before any other cleaning.
replace source1=subinstr( source1,"AMP;","",.)
replace source1=subinstr( source1,"AMP","",.)

* Punctuation: characters in the first loop become blanks, characters in the second
* loop are deleted. Apostrophe, grave accent, backslash and the curly braces are
* handled on lines of their own because they interact with macro quoting and loop parsing.
foreach ch in "-" "." "," ":" ";" "(" ")" "]" "[" "%" {
	replace source1=subinstr(source1,"`ch'"," ",.)
}
replace source1=subinstr(source1,"{"," ",.)
replace source1=subinstr(source1,"}"," ",.)
foreach ch in "?" "+" "*" "^" "/" "!" "@" "=" ">" "<" {
	replace source1=subinstr(source1,"`ch'","",.)
}
replace source1=subinstr(source1,"'","",.)
replace source1=subinstr(source1,"`","",.)
replace source1=subinstr(source1,"\","",.)

replace source1=itrim(trim(source1))
replace source1=subinstr(source1,"&"," ",.)
replace source1=subinstr(source1," AND "," ",.)
replace source1=trim(itrim(source1))

gen j_name1=subinstr(trim(itrim(source1))," ","",.)

* Any name containing IEEE, ACM or ACTA as a whole word collapses to that word alone.
replace source1=" "+source1+" "
foreach fam in IEEE ACM ACTA {
	replace source1="`fam'" if regexm(source1," `fam' ")
}
replace source1=trim(itrim(source1))

* Remove generic leading phrases, in this order, and a trailing JOURNAL.
foreach lead in "JOURNAL OF THE" "AMERICAN JOURNAL OF" "BRITISH JOURNAL OF" "EUROPEAN JOURNAL OF" "INTERNATIONAL JOURNAL OF" "JOURNAL OF INTERVENTIONAL" "AMERICAN REVIEW OF" "JOURNAL OF" {
	replace source1=regexr(source1,"^`lead'"," ")
}
replace source1=regexr(source1," JOURNAL$"," ")
replace source1=trim(itrim(source1))
replace source1=subinstr(source1," ","",.)

*journal standardized name file
* Rows that exist only in the using file are discarded and no _merge variable is kept.
* NOTE(review): merge m:m pairs rows within a key by position; if the abbreviation file
* holds one row per j_name1, an m:1 merge would state the intent - confirm.
merge m:m j_name1 using "journal_abbriviation.dta" , keepusing(j_abbriv1 j_abbriv2 j_abbriv3) keep(master match) nogenerate

* check_journal is 1 when the blank-free citation (cite4) contains the shortened name,
* one of the three abbreviations or the full blank-free name, each only when not empty.
gen check_journal=( (regexm(cite4,source1) & source1!="") ///
	| (regexm(cite4,j_abbriv1) & j_abbriv1!="") ///
	| (regexm(cite4,j_abbriv2) & j_abbriv2!="") ///
	| (regexm(cite4,j_abbriv3) & j_abbriv3!="") ///
	| (regexm(cite4,j_name1) & j_name1!="") )


***
* Pairwise combinations of the individual checks. Each flag is 1 only when both checks
* in the pair passed: t1 journal+year, t2 page+year, t3 page+vol, t4 journal+vol,
* t5 journal+page, t6 year+vol.
local n=0
foreach pair in "journal year" "page year" "page vol" "journal vol" "journal page" "year vol" {
	local ++n
	local first : word 1 of `pair'
	local second : word 2 of `pair'
	gen t`n'=(check_`first'==1 & check_`second'==1)
}


****

*Fuzzy Title check
* fits: the cleaned citation is at least as long as the cleaned title.
* pairhit: at least one of the pairwise flags t1 to t5 is set.
* ustrdist is not a built-in Stata command; presumably it comes from the user-written
* strdist package - confirm it is installed before running.
local fits "ustrlen(cite1)>=ustrlen(title2)"
local pairhit "(t1==1|t2==1|t3==1|t4==1|t5==1)"

* Exact containment of the title in the citation.
gen check_title=(`fits') & regexm(cite1,title2)

* Titles shorter than four words need support from a pairwise flag or the strict page check.
replace check_title=0 if wct<4 & !`pairhit' & check_page0==0

* dist2 is the edit distance between citation and title minus their length difference.
gen diff=ustrlen(cite1)-ustrlen(title2) if `fits'
ustrdist cite1 title2 if `fits', generate(dist1)
gen dist2=dist1-diff if `fits'

* Near matches are accepted depending on title length and supporting checks.
replace check_title=1 if dist2<5 & dist2!=. & wct>5 & (check_page==1|check_year==1|check_journal==1)
replace check_title=1 if dist2<3 & dist2!=. & wct>4
replace check_title=1 if dist2<3 & dist2!=. & wct==3 & `pairhit'

*check match by 4&8 first words
* An empty title never matches; this reset comes before the first-words rules below.
replace check_title=0 if title2==""
gen check_4wrd=(regexm(cite1,title4) & wct>2 & title4!="")
gen check_8wrd=(regexm(cite1,title8) & wct>7 & title8!="")
replace check_title=1 if check_4wrd==1 & check_page0==1
replace check_title=1 if check_8wrd==1 & (t1==1|t4==1|check_page==1)



*fuzzy Authors check
* Split the author string on the character 1 into authors1, authors2, ... and keep the
* unsplit string under the name auth.
split authors, p("1")
rename authors auth

/* anonmiss: number of non-empty author variables produced by split */
egen anonmiss = rownonmiss ( authors* ) , strok

* Clean every author variable: commas, semicolons and periods become blanks, apostrophes
* and grave accents are deleted, blanks are trimmed and collapsed, text is upper-cased.
* An entry is then emptied when it is a single word shorter than 3 characters, or two
* words whose first word is shorter than 3 characters.
foreach au of varlist authors* {
	di "`au'"
	foreach ch in "," ";" "." {
		replace `au'=subinstr(`au',"`ch'"," ",.)
	}
	replace `au'=subinstr(`au',"'","",.)
	replace `au'=subinstr(`au',"`","",.)
	replace `au'=ustrupper(itrim(trim(`au')))
	replace `au'="" if (wordcount(`au')==1 & ustrlen(`au')<3) | (wordcount(`au')==2 & ustrlen(word(`au', 1))<3)
}

/* anonmiss2: number of author variables still non-empty after cleaning */
egen anonmiss2 = rownonmiss ( authors* ) , strok

/* cite2: citation text prepared for the author comparison */
gen cite2=ustrupper(cite)
foreach ch in "." "," ";" "-" {
	replace cite2=subinstr(cite2,"`ch'"," ",.)
}
replace cite2=subinstr(cite2, "'","",.)
replace cite2=subinstr(cite2, "`","",.)
replace cite2=trim(itrim(cite2))

* l1 is 1 when the citation contains ET AL (or the misspelling AT AL), otherwise 0.
gen l1=(regexm(cite2," ET AL") | regexm(cite2," AT AL"))

* When such a marker is present, keep only the text from the start of the citation up to
* and including the marker.
replace cite2= regexs(0) if regexm(cite2,"^((.)*) ET AL")
replace cite2= regexs(0) if regexm(cite2,"^((.)*) AT AL")

* Trim and squeeze every blank out of the result.
replace cite2=subinstr(trim(itrim(cite2))," ","",.)


* For every author variable, build Stata expressions (stored as text in local macros)
* for parts of the author string, then flag whether those parts occur in cite2.
* Three flags are created per author variable, each 1 or missing:
*   t_<var>_low   a single name part is found
*   t_<var>_med   a combination of two name parts, or a name part followed by ETAL, is found
*   t_<var>_high  a combination of three name parts is found
foreach var of varlist authors*{
di "`var'"
* w1-w3: first, second and third word of the author string; wc: its word count
local w1=`"word(`var',1)"'
local w2=`"word(`var',2)"'
local w3=`"word(`var',3)"'
local wc=`"wordcount(`var')"'
* b1: the part of the first word after its first hyphen; b2: the part before that hyphen
local b1= `"substr(substr(`w1',strpos(`w1',"-"),.),2,.)"'
local b2= `"substr(`w1',1,(strpos(`w1',"-")-1))"'
* b3, b4: first or second word immediately followed by ETAL
local b3=`"`w1'+"ETAL""'
local b4=`"`w2'+"ETAL""'
* v2-v5: two-part combinations (full words in both orders, and first word with the initial of the second)
local v2=`"`w1'+`w2'"'
local v3=`"`w2'+`w1'"'
local v4=`"`w1'+substr(`w2',1,1)"'
local v5=`"substr(`w2',1,1)+`w1'"'
* v6-v10: three-part combinations in several orders, and first two words with the initial of the third
local v6=`"`w1'+`w2'+`w3'"'
local v7=`"`w2'+`w3'+`w1'"'
local v8=`"`w3'+`w1'+`w2'"'
local v9=`"`w2'+`w1'+`w3'"'
local v10=`"`w1'+`w2'+substr(`w3',1,1)"'
* low: a single part longer than 2 characters is found - the first word (fewer than 3 words),
* either side of a hyphenated first word, or the second word (exactly 3 words)
gen t_`var'_low=1 if ((((regexm(cite2,`w1')==1)& (ustrlen(`w1')>2) &(`wc'<3))|((regexm(cite2,`b1')==1)&(ustrlen(`b1')>2)&(regexm(`w1',"-")==1))|((regexm(cite2,`b2')==1) & (ustrlen(`b2')>2)&(regexm(`w1',"-")==1))|((regexm(cite2,`w2')==1) & (ustrlen(`w2')>2) & (`wc'==3))) & (`wc'>0))
* med: a two-part combination is found; the ETAL variants only count when anonmiss is not 1
gen t_`var'_med=1 if ((((regexm(cite2,`v2')==1) &(`wc'<4))|((regexm(cite2,`v3')==1) &(`wc'<4))|((regexm(cite2,`v4')==1) &(`wc'<3))|((regexm(cite2,`v5')==1) & (`wc'<3))|((regexm(cite2,`b3')==1) & (`wc'<3)  & (anonmiss!=1))|((regexm(cite2,`b4')==1) & (`wc'==3) & (anonmiss!=1))) & (`wc'>0)) 
* high: a three-part combination is found, or the third word (longer than 2 characters) of a four-word name
* NOTE(review): for a one-word name the two- and three-part combinations reduce to that
* single word, so one hit appears to set low, med and high together - confirm this is intended.
gen t_`var'_high=1 if (((regexm(cite2,`v6')==1 )|(regexm(cite2,`v7')==1)|(regexm(cite2,`v8')==1)|(regexm(cite2,`v9')==1)|(regexm(cite2,`v10')==1)|(regexm(cite2,`w3')==1 &(`wc'==4)& (ustrlen(`w3')>2)))& (`wc'>0))
* NOTE(review): presumably meant to clear the w, v and b locals; every one of them is
* reassigned at the top of each iteration, so confirm whether this line is needed.
local w* v* b* ""
}
***********

* Per author variable, add up its three match flags (low, med, high). A total of zero is
* stored as missing so that rownonmiss below counts only authors with at least one hit.
* The flag names are matched with the pattern t_<var>_* rather than t_<var>*: without
* the underscore, the pattern for authors1 would also pick up the flags of authors10,
* authors11 and so on, inflating the totals whenever there are ten or more author columns.
foreach var of varlist authors*{
di "`var'"
egen sum_`var'=rowtotal(t_`var'_*)
replace sum_`var'=. if sum_`var'==0
}
/*number of name matches*/
* snonmiss: number of author variables with at least one hit
* rowtotalsa: total number of author match flags set across all author variables
egen snonmiss = rownonmiss ( sum_authors* ) 
egen  rowtotalsa= rowtotal(sum_authors*)

* Store the scored candidates for chunk k and empty memory before the next chunk.
save "Output_`k'_relevant_long_clean", replace
clear
}



*Part 1 (continued): run this for each of the cleaned output files (shown here for file 1).
*We combine the match results for the different features (title, authors and journal information) according to their relative importance, in order to determine a final match.
use "Output_1_relevant_long_clean", clear

* date2 holds a year between 1960 and 1979 found anywhere in the citation text. Such rows
* are dropped unless the year is directly next to a hyphen (either side) in the citation.
gen date2=regexs(0) if regexm(cite,"[1][9][6-7][0-9]")
destring date2, replace
drop if date2<1980 & !(regexm(cite,"[1][9][6-7][0-9]-") | regexm(cite,"-[1][9][6-7][0-9]"))

* Compare the year extracted from the citation (date) with the publication year (j_date):
* drop rows where date is 1970 or later and more than two years before j_date, or where
* date is 2015 or earlier and more than two years after j_date.
destring j_date, replace
drop if date!=. & ((date<j_date-2 & date>=1970) | (date>j_date+2 & date<=2015))


* Decision rules for the variable final: the value drop marks a rejected pair, an empty
* string marks an accepted pair. The rules run top to bottom and later rules override
* earlier ones, so their order matters.

* total_check: row sum of every variable whose name starts with check_
egen total_check=rowtotal(check_*)

* Step 1 - reject pairs with no author hit, no passed check at all, or no title match.
gen final="drop" if rowtotalsa==0 
replace final="drop" if total_check==0
replace final="drop" if check_title==0 

* Reject single-author publications whose citation carries an ET AL marker (l1==1).
* NOTE(review): sum_authors1 is set to missing whenever it is zero earlier in this file,
* and missing is not equal to zero, so the sum_authors1!=0 clause looks always true -
* confirm the intended condition.
replace final="drop" if (anonmiss==1 & (sum_authors1!=0) & (l1==1))
* Reject citations with fewer than four words.
replace final="drop" if (wordcount(cite))<4

* Reject when the citation has no ET AL marker, the publication has more than two author
* variables, fewer than 60 percent of the cleaned authors were matched, and the title is
* neither close (dist2>4; a missing dist2 also satisfies this) nor matched on its first 8 words.
replace final="drop" if ((l1==0) & (anonmiss>2)& ((snonmiss/anonmiss2)<0.6)& (dist2>4&check_8wrd==0) )


* Step 2 - accept again pairs without any author hit when the title matched and there is
* supporting evidence: one single check without ET AL; one pairwise flag with ET AL; or,
* with ET AL, a close title of more than 5 words plus one single check.
replace final="" if (rowtotalsa==0 & (l1==0)&(check_title==1)& (check_page==1|check_year==1|check_journal==1|check_vol==1))
replace final="" if ((rowtotalsa==0 )& (l1==1)&(check_title==1) & (t1==1|t2==1|t3==1|t4==1|t5==1|t6==1))
replace final="" if (rowtotalsa==0 & (l1==1)&(check_title==1) & dist2<3 &wct>5 & (check_page==1|check_year==1|check_journal==1|check_vol==1))

* Step 3 - accept pairs with an author hit and strong bibliographic agreement: three of
* page, journal, volume and year, or the strict page check plus one other check.
replace final="" if rowtotalsa>0 &check_page==1 & check_journal==1& check_vol==1
replace final="" if rowtotalsa>0  & check_year==1 & check_page==1& check_vol==1
replace final="" if rowtotalsa>0  & check_page==1 & check_journal==1& check_year==1
replace final="" if rowtotalsa>0  & check_page0==1 & (check_journal==1| check_year==1|check_vol==1)

* Accept pairs with an author hit, a close title of more than 3 words and one pairwise flag.
replace final="" if (rowtotalsa>0 & dist2<3 & (t1==1|t2==1|t3==1|t4==1|t5==1|t6==1) & wct>3 )


* Accept any pair where the strict page check, year, journal and volume all agree.
replace final="" if (check_page0==1&check_year==1&check_journal==1&check_vol==1)


* Two-character page check. For page strings of length 5 or 6, page7 is their first two
* characters; otherwise a placeholder of seven zeros. The citation is searched for those
* characters preceded by a colon (page10) or by a blank (page7 after padding).
gen page7="0000000"
replace page7=substr(page,1,2) if inlist(strlen(page),5,6)
gen page10=":"+page7
replace page7=" "+page7

* check_page2 also requires the year check to have passed and a non-empty page field.
gen check_page2=(check_year==1 & (regexm(cite,page7) | regexm(cite,page10)) & page!="")

* Accept pairs with an author hit, this page check, a title match and a year match.
replace final="" if rowtotalsa>0 & check_page2==1 & check_title==1 & check_year==1

*cases where the focal publication is a response to the original publication - these tend to mismatch
* check: the cleaned title contains REPLYTOEDITOR or ends with one of the listed words.
gen check=( regexm(title2,"REPLYTOEDITOR") ///
	| regexm(title2,"COMMENT$") ///
	| regexm(title2,"COMMENTS$") ///
	| regexm(title2,"REPLY$") ///
	| regexm(title2,"RESPONSE$") ///
	| regexm(title2,"INVITEDCRITIQUE$") ///
	| regexm(title2,"POINTOFVIEW$") ///
	| regexm(title2,"COMMENTARY$") )

* check2: the citation text prepared for author matching (cite2) contains one of the same words.
gen check2=( regexm(cite2,"REPLYTOEDITOR") ///
	| regexm(cite2,"COMMENT") ///
	| regexm(cite2,"COMMENTS") ///
	| regexm(cite2,"REPLY") ///
	| regexm(cite2,"RESPONSE") ///
	| regexm(cite2,"INVITEDCRITIQUE") ///
	| regexm(cite2,"POINTOFVIEW") ///
	| regexm(cite2,"COMMENTARY") )

* Reject response-type publications without an author hit, and those with an author hit
* but neither a page match nor a response word in the citation.
replace final="drop" if check==1 & rowtotalsa==0
replace final="drop" if check==1 & rowtotalsa>0 & check_page==0 & check2==0


* manual is created empty; the PART 2 instructions ask the reviewer to fill it in by hand.
gen manual=.

* final3 is 1 for accepted pairs; max_f is 1 when the patent / NPL reference has at
* least one accepted pair.
gen final3=(final!="drop")
bysort pat_publn_id npl_publn_id: egen max_f=max(final3)
sort cite

* check3: accepted although page (both variants), journal and volume all failed.
gen check3=(final=="" & check_page2==0 & check_page==0 & check_journal==0 & check_vol==0)

* check4: rejected although page, year, journal and volume all passed.
gen check4=(final=="drop" & check_page==1 & check_year==1 & check_journal==1 & check_vol==1)

*two number pages
* check5: rejected pairs with an author hit where only the two-character page check
* passed, together with year, journal and volume.
gen check5=(final=="drop" & rowtotalsa>0 & check_page2==1 & check_page==0 & check_year==1 & check_journal==1 & check_vol==1)

* Put the decision and review columns first and store the file.
sort cite
order final manual check check2 check3 check4 check5, first
save "Output_1_relevant_long_clean", replace

**********************************************************************8
*PART 2: MANUAL CHECKS : A) False Negative B) False Positive
***manually check the NPL reference & matched pub for observations that comply with the restrictions below, and manually change the "final" and "manual" variables accordingly: ********************

* Reload the scored file for cleaned output 1 and order it by citation text so that
* rows with the same citation sit next to each other during the manual review.
use "Output_1_relevant_long_clean", clear
sort cite

*************************** PART A: False Negative CheckS:  in case of a match replace final=""   **************************************
*rowtotalsa>0  & check_page2==1 & check_page==0 & check_year==1 & final=="drop" & (check_journal==1|check_vol==1)& max_f==0  & check==0 

****author matched and very similar title
* max_f==0& check==0 & final=="drop" & check_4wrd==1 &rank>1.5 & manual==.

*(rowtotalsa>0  &dist2<3 & (t1==1|t2==1|t3==1|t4==1|t5==1) & wct>=3 & max_f==0 & check==0 )
*(rowtotalsa>0  &dist2<3 & (check_journal==1|check_vol==1|check_year==1|check_page==1) & wct>3 & max_f==0 & check==0 )
*(rowtotalsa>0  &dist2==0 & (t1==1|t2==1|t3==1|t4==1|t5==1|t6==1|check_page==1|check_page2==1) & wct<4 & max_f==0 &  check==0 )
*( max_f==0& check==0  &dist2<4& rowtotal>0 &wct>3)& manual==.
*( max_f==0& check==0 &dist2<2& rowtotal>0 &wct<=3)& manual==.

****locate short names with matches
*(anonmiss2!=anonmiss  & max_f==0& check==0 & final=="drop" & rowtotal==0&check_4wrd==1&(lcite2==6|lcite2==7))
*anonmiss2!=anonmiss  & max_f==0& check==0 & final=="drop" & rowtotal==0&dist2<4&lcite2>=8&lcite2<15

****only title match exactly:

*dist2==0 & max_f==0& check==0 & final=="drop" &wct>9
* dist2==0 & max_f==0& check==0 & final=="drop" &wct>5 & wct<8
* dist2!=0&dist2<3 & max_f==0& check==0 & final=="drop" &wct>=9


****long titles &author matched:
*max_f==0& check==0 & final=="drop"&check_8wrd==1& rowtotalsa>0&wct>7& (check_jou|check_page0==1|check_vol|check_year==1)
*max_f==0& check==0 & final=="drop"&check_8wrd==1& rowtotalsa>0&wct>7& manual==.
*max_f==0& check==0 & final=="drop"&check_4wrd==1& rowtotalsa>0& (t4==1|t1==1|check_page0==1|check_title==1)& manual==.

*!!!using RANK !!! check rank value threshold based on specific match
*max_f==0& check==0 & final=="drop"&check_4wrd==1& rowtotalsa>0&rank>0.35 &manual==.
*max_f==0& check==0 & final=="drop"&check_4wrd==1& rowtotalsa>0 &rank>0.35 &check_year==1 &manual==.
*max_f==0& check==0 & final=="drop"&dist2<5& rowtotalsa>0 &rank>0.35 &manual==.
*max_f==0& check==0 & final=="drop"&dist2<5& rowtotalsa>0 &rank>0.15 &manual==.
*max_f==0& check==0 & final=="drop"&dist2<10& rowtotalsa>0 &rank>0.2 & total_check>2 &manual==.
*max_f==0& check==0 & final=="drop"&dist2<10& rowtotalsa>0 &rank>0.2 &wct>12 &manual==.

*max_f==0& check==0 & final=="drop" & rowtotalsa>0 &rank>0.40 &manual==.
*max_f==0& check==0 & final=="drop"  &rowtotalsa>0& check_page==1&check_year==1 & rank>0.3 &manual==.
*max_f==0& check==0 & final=="drop"  &rowtotalsa>0& check_page0==1& rank>0.3 &manual==.
*max_f==0& check==0 & final=="drop"  & rank>0.25& rowtotalsa>0 &total_check>2 &manual==.
*max_f==0& check==0 & final=="drop"  & rank>0.35 &total_check>2 &manual==.
*max_f==0& check==0 & final=="drop" & rowtotalsa>0 &(check_page0==1)& (check_vol==1|check_year==1|check_journal==1)&manual==.
*max_f==0& check==0 & final=="drop"& dist2<7&total_check>=2&rowtotalsa>0&wct>3 &manual==.


*max_f==0& check==0 & final=="drop" & rowtotalsa>0& rowtotalsa>0 &check_page==1&(check_vol==1|check_year==1|check_journal==1)&manual==.
*max_f==0& check==0 & final=="drop" & rowtotalsa>0 &check_page0==1 &manual==.

* max_f==0& check==0 & final=="drop" & rowtotalsa>0 &check_page==1&dist2<15 &manual==.
* max_f==0& check==0 & final=="drop" & rowtotalsa>0& wct<4 &rowtotalsa>0 &check_page==1 &manual==.
* max_f==0& check==0 & final=="drop" & rowtotalsa>0& wct<4 &rowtotalsa>0 &(t1==1|t2==1|t3==1|t4==1|t5==1)&manual==.
* max_f==0& check==0 & final=="drop" & rowtotalsa>0& wct<4 &total_check>2 &manual==.
* max_f==0& check==0 & final=="drop"& dist2<=9&total_check>=2&rowtotalsa>0&wct>3&check_vol==1 &manual==.
* max_f==0& check==0 & final=="drop"  & rank>0.25 & rowtotalsa>0  &manual==.
* max_f==0& check==0 & final=="drop"  &rank>0.3 &manual==.
* max_f==0& check==0 & final=="drop"&check_4wrd==1& rowtotalsa>0& check_journal==1&manual==.
* max_f==0& check==0 & final=="drop" & wct<4 &dist2<2 & rowtotal>0 &manual==.
* max_f==0& check==0 & final=="drop" &wct<4 & dist2<2&manual==.
* max_f==0& check==0 & final=="drop"  &manual==.
* max_f==0& check==0 & final=="drop"  &manual==.&dist2<10&rowtotalsa>0&check_4wrd==1
* max_f==0& check==0 & final=="drop" & manual==.&dist2<10&rowtotalsa>0& total_check>0
* max_f==0& check==0 & final=="drop"  &manual==.&dist2<10&check_page2==1
* max_f==0& check==0 & final=="drop"  &manual==.&dist2<10 & rank>0.3
* max_f==0& check==0 & final=="drop"  &manual==.&rowtotalsa>0&dist2<13
* max_f==0& check==0 & final=="drop"  &manual==.&rowtotalsa>0&check_4wrd
* max_f==0& check==0 & final=="drop"  &manual==.&check_4wrd&rowtotalsa==0


*final=="drop"&manual==.&rowtotalsa>0 &check_4wrd==0 & (check_page0==1&(check_year==1|check_journal==1))
*check_page0==0 & final=="drop"&manual==.&rowtotalsa>0 &check_4wrd==1 & check_year==1 &check_page0==0&check_journal==0 &check_title==1 
*final=="drop" &manual==.&rowtotalsa>0 &check_4wrd==0 & (check_page0==0& check_page==1&check_journal==1&check_year==1)


***************************PART B: False Positive checks: in case of a mismatch replace final="drop"
* lcite: length of the citation text in characters
gen lcite= ustrlen(cite)
sort  cite
* wtitle: number of words in the raw publication title
gen wtitle= wordcount(title)

*final==""  &manual==.&rowtotalsa==0&check_4wrd==0&rank<0.3
*final==""  &manual==.&rowtotalsa==0&check_4wrd==0&rank<1.3&check_page==0&check_page2==0
*final==""  &manual==.&rowtotalsa==0&check_4wrd==0&rank<0.3 &wtitle<5
*final==""  &manual==.&rowtotalsa==0&check_4wrd==0&rank<0.3 &wtitle<6
*final==""  &manual==.&rowtotalsa==0&check_4wrd==0&rank<0.2 &wtitle<7
*final==""  & manual==.&rowtotalsa==0&check_4wrd==0 & (check_page==1&check_year==1&check_journal==1)&lcite<=90
*final=="" & check_page0==0 & check_page==0&rowtotalsa==0&rank<2&manual==.&check_page==0&check_page2==0&rowtotalsa==0
*final==""  &manual==.&check_4wrd==0 & rank<0.3 &rowtotalsa>0
*final==""  &manual==.&check_4wrd==0 & rank<0.3 &rowtotalsa>0& wtitle<8 
*final=="" & check_page0==0 & check_page==0&rowtotalsa==0&rank<0.1&manual==.
*final=="" & check_page0==0 & check_page==0&rowtotalsa==0&rank<0.2&manual==.& anonmiss<5
*final=="" & manual==.&rowtotalsa==0&check_4wrd==0 & (check_page==1&check_page0==0&check_year==1)

*manual==.&rowtotalsa>0 &check_4wrd==0 & (check_page0==0& check_page==1&check_journal==1&check_year==1)&rank<0.1
*final==""  &manual==.&check_4wrd==0 & rank<0.6 &rowtotalsa>0
*final==""  &manual==.&check_4wrd==0 & rank<0.7 &rowtotalsa>0

* Store the file again so that the manual edits to final and manual are kept.
save "Output_1_relevant_long_clean", replace

****************************short version

* Build the compact match file: accepted pairs only, with the identifying columns.
* rank, journal and appln_id are not created in this script; they are expected to be
* present in the input data already.
use "Output_1_relevant_long_clean", clear
drop if final=="drop"
keep  cite  title auth journal publn_year j_date  rank  wos_rec pat_publn_id appln_id npl_publn_id publn_nr source vol page issue   
duplicates drop
duplicates drop pat_publn_id npl_publn_id wos_rec publn_nr, force

*cleaning dup in terms of pubid patent (due to similar pat ref text under the same patent - same pub matched to both. keep highest rank)
* NOTE(review): keeping the highest rank relies on duplicates drop retaining the first
* row of each group in the order produced by gsort - confirm.
duplicates list  wos_rec publn_nr
gsort wos_rec publn_nr -rank
duplicates drop wos_rec publn_nr, force

*cleaning dup in terms of npl_publn_id (diff pub matched to NPL-keep best pub match by highest rank)
gsort publn_nr npl_publn_id pat_publn_id -rank
duplicates list publn_nr npl_publn_id pat_publn_id

duplicates drop publn_nr npl_publn_id pat_publn_id, force
* pubid is wos_rec from its fifth character onwards (presumably dropping a four-character prefix - confirm).
gen pubid=substr( wos_rec, 5,.)

save "NPL_output_matched_short_1", replace

