`SplitProbDup` splits an object of class `ProbDup` into two on the
basis of set counts.
SplitProbDup(pdup, splitat = c(30, 30, 30))
| Argument | Description |
|---|---|
| pdup | An object of class `ProbDup`. |
| splitat | A vector of 3 integers indicating the set count at which Fuzzy, Phonetic and Semantic duplicate sets in `pdup` are to be split. |
A list with the divided objects of class `ProbDup`
(`pdup1` and `pdup2`) along with the corresponding lists of
accessions present in each (`list1` and `list2`).
# NOT RUN {
# Example: build a set of probable duplicates and split it by set count.
# NOTE(review): assumes the package that provides GN1000, DataClean, MergeKW,
# MergePrefix, MergeSuffix, KWIC, ProbDup and SplitProbDup is attached
# (presumably PGRdup) -- confirm before running.

# Load PGR passport database
GN <- GN1000

# Specify as a vector the database fields to be used
GNfields <- c("NationalID", "CollNo", "DonorID", "OtherID1", "OtherID2")

# Clean the data
GN[GNfields] <- lapply(GN[GNfields], function(x) DataClean(x))

# y1 is passed to MergeKW, y2 to MergePrefix and y3 to MergeSuffix below
y1 <- list(c("Gujarat", "Dwarf"), c("Castle", "Cary"), c("Small", "Japan"),
           c("Big", "Japan"), c("Mani", "Blanco"), c("Uganda", "Erect"),
           c("Mota", "Company"))
y2 <- c("Dark", "Light", "Small", "Improved", "Punjab", "SAM")
y3 <- c("Local", "Bold", "Cary", "Mutant", "Runner", "Giant", "No.",
        "Bunch", "Peanut")
GN[GNfields] <- lapply(GN[GNfields],
                       function(x) MergeKW(x, y1, delim = c("space", "dash")))
GN[GNfields] <- lapply(GN[GNfields],
                       function(x) MergePrefix(x, y2, delim = c("space", "dash")))
GN[GNfields] <- lapply(GN[GNfields],
                       function(x) MergeSuffix(x, y3, delim = c("space", "dash")))

# Generate KWIC index
GNKWIC <- KWIC(GN, GNfields)

# Specify the exceptions as a vector
exep <- c("A", "B", "BIG", "BOLD", "BUNCH", "C", "COMPANY", "CULTURE",
          "DARK", "E", "EARLY", "EC", "ERECT", "EXOTIC", "FLESH", "GROUNDNUT",
          "GUTHUKAI", "IMPROVED", "K", "KUTHUKADAL", "KUTHUKAI", "LARGE",
          "LIGHT", "LOCAL", "OF", "OVERO", "P", "PEANUT", "PURPLE", "R",
          "RED", "RUNNER", "S1", "SAM", "SMALL", "SPANISH", "TAN", "TYPE",
          "U", "VALENCIA", "VIRGINIA", "WHITE")

# Specify the synsets as a list
syn <- list(c("CHANDRA", "AH114"), c("TG1", "VIKRAM"))

# Fetch probable duplicate sets
GNdup <- ProbDup(kwic1 = GNKWIC, method = "a", excep = exep, fuzzy = TRUE,
                 phonetic = TRUE, encoding = "primary", semantic = TRUE,
                 syn = syn)

# Split the probable duplicate sets
# (splitat gives the set count for the Fuzzy, Phonetic and Semantic sets)
GNdupSplit <- SplitProbDup(GNdup, splitat = c(10, 10, 10))
# }