# Buddhist Sanskrit Segmenter and Lemmatiser functions
# by Ligeia Lugli, licence CC-BY-SA-4.0, funded by the British Academy through a Newton International Fellowship (NF161436)

### CONTENTS:
## Intro
## Load Data
## Segmenter
## Lemmatizer
## Evaluation
## Ngrams extraction functions

## 1. INTRO
# developed under R version 3.6.0
# For background, explanation and examples see the PowerPoint and article associated with this file.
# To use the functions you need (1) a directory containing UTF-8 plain text files with the texts to be processed, (2) a stemmed wordlist, (3) a non-stemmed wordlist, (4) an ngram frequency list and (5) a stem frequency list derived from a corpus comparable to the type of literature you intend to process, and (6) R (https://cran.r-project.org/)
# You can use the wordlists and frequency lists provided in this repository or you can create your own.
# To create your own, see the functions for ngram extraction below.
# To use the ngrams provided, you can choose a small set of ngrams derived from 5 texts: Saddharmapuṇḍarīka, Kāśyapaparivarta, Daśabhūmika, Bodhisattvabhūmi and Abhidharmakośa,
# or you can choose larger ngram sets derived from the GRETIL repository of Buddhist philosophical texts or Buddhist religious texts. The larger the ngram set, the longer the processing time.
# The provided set for stem frequency (ReferenceTokensDF) is derived from the same 5 texts: Saddharmapuṇḍarīka, Kāśyapaparivarta, Daśabhūmika, Bodhisattvabhūmi and Abhidharmakośa
# Warning! this version of the segmenter replaces avagraha with "a" !

#### 2. LOAD DATA
## LOAD REQUIRED DATA
# download the data and store them in your working directory
## the data provided are calibrated for Buddhist sūtras and śāstras from around the I-V centuries CE. It is recommended that you use your own data for other types of literature.
# (uncomment or add lines to load other datasets)
Wordlist <- read.csv("./Lugli_SanskritWordlist2019.csv") # stemmed wordlist derived from Monier Williams, Edgerton and (for verbs only) StarDict
BuddhFoundationalCorpusNgrams <- read.csv("./Lugli_BuddhFoundCorpusNgramsRedux.csv") # lighter dataset based only on a dozen sutras, the Bodhisattvabhūmi and the works of Nāgārjuna, all pre-dating the IV century; performs well on most sūtras and 'lexically conservative' śāstras
# GretilBuddhRelitNgramsRedux <- read.csv("./Lugli_GretilBuddhRelLit_NgramsRedux.csv") # heavier dataset based on the entire 'Buddhist religious literature' section of GRETIL; all periods included, excludes śāstras
# GretilBuddhSastra_NgramsRedux <- read.csv("./Lugli_GretilBuddhSastraSastra_NgramsRedux.csv") # dataset based on the entire 'Buddhist philosophy' section of GRETIL; all periods included, but ONLY śāstras
FiveTextsTokensWithFlagFreq_DF <- read.csv("./Lugli_FiveTextsSegmentedTokensDFWithCleanFreq.csv") # lighter dataset based on only 5 texts: Abhidharmakośa, Bodhisattvabhūmi, Daśabhūmika, Kāśyapaparivarta and Saddharmapuṇḍarīka
NonStemmedWordlist <- read.csv("./Lugli_NonStemmedWordlist.csv")
# WordlistNoA <- read.csv("./Lugli_WordlistNoA_June2019.csv") # to use with diplomatic editions that do not insert avagraha

## REQUIRED PACKAGES
### you need to have the following packages installed before using the Segmenter and Lemmatiser functions:
# readtext
# stringr
# tokenizers
# dplyr
# tidyverse
# rlist
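## A minimal usage sketch (commented out): once the datasets above are loaded and the
## Segmenter function below has been defined, a run over a directory of UTF-8 plain-text
## files might look like this. "./MyTexts" is a hypothetical path; substitute your own
## directory and, if you wish, another ngram set from the alternatives above.
# SegmentedTexts <- Segmenter("./MyTexts", Wordlist, BuddhFoundationalCorpusNgrams, FiveTextsTokensWithFlagFreq_DF)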
### 3. SEGMENTER FUNCTION
# to segment your own texts using the provided data:
# download the provided wordlist, ngrams and stem frequency list, fill in the file paths where needed and run the scripts below

Segmenter <- function(directory, Wordlist, CorpusNgrams, ReferenceTokensDF){
  TextFiles <- dir(directory)
  # load packages required for most operations in this project
  library(readtext)
  library(stringr)
  library(tokenizers)
  library(dplyr)
  library(tidyverse)

  ############### segmentation functions:
  # ngram extraction
  # load my stemmed wordlist and my ngram functions:
  ExtractNgramsTok <- function(text, StemmedWordlist, TextName){ # StemmedWordlist MUST be csv and have 1 col called LEMMATA.
    Text <- text$text
    # TextName <- deparse(substitute(text))
    AllNgramsTok <- tokenize_character_shingles(Text, n=19, n_min=3, strip_non_alphanum = FALSE)
    AllNgramsTable <- table(AllNgramsTok)
    AllNgrams <- as.data.frame(AllNgramsTable)
    colnames(AllNgrams) <- c("ngrams", "freq")
    # write.table(as.data.frame(AllNgrams),file= paste(TextName,"ngram=2-19", sep=""), quote=F,sep=",",row.names=F)
    LemmatisedNgrams <- intersect(AllNgrams$ngrams, StemmedWordlist$LEMMATA)
    #write.table(as.data.frame(LemmatisedNgrams),file= paste(TextName,"ngram=2-19 pseudoLEMMATISED", sep=""), quote=F,sep=",",row.names=F)
    LemmatisedNgramsFreqDF <- AllNgrams[AllNgrams$ngrams %in% LemmatisedNgrams,]
    #write.table(as.data.frame(LemmatisedNgramsFreqDF),file= paste(TextName,"pseudoLemNgramsFreq n=2-19", sep=""), quote=F,sep=",",row.names=F)
    # system("say Done")
    return(LemmatisedNgramsFreqDF)
  }

  CorpusCleaner_ngramPrep <- function(romanisedText, TextName){
    #TextName <- deparse(substitute(romanisedText))
    romanisedText <- str_replace_all(romanisedText, "'", "a")
    romanisedText <- gsub("<.*?>", "", romanisedText)
    romanisedText <- gsub("\\d", "", romanisedText)
    romanisedText <- gsub("\\[", "", romanisedText)
    romanisedText <- gsub("\\]", "", romanisedText)
    romanisedText <- gsub("\\(", "", romanisedText)
    romanisedText <- gsub("\\)", "", romanisedText)
    romanisedText <- gsub("\\{", "", romanisedText)
    romanisedText <- gsub("\\}", "", romanisedText)
    romanisedText <- gsub("\\+", "", romanisedText)
    romanisedText <- gsub("\n", "", romanisedText)
    romanisedText <- gsub("\\-", "", romanisedText)
    romanisedText <- gsub("\\_", "", romanisedText)
    romanisedText <- gsub("\\=", "", romanisedText)
    romanisedText <- gsub("\\|", "", romanisedText)
    romanisedText <- gsub("\\\\", "", romanisedText)
    romanisedText <- gsub("<", "", romanisedText)
    romanisedText <- gsub(">", "", romanisedText)
    romanisedText <- gsub("\\?", "", romanisedText)
    romanisedText <- gsub("\\!", "", romanisedText)
    romanisedText <- gsub("\\#", "", romanisedText)
    romanisedText <- gsub("\\.", "", romanisedText)
    romanisedText <- gsub("\\,", "", romanisedText)
    romanisedText <- gsub("\\:", "", romanisedText)
    romanisedText <- gsub("\\;", "", romanisedText)
    romanisedText <- gsub('"', "", romanisedText)
    romanisedText <- gsub('\\&', "", romanisedText)
    romanisedText <- gsub('\\*', "", romanisedText)
    romanisedText <- gsub('\\$', "", romanisedText)
    romanisedText <- gsub('\\%', "", romanisedText)
    romanisedText <- gsub('\\^', "", romanisedText)
    romanisedText <- gsub('\\@', "", romanisedText)
    romanisedText <- gsub('\\£', "", romanisedText)
    romanisedText <- gsub('\\€', "", romanisedText)
    romanisedText <- gsub('//', "", romanisedText)
    romanisedText <- gsub('/', "", romanisedText)
    write.table(as.data.frame((romanisedText)), file= paste(TextName,"CLEAN_for_Ngrams.txt", sep=""), quote=F, sep=",", row.names=F)
    return(romanisedText)
  }
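  # A minimal illustration of the two functions above (commented out; the file name and
  # object names are hypothetical): ExtractNgramsTok returns the 3- to 19-character
  # shingles of a text that also occur as stems in the wordlist's LEMMATA column,
  # together with their frequencies.
  # MyText <- readtext("./MyTexts/sample.txt")
  # MyTextClean <- CorpusCleaner_ngramPrep(MyText$text, "sample")
  # MyTextNgrams <- ExtractNgramsTok(MyText, Wordlist, "sample")
  # head(MyTextNgrams) # columns: ngrams, freq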
  TextCleanerForTokenization <- function(Text, TextName){
    # Textname <- deparse(substitute(Text))
    Text <- tolower(Text)
    Text <- gsub("eṣ([uv])", "_eṣ\\1_", Text, perl=TRUE)
    Text <- gsub("(<.*?>)", "\\U\\1", Text, perl=TRUE)
    Text <- gsub("'", "a", Text)
    Text <- gsub("’", "a", Text)
    Text <- gsub('"', ' " ', Text)
    Text <- gsub("\\|", "/", Text) # to avoid mis-flagging & interference with regex patterns
    Text <- gsub("\\.", " . ", Text) # to avoid mis-flagging & interference with regex patterns
    Text <- gsub("\\?", " ? ", Text) # to avoid mis-flagging & interference with regex patterns
    Text <- gsub("!", " ! ", Text) # to avoid mis-flagging & interference with regex patterns
    Text <- gsub(",", " , ", Text) # to avoid mis-flagging & interference with regex patterns
    Text <- gsub(":", " : ", Text) # to avoid mis-flagging & interference with regex patterns
    Text <- gsub(";", " ; ", Text) # to avoid mis-flagging & interference with regex patterns
    Text <- gsub("-", " — ", Text) # to avoid mis-flagging & interference with regex patterns
    Text <- gsub("\\[", "", Text)
    Text <- gsub("\\]", "", Text)
    Text <- gsub("\\(", "", Text)
    Text <- gsub("\\)", "", Text)
    Text <- gsub("%", "", Text)
    Text <- gsub("#", "", Text)
    Text <- gsub("\\$", "", Text) # fixed: "$" must be escaped, otherwise gsub matches the end-of-string anchor and removes nothing
    Text <- gsub("\\^", "", Text) # fixed: "^" must be escaped for the same reason
    Text <- gsub("@", "", Text)
    Text <- gsub('"', " % ", Text) # this will avoid problems with " during unsegmentation. Reintroduce " at the end of processing for correct XML!
    #write.table(as.data.frame((Text)),file= paste(Textname,"CLEAN_for_tokenization.txt", sep=""), quote=F,sep=",",row.names=F)
    return(Text)
  }

  # added uṣu separation inside function
  PseudoTokenizer <- function(StreamlidedNgrams_WithCorpusFreq, NgrammedText, TextName){
    # StreamlidedNgrams_WithCorpusFreq is a DF with the ngrams+freq of the text to be tokenised
    # + the freq of those ngrams in the wider context of the text to be tokenised
    # (e.g. if you tokenise a paragraph it will be the chapter it comes from)
    StreamlidedNgrams_WithCorpusFreq <- StreamlidedNgrams_WithCorpusFreq[order(nchar(as.character(StreamlidedNgrams_WithCorpusFreq$ngrams)), decreasing = TRUE), ]
    #print(StreamlidedNgrams_WithCorpusFreq)
    StreamlidedNgrams_WithCorpusFreq$nchar <- nchar(as.character(StreamlidedNgrams_WithCorpusFreq$ngrams)) # prioritises longer ngrams
    StreamlidedNgrams_WithCorpusFreq$nchar <- factor(StreamlidedNgrams_WithCorpusFreq$nchar)
    for (i in levels(StreamlidedNgrams_WithCorpusFreq$nchar)){ # FOR ngrams of equal length
      SameNcharDF <- StreamlidedNgrams_WithCorpusFreq[StreamlidedNgrams_WithCorpusFreq$nchar == i ,]
      SameNcharDF <- SameNcharDF[order(nchar(as.character(SameNcharDF$ngrams)), decreasing = TRUE), ]
      if (nrow(StreamlidedNgrams_WithCorpusFreq[StreamlidedNgrams_WithCorpusFreq$nchar == i ,]) > 1){ # equal nchar and different freq => prioritise higher freq
        OrderedSameNcharDF <- SameNcharDF[order(SameNcharDF$WeightedFreq, decreasing = TRUE),]
        SameNcharDF <- OrderedSameNcharDF
        #print("First For Loop: in this iteration i and SameNchar are: ")
        #print(i)
        #print(SameNcharDF)
      }
      for (f in SameNcharDF$WeightedFreq){ # for ngrams of equal length AND equal freq in the text to be tokenized
        SameNcharDF <- SameNcharDF[order(nchar(as.character(SameNcharDF$ngrams)), decreasing = TRUE), ]
        if (nrow(SameNcharDF[SameNcharDF$WeightedFreq == f ,]) > 1) {
          OrderedSameNcharDF <- SameNcharDF[order(SameNcharDF$WeightedFreq2, decreasing = TRUE),]
          SameNcharDF <- OrderedSameNcharDF
        }
      }
      StreamlidedNgrams_WithCorpusFreq <- rbind(SameNcharDF, StreamlidedNgrams_WithCorpusFreq)
      StreamlidedNgrams_WithCorpusFreq <- unique(StreamlidedNgrams_WithCorpusFreq)
      #print(SameNcharDF)
    }
    #print("** these are the Ngrams used for tokenizing the text **")
    #print(StreamlidedNgrams_WithCorpusFreq)
    #StreamlidedNgrams_WithCorpusFreq <- StreamlidedNgrams_WithCorpusFreq[StreamlidedNgrams_WithCorpusFreq$contextFreq > 0,]
    #Text <- NgrammedText
    Text <- gsub("(<.*?>)", "\\U\\1", NgrammedText, ignore.case=FALSE, perl=TRUE)
    for (i in StreamlidedNgrams_WithCorpusFreq$ngrams){
      TokenisedText <- str_replace_all(Text, i, paste0("-", str_to_upper(i), "-"))
      #print(i)
      #print(TokenisedText)
      # TokenisedText <- str_replace_all(Text, i, paste0("-",i,"-"))
      Text <- TokenisedText
      #print(Text)
    }
    #TextName = deparse(substitute(NgrammedText))
    # write(Text, file= paste0(TextName,"_Tokenized.txt"))
    # system("say Segmented")
    return(Text)
  }
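  # A toy illustration of PseudoTokenizer's greedy replacement (commented out; the ngram
  # values here are hypothetical, not from the provided datasets): because ngrams are
  # applied longest-first and matches are upper-cased, with ngrams c("dharmakāy", "dharm")
  # the string "dharmakāyaḥ" first becomes "-DHARMAKĀY-aḥ", and the shorter "dharm" can
  # no longer match inside the already capitalised stem. Frequency (WeightedFreq, then
  # WeightedFreq2) only breaks ties between ngrams of equal length.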
  ## USE THIS
  TokenizedCleaner1 <- function(tokenisedText, TextName){
    #tokenisedTextname <- deparse(substitute(tokenisedText))
    tokenisedText <- gsub('"śās tra"', '"śāstra"', tokenisedText)
    tokenisedText <- gsub("_", "", tokenisedText) # this is to remove the underscores you introduced to separate eṣu/eṣv
    tokenisedText <- gsub("A-sy-Ā", "-asy-Ā", tokenisedText)
    tokenisedText <- gsub("an-YĀN-([yi])", "-ANY-ān\\1", tokenisedText)
    tokenisedText <- gsub(" cānye ", " c-ĀNY-e ", tokenisedText)
    tokenisedText <- gsub("-YĀN- ", "yān ", tokenisedText)
    tokenisedText <- gsub("ya-DAS-au", "yad asau", tokenisedText)
    tokenisedText <- gsub(" t-VĀM-", " tvām ", tokenisedText)
    tokenisedText <- gsub(" t-VAD-", " tvad ", tokenisedText)
    tokenisedText <- gsub(" t-VAY-", " tvay", tokenisedText)
    tokenisedText <- gsub(" t-VASY-", " tvasy", tokenisedText)
    tokenisedText <- gsub("t-ARA([ṂṄ])GAM-", "-TARA\\1g-am", tokenisedText)
    tokenisedText <- gsub("-([iīa]t)-(V[AĀ]D)-(?![aāe])", "-\\1\\L\\2", tokenisedText, perl=TRUE)
    tokenisedText <- gsub("-AITADAVOCAT-", "aitad -AVOCAT- ", tokenisedText)
    tokenisedText <- gsub("(A?Y)-(o[ḥrśs][ |-])", "-\\L\\1\\2", tokenisedText, perl=TRUE)
    tokenisedText <- gsub("-TAMEN-a", "tam ena", tokenisedText)
    tokenisedText <- gsub("ta-MANY-", "tam-ANY-", tokenisedText)
    tokenisedText <- gsub("dve", "-DV-e", tokenisedText)
    tokenisedText <- gsub("-([ia])-KĀT-", "-\\1kāt", tokenisedText)
    tokenisedText <- gsub("-([ia])-KAMIT-([iyī])", "-\\1kam it\\2", tokenisedText)
    tokenisedText <- gsub("-an-NIT-([iy]) ", "-ann it\\1 ", tokenisedText)
    tokenisedText <- gsub("ṛṣ([ia])", "-ṚṢ-\\1", tokenisedText)
    tokenisedText <- gsub("-v-AṂ([SŚṢ])-", "-vaṃ\\L\\1", tokenisedText, perl=TRUE)
    tokenisedText <- gsub("-STHĀT- ", "-STH-āt ", tokenisedText, perl=TRUE)
    tokenisedText <- gsub("-PADĀRTH-", "-PAD--ĀRTH-", tokenisedText, perl=TRUE)
    tokenisedText <- gsub("-SARVADHARM-", "-SARV-a-DHARM-", tokenisedText, perl=TRUE)
    tokenisedText <- gsub("-NĀM-APAD-", "-NĀM-a-PAD-", tokenisedText)
    tokenisedText <- gsub("--UBH-y", "-ubhy", tokenisedText)
    tokenisedText <- gsub("--ASMI([NṂ])([N])?-", "-asmi\\L\\1\\2", tokenisedText, ignore.case=FALSE, perl=TRUE)
    tokenisedText <- gsub("-EKASMI([NṂ])([N])?-", "-EK-asmi\\L\\1\\2", tokenisedText, ignore.case=FALSE, perl=TRUE)
    tokenisedText <- gsub("([āa])-DHVAS-u", "-\\U\\1DHV-asu", tokenisedText, perl=TRUE) # fixed: "\\U\\1" (was "\\U1") and perl=TRUE, which \\U requires
    tokenisedText <- gsub("Y--ADHV-", "Y-adhv", tokenisedText)
    tokenisedText <- gsub("--MĀN-\\s", "-mān ", tokenisedText)
    tokenisedText <- gsub("([a-z]|[āīūṛḷṇḍñṅḥśṣṭḍṃ])-MĀN-([ |-])", "\\1mān\\2", tokenisedText)
    tokenisedText <- gsub("b-AUDDHAS-", "-BAUDDH-s", tokenisedText)
    tokenisedText <- gsub("-ĀDĪ-", "-ĀD-ī", tokenisedText)
    tokenisedText <- gsub("ā-DAY-([ao])", "-ĀD-ay\\1", tokenisedText)
    tokenisedText <- gsub("a-NAL-p", "an-ALP-", tokenisedText)
    tokenisedText <- gsub("([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])([a-z]|[āīūṛḷṇḍñṅḥśṣṭḍṃ])", "\\1-\\2", tokenisedText)
    tokenisedText <- gsub("nai-VAMIT-", " na-^EVAM-it", tokenisedText)
    tokenisedText <- gsub("na-IVA-", " na-^EVA-", tokenisedText)
    tokenisedText <- gsub(" i-TAD-", " -^ETAD-", tokenisedText)
    tokenisedText <- gsub("-ĀYĀMIT-i", "-āyām iti", tokenisedText)
    tokenisedText <- gsub("-MIT-i", "m iti", tokenisedText)
    tokenisedText <- gsub("sy-ĀMAH-", "-syāmah", tokenisedText)
    tokenisedText <- gsub("-bh-AYA([ṂM])-", "-BHAY-a\\L\\1", tokenisedText, ignore.case=FALSE, perl=TRUE)
    tokenisedText <- gsub("-e-NAM-", " enam ", tokenisedText)
    tokenisedText <- gsub("me-NAM-", "m enam ", tokenisedText)
    tokenisedText <- gsub("E-ṣ([vu])-", " -eṣ\\1-", tokenisedText)
    #tokenisedText <- gsub("--AṂS--", "-aṃs-", tokenisedText) # moved towards the end
    #tokenisedText <- gsub("--AṂŚ--", "-aṃś-", tokenisedText)
    tokenisedText <- gsub("ai-RABH-i", "air-ABHI", tokenisedText)
    tokenisedText <- gsub("ai-REBH-i", "air ebhi", tokenisedText)
    tokenisedText <- gsub("DUHIT--ROR-", "DUHIT-ror ", tokenisedText)
    tokenisedText <- gsub("-era-", "-er a-", tokenisedText)
    tokenisedText <- gsub("CI--DEV-", "CI-d ev", tokenisedText)
    tokenisedText <- gsub("-DEVAN-āga", "-DEV-a-NĀG-a", tokenisedText)
    tokenisedText <- gsub("r-ŪP([ĀAE])(.*?)-", "-RŪP-\\L\\1\\2", tokenisedText, ignore.case=FALSE, perl=TRUE)
    #tokenisedText <- gsub("r-ŪP([ĀAE])(.*?)-", "-RŪP-\\L\\1", tokenisedText, ignore.case=FALSE, perl=TRUE)
    tokenisedText <- gsub("dv-A(.*?)-", "-DV-a\\L\\1", tokenisedText, ignore.case=FALSE, perl=TRUE)
    tokenisedText <- gsub("ga-T(.*?)-", "-GAT-\\L\\1", tokenisedText, ignore.case=FALSE, perl=TRUE)
    tokenisedText <- gsub("-NĀM--ABHI(.*?)-", "-NĀM-abhi\\L\\1", tokenisedText, ignore.case=FALSE, perl=TRUE)
    tokenisedText <- gsub("AŚR-u-TAMAT-", "a-ŚRUT-a-MAT-", tokenisedText)
    tokenisedText <- gsub("SAR-ṣi", " sa-RṢ-i", tokenisedText)
    tokenisedText <- gsub("m-ĀNAM-", " -MAN-am", tokenisedText)
    tokenisedText <- gsub("a-MĀR-ya", "am-ĀRY-a", tokenisedText)
    tokenisedText <- gsub("k-ĀY(.*?)-", "-KĀY-\\L\\1", tokenisedText, ignore.case=FALSE, perl=TRUE)
    tokenisedText <- gsub("m-AṂSY-", "-MAṂSY-", tokenisedText)
    tokenisedText <- gsub("k-ĀRAM-", "-KĀR-am", tokenisedText)
    tokenisedText <- gsub("-?pu-TRAD-ār", "-PUTR-a-DĀR-", tokenisedText)
    tokenisedText <- gsub("-ānā-MEN-ām", "-ānām enām", tokenisedText)
    tokenisedText <- gsub("-amā-NĀN-āmṃ", "-amānānām", tokenisedText)
    tokenisedText <- gsub("ar-CAYAN-t", "-ARCAY-ant", tokenisedText)
    tokenisedText <- gsub("-at-VAM-", "-atvam ", tokenisedText)
    tokenisedText <- gsub("ai-VAM-", "a evam ", tokenisedText)
    tokenisedText <- gsub("-t-VAMIT-i", "-tvam iti", tokenisedText)
    tokenisedText <- gsub("it-YAVY-ay", "iti -AVYAY-", tokenisedText)
    tokenisedText <- gsub("-t-VAMIT-y", "-tvam ity", tokenisedText)
    tokenisedText <- gsub("-KAŚCI--DEV-", "-KAŚCI-d ev", tokenisedText)
    tokenisedText <- gsub("CĀS-ya", "ca asya", tokenisedText)
    tokenisedText <- gsub(" -KULADUHIT--Ṛ", " -KULADUHIT-ṛ", tokenisedText)
    tokenisedText <- gsub("AK-ar-MAṆ-", "-a-KARM-aṇ", tokenisedText)
    tokenisedText <- gsub("(aya|AYA|ay-a|AY-a)--?MĀNAS-", "\\1-mānas", tokenisedText)
    tokenisedText <- gsub("-IṢY-at", "-iṣyat", tokenisedText)
    tokenisedText <- gsub("ṣy-ANTY-", "ṣyanty", tokenisedText)
    tokenisedText <- gsub("ṣ-YATY-", "ṣyaty", tokenisedText)
    tokenisedText <- gsub("--HĪṢYAT-", "-hīṣyat", tokenisedText)
gsub("Y--ANTY-", "Y-anty ", tokenisedText) tokenisedText <- gsub("--ATY-", "-aty ", tokenisedText) tokenisedText <- gsub("-eṣ-VAN--", "-eṣv an-", tokenisedText) tokenisedText <- gsub("--AVY-", "-avy", tokenisedText) tokenisedText <- gsub("v-ĀCAY-", "-VĀCAY-", tokenisedText) tokenisedText <- gsub("-ĀYĀN--", "-āyān ", tokenisedText) tokenisedText <- gsub("--ĀYĀN-", "-āyān ", tokenisedText) tokenisedText <- gsub("--IRY-", "-ir y", tokenisedText) tokenisedText <- gsub("AK-ar-MĀṆ-", "a-KARM-āṇ", tokenisedText) tokenisedText <- gsub("b-UDDHAY-", "-BUDDH-ay", tokenisedText) tokenisedText <- gsub("b-UDDHĀN-", "-BUDDH-ān", tokenisedText) tokenisedText <- gsub("b-UDDHAS-", "-BUDDH-as", tokenisedText) tokenisedText <- gsub("b-ĀLĀN-", "-BĀL-ān", tokenisedText) tokenisedText <- gsub("b-UDDHAV-aca-NAM", "-BUDDH-a-VACAN-am", tokenisedText) tokenisedText <- gsub("da-ŚABH-i", "-DAŚ-abhi", tokenisedText) tokenisedText <- gsub("iya-MAS-ya", "-IYAM- asya", tokenisedText) tokenisedText <- gsub("i-TĪY-aṃ", "it-ĪYAṂ-", tokenisedText) tokenisedText <- gsub("i-TĪY-am", "it-ĪYAM-", tokenisedText) tokenisedText <- gsub("yath-ĀŚAKT-", "-YATH-ā-ŚAKT-", tokenisedText) tokenisedText <- gsub("-MĀTĀPIT--ROR-", "-MĀTĀPIT-ror ", tokenisedText) tokenisedText <- gsub("SAM--BHĀR-", "-SAMBHĀR-", tokenisedText) tokenisedText <- gsub("mo-HAM-", "-MOH-am", tokenisedText) tokenisedText <- gsub("-MIND-r", "m-INDR-", tokenisedText) tokenisedText <- gsub(" -MIND--RASY-", "m-INDR-asy", tokenisedText) tokenisedText <- gsub("-MIND--RAM-", "m-INDR-am", tokenisedText) tokenisedText <- gsub("--ACCHAT-r", "-a-CCHATR-", tokenisedText) tokenisedText <- gsub("ā-MASY-a", "ām asya", tokenisedText) tokenisedText <- gsub("a-MASY-a", "am asya", tokenisedText) tokenisedText <- gsub("-IṢY-([eā])", "-iṣy\\1", tokenisedText) tokenisedText <- gsub("ci-TR", "-CITR", tokenisedText) tokenisedText <- gsub("sa-H(A.?)-", "-SAH-\\L\\1", tokenisedText, ignore.case=FALSE, perl=TRUE) tokenisedText <- gsub("gṛ-H(A.?)-", "-GṚH-\\L\\1", tokenisedText, ignore.case=FALSE, perl=TRUE) tokenisedText <- gsub("V--AṂS-", "V-aṃs", tokenisedText) tokenisedText <- gsub("V--ANN-", "V-ann", tokenisedText) tokenisedText <- gsub("Y--ANN-", "Y-ann", tokenisedText) tokenisedText <- gsub("bha-V", "-BHAV", tokenisedText) tokenisedText <- gsub("-IND--RAM-", "-INDR-am", tokenisedText) tokenisedText <- gsub("ki-MIT-", "-KIM-it", tokenisedText) tokenisedText <- gsub("ji-NĀN-", "-JIN-ān", tokenisedText) tokenisedText <- gsub("j-INĀN-", "-JIN-ān", tokenisedText) tokenisedText <- gsub("-PRĀṆIK-oṭī-", "-PRĀṆ-i-KOṬ-ī", tokenisedText) tokenisedText <- gsub("([TYK])AIR-", "\\L\\1air", tokenisedText, ignore.case=FALSE, perl=TRUE) tokenisedText <- gsub("śu-CIR-", "-ŚUC-ir", tokenisedText) tokenisedText <- gsub("dā-NĀN-", "-DĀN-ān", tokenisedText) tokenisedText <- gsub("pa-RAM-", "-PARAM-", tokenisedText) tokenisedText <- gsub("e-DAM-", "-EDAM-", tokenisedText) tokenisedText <- gsub("iśca", "iś ca", tokenisedText) tokenisedText <- gsub("t-EṢA-", "teṣa", tokenisedText) tokenisedText <- gsub("pa-REṢ-ā", "-PAR-eṣā", tokenisedText) tokenisedText <- gsub("i-(M[AĀ]M)-", "-I\\1-", tokenisedText) tokenisedText <- gsub("k-ṚṢI-", "-KṚṢ-i", tokenisedText) tokenisedText <- gsub("yi-TUM-", "yitum", tokenisedText) tokenisedText <- gsub("--ITAM-", "-itam", tokenisedText) tokenisedText <- gsub("--TAVY-", "-tavy", tokenisedText) tokenisedText <- gsub("T-AVY-", "-tavy", tokenisedText) tokenisedText <- gsub("-YATHĀP-ī-DAM-", "-YATHĀPĪDAM-", tokenisedText) tokenisedText <- gsub("-as-MĀT", "-asmāt ", tokenisedText) 
tokenisedText <- gsub("MIND--REṆ-a", "m-INDR-eṇa", tokenisedText) #tokenisedText <- gsub("sūt-RA", "-SŪTR-a", tokenisedText) tokenisedText <- gsub("([^ā])d-EVA-", "\\1-DEV-a", tokenisedText) tokenisedText <- gsub("([eiī])-BHYA-ś-", "-\\1bhyaś", tokenisedText) tokenisedText <- gsub("-?-UBH-(i[ḥr])", "-ubh\\1", tokenisedText) tokenisedText <- gsub("-BAH--U", "-BAH-u-", tokenisedText) ### tokenisedText <- gsub(" t-V([AĀ])([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])?-", "-TV\\1\\2", tokenisedText) tokenisedText <- gsub("[ |-]t-ARH-", "-TARH-", tokenisedText) # tokenisedText <- gsub("AR-t-HAS", "-ARTH-as", tokenisedText) artha gsub # tokenisedText <- gsub("ĀR-t-HAS", "--ĀRTH-as", tokenisedText) ārtha gsub tokenisedText <- gsub("-([AĀ])YAN-t", "-\\L\\1yant", tokenisedText, perl=TRUE) tokenisedText <- gsub("-DUR--", "-DUR", tokenisedText) tokenisedText <- gsub("(\\s|-)d-EV", "\\1DEV", tokenisedText) tokenisedText <- gsub("SAR--VA", "SARV-a-", tokenisedText) tokenisedText <- gsub("sar-VA", "SARV-a-", tokenisedText) tokenisedText <- gsub("-MAL-p", "m-ALP-", tokenisedText) tokenisedText <- gsub("ā-RABH-", "-ĀRABH-", tokenisedText) tokenisedText <- gsub("h-ITAM-", "-HIT-am", tokenisedText) tokenisedText <- gsub("(ā)?g-ACCH([A-Z]+)-", "-\\U\\1GACCH-\\L\\2", tokenisedText, ignore.case=FALSE, perl=TRUE) # tokenisedText <- gsub("g-ATI(.*?)-", " -GAT-i\\L\\1-", tokenisedText, perl=TRUE) tokenisedText <- gsub("-CIT--", " cit -", tokenisedText) tokenisedText <- gsub("-YĀN-([iuntīū])", " -yān\\1", tokenisedText) tokenisedText <- gsub("-YĀN--", " -yān", tokenisedText) tokenisedText <- gsub("--AVAŚ-", " -a-VAŚ-", tokenisedText) tokenisedText <- gsub("a-YATI-", " -ayati", tokenisedText) tokenisedText <- gsub("(-|\\s)s-U", "\\1SU", tokenisedText) tokenisedText <- gsub("(-|\\s)d-U", "\\1DU", tokenisedText) tokenisedText <-gsub("([aā]s)t(?![aā][mṃnñtd]|ena|eṣā[ṃmñ])([abcdefghlmnoprstāūṛḷṇḍñṅḥśṣṭḍṃ])", "\\1 t\\2", tokenisedText, perl=TRUE) tokenisedText <- gsub("-ĀÑC-a", "-āñ ca", tokenisedText) tokenisedText <- gsub("(-[iaāṃ]?ś)c", "\\1 c", tokenisedText) tokenisedText <- gsub("([iaā]ṃś)c", "\\1 c", tokenisedText) tokenisedText <- gsub("-ĀNTAGRAHAṆ-", "-ĀNT-a-GRAHAṆ-", tokenisedText) tokenisedText <- gsub("-PRAJÑ--ĀJÑĀN-", "-PRAJÑ-ā-JÑĀN-", tokenisedText) tokenisedText <- gsub("-DṚṢṬV-ā", "-DṚṢṬ-vā", tokenisedText) tokenisedText <- gsub("ū-ROR-", "-ŪR-or", tokenisedText) tokenisedText <- gsub("p-ARAH-iṃs", "-PAR-a-HIṂS-", tokenisedText) tokenisedText <- gsub("-MĪT-([iy])", "mi it\\1", tokenisedText) tokenisedText <- gsub("([ĀA]-)cc", "\\1c c", tokenisedText) tokenisedText <- gsub("BHAGAV--ANTAM", "BHAGAV-antam", tokenisedText) tokenisedText <- gsub("([iī])-TAVY-([aāe])", "\\1tavy\\2", tokenisedText) tokenisedText <- gsub("--ŚAY-(an?t)", "Ś-ay\\1", tokenisedText) tokenisedText <- gsub("([^āiīū])nām", "\\1-NĀM-", tokenisedText) tokenisedText <- gsub("([āiīū])-NĀM-", "\\1nām", tokenisedText) tokenisedText <- gsub("nā-MA([TN])-", "-NĀM-a\\L\\1", tokenisedText, perl=TRUE) tokenisedText <- gsub("nāma-MAHĀYĀN", "-NĀM-a-MAHĀYĀN", tokenisedText) tokenisedText <- gsub("ta-MĀH-a", "tam -ĀH-a ", tokenisedText) # tokenisedText <- gsub("-ja(.)(\\s|-[A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "-JA-\\1\\2", tokenisedText) # tokenisedText <- gsub("-aja(.)(\\s|-[A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "-a-JA-\\1\\2",tokenisedText) tokenisedText <- gsub("([āaiīe])bh-YĀM-", "-\\1bhyām ", tokenisedText) tokenisedText <- gsub("-ĀNN-a ", "-ān na ", tokenisedText) tokenisedText <- gsub("apya-", "apy a-", tokenisedText) tokenisedText <- gsub("abhū-VAN-", "a-BHŪ-van", tokenisedText) 
tokenisedText <- gsub("-YĀM-([iī])", "yām\\1", tokenisedText) tokenisedText <- gsub("([aā]ś)c", "\\1 c", tokenisedText) tokenisedText <- gsub("([aā]ś)-C", "\\1 -C", tokenisedText) tokenisedText <- gsub("s-T", "s -T", tokenisedText) tokenisedText <- gsub("k-ṚT(.*?)-", "-KṚT-\\L\\1", tokenisedText, ignore.case=FALSE, perl=TRUE) tokenisedText <- gsub("ci-RAM-", "-CIRA-m", tokenisedText) tokenisedText <- gsub("-X-", " ", tokenisedText) tokenisedText <- gsub("-SAC-ed", "-SACED-", tokenisedText) tokenisedText <- gsub("-ama-", "-am a-", tokenisedText) tokenisedText <- gsub("-ai-RAV-a-", "-air -AVA", tokenisedText) tokenisedText <- gsub("--APAR-", "a-PAR-", tokenisedText) tokenisedText <- gsub("-PRATY--UP([ĀA])", "-PRATYUP\\1", tokenisedText) tokenisedText <- gsub("-SUKHAM--([^Ā])", "-SUKH-am-\\1", tokenisedText) tokenisedText <- gsub("-VADĀM-([iy])", "-VAD-ām\\1", tokenisedText) ##tokenisedText <- gsub("ṣy-ATĪT-i", "ṣyati iti", tokenisedText) tokenisedText <- gsub("hy(āaeouū)", "hy \\1", tokenisedText) tokenisedText <- gsub("-at-VĀD-a", "-atvād a", tokenisedText) tokenisedText <- gsub("kim(iīāaeouū)", "kim \\1", tokenisedText) ##tokenisedText <- gsub("([aā]ṃ)-([sS])([tT])", "\\1\\L\\2-\\3", tokenisedText, ignore.case=FALSE, perl=TRUE) tokenisedText <- gsub("ETADAVOCA([TDN])", "ETAD- -AVOCA\\1", tokenisedText) tokenisedText <- gsub("-IBH-([īyi])", "-ibh\\1", tokenisedText) tokenisedText <- gsub("vi-([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "-VI\\1", tokenisedText) tokenisedText <- gsub("vī-([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "-VĪ\\1", tokenisedText) tokenisedText <- gsub("vy-([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "-VY\\1", tokenisedText) tokenisedText <- gsub("ut-([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "-UT\\1", tokenisedText) tokenisedText <- gsub("ud-([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "-UD\\1", tokenisedText) tokenisedText <- gsub("pra-([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "-PRA\\1", tokenisedText) tokenisedText <- gsub("pr-([AĀE])", "-PR\\1", tokenisedText) tokenisedText <- gsub("([-| ])p-R", "\\1PR", tokenisedText) tokenisedText <- gsub("abhi-([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "-ABHI\\1", tokenisedText) tokenisedText <- gsub("ābhi-([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "-ĀBHI\\1", tokenisedText) tokenisedText <- gsub("abhy-([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "-ABHY\\1", tokenisedText) tokenisedText <- gsub("ābhy-([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "-ĀBHY\\1", tokenisedText) tokenisedText <- gsub("saṃ-([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "-SAṂ\\1", tokenisedText) tokenisedText <- gsub("sam-([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "--SAM\\1", tokenisedText) tokenisedText <- gsub("san-(N)", "-SAN\\1", tokenisedText) tokenisedText <- gsub("-SAM--([AĀIĪOEUŪYM])", "-SAM\\1", tokenisedText) tokenisedText <- gsub("-SAN--N", "-SANN", tokenisedText) tokenisedText <- gsub("anu-([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "-ANU\\1", tokenisedText) tokenisedText <- gsub("ānu-([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "-ĀNU\\1", tokenisedText) tokenisedText <- gsub("anū-([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "-ANŪ\\1", tokenisedText) tokenisedText <- gsub("anv-([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "-ANV\\1", tokenisedText) tokenisedText <- gsub("ānū-([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "-ĀNŪ\\1", tokenisedText) tokenisedText <- gsub("ānv-([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "-ĀNV\\1", tokenisedText) tokenisedText <- gsub("para-([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "-PARA\\1", tokenisedText) tokenisedText <- gsub("par-([ĀOE])", "-PAR\\1", tokenisedText) tokenisedText <- gsub("pari-([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "-PARI\\1", tokenisedText) tokenisedText <- gsub("parī-([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "-PARĪ\\1", tokenisedText) tokenisedText <- gsub("pary-([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "-PARY\\1", tokenisedText) 
tokenisedText <- gsub("upa-([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "-UPA\\1", tokenisedText) tokenisedText <- gsub("ūpa-([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "-ŪPA\\1", tokenisedText) tokenisedText <- gsub("opa-([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "-OPA\\1", tokenisedText) tokenisedText <- gsub("up-([ĀOE])", "-UP\\1", tokenisedText) tokenisedText <- gsub("op-([ĀOE])", "-OP\\1", tokenisedText) tokenisedText <- gsub("ūp-([ĀOE])", "-ŪP\\1", tokenisedText) tokenisedText <- gsub("prati-([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "-PRATI\\1", tokenisedText) tokenisedText <- gsub("pratī-([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "-PRATĪ\\1", tokenisedText) tokenisedText <- gsub("praty-([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "-PRATY\\1", tokenisedText) tokenisedText <- gsub("adhi-([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "-ADHI\\1", tokenisedText) tokenisedText <- gsub("ādhi-([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "-ĀDHI\\1", tokenisedText) tokenisedText <- gsub("adhy-([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "-ADHY\\1", tokenisedText) tokenisedText <- gsub("ādhy-([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "-ĀDHY\\1", tokenisedText) # the following ava substitutions risk interfering with -au endings # tokenisedText <-gsub("([aā])va-((?![ṂN])[A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "-\\U\\1VA\\2", tokenisedText , perl=TRUE) # tokenisedText <-gsub("([aā])v-([OE])", "-\\U\\1V\\2", tokenisedText, perl=TRUE) # tokenisedText <-gsub("([aā])v-Ā([^ṂN])", "-\\U\\1VĀ\\2", tokenisedText, perl=TRUE) # tokenisedText <- gsub("vyā-([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "-VYĀ\\1", tokenisedText) tokenisedText <- gsub("(-āṇ)-Y", "-āṇy ", tokenisedText) #tokenisedText <- gsub("[^VM]-(an?t[eiy]|ot[yi]|as[ie])((!?[ḥmṃsr])[a-z]|[āīūṛḷṇḍñṅśṣṭḍṃ])", "-\\1 \\2", tokenisedText, perl=TRUE) ###tokenisedText <- gsub("([a-z]|[āīūṛḷṇḍñṅḥśṣṭḍṃ])-ĀNY(.)?-([a-z]|[āīūṛḷṇḍñṅḥśṣṭḍṃ])", "\\1-āny \\2\\3", tokenisedText) ###tokenisedText <- gsub("--ĀNY(.)?-([a-z]|[āīūṛḷṇḍñṅḥśṣṭḍṃ])", "-āny \\L\\1\\2", tokenisedText, ignore.case=FALSE, perl=TRUE) tokenisedText <- gsub("--(Ā[NṆ]Y)-(?!atv|e)", "-\\L\\1 \\2", tokenisedText, perl=TRUE) # tokenisedText <- gsub("K-ṛty", "-KṚTY-", tokenisedText) tokenisedText <- gsub("K-ṛt", "-KṚT-", tokenisedText) tokenisedText <-gsub("t-VAS-au", " tv asau ", tokenisedText) tokenisedText <-gsub("-VAS--TRASY-", "-VASTR-asy", tokenisedText) tokenisedText <-gsub("-VAS--TRAPR-a-", "-VASTR-a-PRA", tokenisedText) tokenisedText <-gsub("-DHARMAT--ATT-v", "-DHARMAT-a-TATT-v", tokenisedText) tokenisedText <- gsub("-BH-y", "-bhy", tokenisedText) tokenisedText <- gsub("y-ADY-", " yady ", tokenisedText) #tokenisedText <- gsub("-s-VA", "-SVA", tokenisedText) tokenisedText <- gsub("s-VASY-", "-SV-asy", tokenisedText) # tokenisedText <- gsub("([ |-])s-VA", "\\-SVA", tokenisedText) tokenisedText <- gsub("([ |-])s-VĀ", " -SV--Ā", tokenisedText) tokenisedText <- gsub("sva-", " -SV-a-", tokenisedText) tokenisedText <- gsub("-SVASĀMĀNY-", " -SV-a-SĀMĀNY-", tokenisedText) tokenisedText <- gsub("sv-AYA([MṂ])-", "-SVAY-a\\L\\1 ", tokenisedText, ignore.case=FALSE, perl=TRUE) tokenisedText <- gsub("ca-RAT-", "-CARAT-", tokenisedText) tokenisedText <- gsub("-ĀCIT--TAMAN-", "ā-CITT-a-MAN-", tokenisedText) tokenisedText <- gsub("d-VAU-", "-DV-au", tokenisedText) tokenisedText <- gsub("d-VĀV-", "-DV-āv", tokenisedText) tokenisedText <- gsub("Ś--UDDH(.*?)-", "-ŚUDDH-\\L\\1", tokenisedText, ignore.case=FALSE, perl=TRUE) tokenisedText <- gsub("k-ĀNTAT-", "-KĀNT-at", tokenisedText) tokenisedText <- gsub("-PŪR--VASY-", "-PŪRV-asy", tokenisedText) tokenisedText <- gsub("-ācc", "-āc c", tokenisedText) tokenisedText <- gsub("-ān ām", "-ānām ", tokenisedText) tokenisedText <- gsub("pa-REṢ-u", 
"-PAR-eṣu", tokenisedText) tokenisedText <- gsub("ki-MAT-ra", "kim atra", tokenisedText) tokenisedText <- gsub("-kā-REṆ-", "-KĀR-eṇ", tokenisedText) tokenisedText <- gsub("k-ARTUK-", "-KARTUK-", tokenisedText) tokenisedText <- gsub("s-AMYA-([gk])-", "-SAMYA-\\1", tokenisedText) tokenisedText <- gsub("m-ĀRAS-enā", "-MĀR-a-SEN-ā", tokenisedText) tokenisedText <- gsub("-?n-ĀMAK-āy-", "-NĀM-a-KĀY-", tokenisedText) tokenisedText <- gsub("-MIT-y", "m ity", tokenisedText) tokenisedText <- gsub("-?s-APTA", "-SAPT-a", tokenisedText) tokenisedText <- gsub("--AŚR--AVAŚ", "a-ŚRAV-āś", tokenisedText) tokenisedText <- gsub("T-riṃ-ŚAD-", "-TRIṂŚAD-", tokenisedText) tokenisedText <- gsub("-ā-DĪN-", "-ādīn", tokenisedText) tokenisedText <- gsub("--ĀNĪT--Y", "-ān īty ", tokenisedText) tokenisedText <- gsub("j-ĀNĪT-", "-JĀN-īt", tokenisedText) tokenisedText <- gsub("j-ĀTASY-", "-JĀT-asy", tokenisedText) tokenisedText <- gsub("j-ĀTIJAR-", "-JĀT-i-JAR-", tokenisedText) tokenisedText <- gsub("j-ĀTIR-", "-JĀT-ir ", tokenisedText) tokenisedText <- gsub("itīmāny", "it^īmāny ", tokenisedText) tokenisedText <- gsub("g-URUR-", "-GUR-ur", tokenisedText) tokenisedText <- gsub("gaṃ-BHĪR-", "-GAMBHĪR-", tokenisedText) #tokenisedText <- gsub("Y-og", "-YOG-", tokenisedText) tokenisedText <- gsub("-DEVĀNĀMINDR-", "-DEV-ānām-INDR-", tokenisedText) tokenisedText <- gsub("-JĪVA-", "-JĪV-a", tokenisedText) tokenisedText <- gsub("d-VAYASY-", "-DVAY-asy", tokenisedText) tokenisedText <- gsub("-CIRASY-", "-CIR-asy", tokenisedText) tokenisedText <- gsub("([ |-])m-ĀRASY-", "\\1MAR-asy", tokenisedText) tokenisedText <- gsub("j-ĀLASY-", "-JĀL-asy", tokenisedText) tokenisedText <- gsub("mada([mṃ])", "-MAD-a\\1", tokenisedText) tokenisedText <- gsub("madasy", "-MAD-asy", tokenisedText) tokenisedText <- gsub("maden", "-MAD-en", tokenisedText) tokenisedText <- gsub("-VAN-([^ae])", "van\\1", tokenisedText) tokenisedText <- gsub("namo", "-NAM-o", tokenisedText) tokenisedText <- gsub("namas", "-NAM-as", tokenisedText) tokenisedText <- gsub("a-NAY-ā", "anayā", tokenisedText) tokenisedText <- gsub("a-NAY-o([rḥ])", "anay\\1", tokenisedText) tokenisedText <- gsub("-a-MAS-a-MAM-", "am a-SAM-am ", tokenisedText) tokenisedText <- gsub("sa-MAM-", "-SAM-am ", tokenisedText) tokenisedText <- gsub("ra-MAM-", "-RAM-am ", tokenisedText) tokenisedText <- gsub("e-VĀSY-a", "ev^āsya ", tokenisedText) tokenisedText <- gsub("de-VAIR-", "-DEV-air ", tokenisedText) tokenisedText <- gsub("e-VAMAN-y", "-EVAM--ANY- ", tokenisedText) tokenisedText <- gsub("e-VĀSAN-e", "ev-ĀSAN-e ", tokenisedText) tokenisedText <- gsub("e-VĀSAN-e", "ev-ĀSAN-e ", tokenisedText) tokenisedText <- gsub("-AYAMEV-a", "-AYAM--EVA-", tokenisedText) tokenisedText <- gsub("-AYAMEV--", "-AYAM-ev-", tokenisedText) tokenisedText <- gsub("-AYAMEV-([^-a])", "-AYAM-ev\\1", tokenisedText) tokenisedText <- gsub("e-VAMAN-y", "-EVAM--ANY-", tokenisedText) tokenisedText <- gsub("e-VAMAN--", "-EVAM--an-", tokenisedText) tokenisedText <- gsub("de-VAKUL-", "-DEV-a-KUL-", tokenisedText) tokenisedText <- gsub("al-PĀN-", "-ALP-ān", tokenisedText) tokenisedText <- gsub("-ĀNAMIT-([iīy])", "ānam it\\1", tokenisedText) tokenisedText <- gsub("a-STH-([iīy])", "-ASTH-\\1", tokenisedText) tokenisedText <- gsub("tāy([īi])", "-TĀY-\\1", tokenisedText) tokenisedText <- gsub("de-YAVAS-tu", "-DEY-a-VAST-u", tokenisedText) tokenisedText <- gsub("yā-CAT-([iey])", "-YĀC-at\\1", tokenisedText) tokenisedText <- gsub("-NĀN-y", "n-ĀNY-", tokenisedText) tokenisedText <- gsub("t-VAY-", "tvay", tokenisedText) tokenisedText <- 
gsub("ga-MIṢ-", "-GAM-iṣ", tokenisedText) tokenisedText <- gsub("t-ĀVAC-c", "-TĀVAC-c", tokenisedText) tokenisedText <- gsub("y-ĀVAC-c", "-YĀVAC-c", tokenisedText) tokenisedText <- gsub("ta-TRĀYA-ṃ", "tatr-ĀYAṂ-", tokenisedText) tokenisedText <- gsub("d-VIVIDH-", "-DVIVIDH-", tokenisedText) tokenisedText <- gsub("ānt-AÑC-", "āntañc", tokenisedText) tokenisedText <- gsub(" t-AÑC-", " tañc", tokenisedText) tokenisedText <- gsub("et-AÑC-", "-etañc", tokenisedText) tokenisedText <- gsub("de-YAVAS-tu", "-DeY-a-VAS-tu", tokenisedText) tokenisedText <- gsub("-a-PI-", "-API-", tokenisedText) tokenisedText <- gsub("-a-NY-", "-ANY-", tokenisedText) tokenisedText <- gsub("k-ARMAK-ā-REṆ", "-KARMAKĀR-eṇ", tokenisedText) tokenisedText <- gsub("-a-NN-", "-ann ", tokenisedText) tokenisedText <- gsub("-a-ṢṬ-a-", "-AṢṬ-a-", tokenisedText) tokenisedText <- gsub("-PHALAS-ya", "-PHAL-asya", tokenisedText) tokenisedText <- gsub("ityebhi", "ity ebhi", tokenisedText) tokenisedText <- gsub("ityebhi", "ity ebhi", tokenisedText) tokenisedText <- gsub("e-VAÑC-a", "-EVAÑ-ca", tokenisedText) tokenisedText <- gsub("dā-NAM-", "-DĀN-am", tokenisedText) tokenisedText <- gsub("dā-NAS-y", "-DĀN-asy", tokenisedText) tokenisedText <- gsub("ub-HAU-", "-UBH-au", tokenisedText) tokenisedText <- gsub("s-YĀDA--", "syād a-", tokenisedText) tokenisedText <- gsub("s-YĀN-", "syān ", tokenisedText) tokenisedText <- gsub("vīr-YAMIT-y","-VĪRY-am ity", tokenisedText) tokenisedText <- gsub("d-ĀNAS-ya","-DĀN-asya", tokenisedText) tokenisedText <- gsub("it-YEṢA-","ity eṣa", tokenisedText) tokenisedText <- gsub("ra-KṢAT-","-RAKṢAT-", tokenisedText) tokenisedText <- gsub("pa-REBH-y", "-PAR-ebhy", tokenisedText) tokenisedText <- gsub("śi-KṢ(.*?)-", "-ŚIKṢ-\\L\\1", tokenisedText, ignore.case=FALSE, perl=TRUE) #tokenisedText <- gsub("--A", "-a-", tokenisedText) tokenisedText <- gsub("ta-DASY-", "tad asya", tokenisedText) tokenisedText <- gsub("y-ANN-a", "yan na", tokenisedText) tokenisedText <- gsub("itī-MĀNY-", "itīmāny", tokenisedText) tokenisedText <- gsub("ce-MĀNY-", "cemāny", tokenisedText) tokenisedText <- gsub("he-TUBH-", "-HETU-bh", tokenisedText) tokenisedText <- gsub("iya-MAS-y", "iyam asy", tokenisedText) tokenisedText <- gsub("ne-CCHATI-", "n-ECCH-ati", tokenisedText) tokenisedText <- gsub("ny-ĀYAṂ-", "-NYĀY-aṃ", tokenisedText) tokenisedText <- gsub("ny-ĀYAM-", "-NYĀY-am", tokenisedText) tokenisedText <- gsub("r-ĀJÑĀ-", "-RĀJ-ñā", tokenisedText) tokenisedText <- gsub("lo-KAM-", "-LOKA-m", tokenisedText) tokenisedText <- gsub("as-MIN-n ", "asminn ", tokenisedText) tokenisedText <- gsub("as-MIN-", "asmin ", tokenisedText) tokenisedText <- gsub("kā-MAS-ya", "-KĀM-asya", tokenisedText) tokenisedText <- gsub("AK-ā-MAS-ya", "-a-KĀM-asya", tokenisedText) tokenisedText <- gsub("AK-ā-MAS-ya", "-a-KĀM-asya", tokenisedText) tokenisedText <- gsub("IK-ā-MAS-ya", "-i-KĀM-asya", tokenisedText) tokenisedText <- gsub("dā-NĀD-īn", "-DĀN--adīn", tokenisedText) tokenisedText <- gsub("ma-MET-i", "-MAM-eti ", tokenisedText) tokenisedText <- gsub("ma-MET-y", "-MAM-ety ", tokenisedText) tokenisedText <- gsub("va-HAT-", "-VAH-at", tokenisedText) tokenisedText <- gsub("-āś-AYAṂ-", "--ĀŚAYAṂ-", tokenisedText) tokenisedText <- gsub("-āś-AYAM-", "--ĀŚAYAM-", tokenisedText) tokenisedText <- gsub("-GUṆAY--UKT-", "-GUṆ-a-YUKT-", tokenisedText) tokenisedText <- gsub("s-ARVĀK-ār", "-SARV--ĀKĀR-", tokenisedText) tokenisedText <- gsub("-TIS--ṚBH-", "-TISṚ-bh", tokenisedText) tokenisedText <- gsub("d-ĀSĪD-ā-SAKARMAK-ar", "-DĀS-ī-DĀS-a-KARM-a-KAR-", tokenisedText) 
tokenisedText <- gsub("he-TAVA-", "-HET-ava", tokenisedText) tokenisedText <- gsub("-TATKAR-m", "tat-KARM-", tokenisedText) tokenisedText <- gsub("-SARVAT--ĪRT-hy", "-SARV-a-TĪRTHY-", tokenisedText) tokenisedText <- gsub("iya-MEṢ-āṃ ", "iyam eṣāṃ ", tokenisedText) tokenisedText <- gsub("pa-RASY-a", "-PAR-asya", tokenisedText) tokenisedText <- gsub("t-ATT-eṣā", "tat teṣā", tokenisedText) tokenisedText <- gsub("tyā-JAYAT-", "-TYĀJ-ayat", tokenisedText) tokenisedText <- gsub("-YĀS--MIN-", " yāsmin ", tokenisedText) tokenisedText <- gsub("d-VIRAP-i", "-DV-ir-API-", tokenisedText) tokenisedText <- gsub("go-TRAS-a([ṃm])-", "-GOTR-a-SA\\1", tokenisedText) tokenisedText <- gsub("ku-LASA-ṃ-P", "-KUL-a-SAMP", tokenisedText) tokenisedText <- gsub("ma-RAṆA", "-MARAṆ-a-", tokenisedText) tokenisedText <- gsub("pa-DĀN", "-PAD-ān", tokenisedText) tokenisedText <- gsub("pa-RAÑC-a", "-PAR-añ ca", tokenisedText) tokenisedText <- gsub("ś-ĀSTĀR-", "-ŚĀST-ār", tokenisedText) tokenisedText <- gsub("ya-DEV-a", "yad eva", tokenisedText) tokenisedText <- gsub("y-AÑC-a", "yañ ca", tokenisedText) tokenisedText <- gsub("-VAÑC-a ", "-vañ ca", tokenisedText) tokenisedText <- gsub("-ānt ", "-vañ ca", tokenisedText) tokenisedText <- gsub("-ānā-MADH-i", "-ānām adhi", tokenisedText) tokenisedText <- gsub("c-ĀRIṆ-", "-CĀR-iṇ", tokenisedText) tokenisedText <- gsub("as-MĀD", " asmād ", tokenisedText) tokenisedText <- gsub("-KĀY-avāṅ-MANASK-ar-MAṆ-", "-KĀY-a-VĀṄ--MANAS--KARMAṆ-", tokenisedText) tokenisedText <- gsub("v-ĀCAM-", "-VĀC-am ", tokenisedText) tokenisedText <- gsub("s-ARVĀK-ā-REṆ-", "-SARV--ĀKĀR-eṇ", tokenisedText) tokenisedText <- gsub("-VIDHAM-", "-VIDH-am", tokenisedText) tokenisedText <- gsub("-VIDHĀN-", "-VIDH-ān", tokenisedText) tokenisedText <- gsub("-YĀNAT--RAY-", "-YĀN-a-TRAY-", tokenisedText) tokenisedText <- gsub("-SAṂS--RAY-", "-SAṂSRAY-", tokenisedText) tokenisedText <- gsub("e-MĀNI-", " emāni ", tokenisedText) tokenisedText <- gsub("i-MĀNI-", " imāni ", tokenisedText) tokenisedText <- gsub("l-ĀBHINA-", "-LĀBH-ina", tokenisedText) tokenisedText <- gsub("-yamā-NA([ŚM])-", "-yamāna\\1 ", tokenisedText) tokenisedText <- gsub("Y-amā-NA([ŚM])-", "-yamānaś\\1 ", tokenisedText) tokenisedText <- gsub("mā-TRAK-", "-MĀTRAK-", tokenisedText) tokenisedText <- gsub("mā-TRAK-", "-MĀTRAK-", tokenisedText) tokenisedText <- gsub("mā-TRAY-", "-MĀTR-ay", tokenisedText) tokenisedText <- gsub("mā-TRASY-", "-MĀTR-asy", tokenisedText) tokenisedText <- gsub("mā-NABH-", "-MĀN-abh", tokenisedText) tokenisedText <- gsub("da-DĀT-i", "-DAD-āti", tokenisedText) #reconsider this if you want to separate redulplicaiton from stem tokenisedText <- gsub("tasya-IVA-", "tasy-AIVA-", tokenisedText) tokenisedText <- gsub("-BHAVANT-([iy])", "-BHAV-ant\\1 ", tokenisedText) tokenisedText <- gsub("-SAM-āsa ", "-SAMĀS-a ", tokenisedText) tokenisedText <- gsub("-SAM-āsa- ", "-SAMĀS-a-", tokenisedText) tokenisedText <- gsub("-SAM-ā-SĀRTH", "-SAMĀS--ĀRTH", tokenisedText) tokenisedText <- gsub("d-VYĀKĀR-", "-DV-y-ĀKĀR-", tokenisedText) tokenisedText <- gsub("da-YĀCIT-t", "-DAY-ā-CITT-", tokenisedText) tokenisedText <- gsub("dā-YĀDA-", "-DAYĀD-a", tokenisedText) tokenisedText <- gsub("da-ŚABH-", "-DAŚ-abh", tokenisedText) tokenisedText <- gsub("iyam-", "iyam ", tokenisedText) tokenisedText <- gsub("-YASMĀ-d-", "yasmād ", tokenisedText) tokenisedText <- gsub("-YASMĀ-d-", "yasmād ", tokenisedText) tokenisedText <- gsub("go-TRAST-h", "-GOTR-a-STH-", tokenisedText) tokenisedText <- gsub("go-TRAST-h", "-GOTR-a-STH-", tokenisedText) tokenisedText <- 
gsub("a-IVA-", "-AIVA-", tokenisedText) tokenisedText <- gsub("-STEN-", "s ten", tokenisedText) tokenisedText <- gsub("-BHAVAT-([iy])", "-BHAV-at\\1", tokenisedText) tokenisedText <- gsub("itya-", "ity a", tokenisedText) tokenisedText <- gsub("kā-NICIT-", " kānicit ", tokenisedText) tokenisedText <- gsub("m-ADHYASTH-ān", "-MADHY-a-STHĀN-", tokenisedText) tokenisedText <- gsub("m-ADHYASTH-", "-MADHY-a-STH-", tokenisedText) tokenisedText <- gsub("k-AÑCIT-", " kañcit ", tokenisedText) tokenisedText <- gsub("kā-MA(.*?)-", "-KĀM-a\\L\\1", tokenisedText, ignore.case=FALSE, perl=TRUE) tokenisedText <- gsub("-MAS-ya", "m asya ", tokenisedText) tokenisedText <- gsub("-a-TĪT-([iy])", "at^īt\\1", tokenisedText) tokenisedText <- gsub("dā-NAMAN-", "-DĀN-am an", tokenisedText) tokenisedText <- gsub("dā-NĀDIK-", "-DĀN--ĀDIK-", tokenisedText) tokenisedText <- gsub("dā-NĀD-", "-DĀN--ād", tokenisedText) tokenisedText <- gsub("j-ĀTAMĀ--TRASY-", "-JĀT-a-MĀTR-asy", tokenisedText) tokenisedText <- gsub("j-ANYAT-āyo", "-JANYAT-āyo", tokenisedText) tokenisedText <- gsub("j-IHA-vāṃ", "-JIHA-vāṃ", tokenisedText) tokenisedText <- gsub("j-ĀNĪY-", "-JĀNĪY-", tokenisedText) tokenisedText <- gsub("j-ANYAK--ĀYACIT-ta", "-JANY-a-KĀY-a-CITT-a", tokenisedText) tokenisedText <- gsub("j-ĀTIJAN--MANA-ḥ ", "-JĀTI-JAN-manaḥ ", tokenisedText) tokenisedText <- gsub("j-ĀTIDRAV-ya", "-JĀT-i-DRAVY-a", tokenisedText) tokenisedText <- gsub("j-ĀYAT-e", "-JĀY-ate", tokenisedText) tokenisedText <- gsub("j-ALAJ-air ", "-JALA--jair ", tokenisedText) tokenisedText <- gsub("j-ĀLĀN-i", " -JĀL-āni", tokenisedText) tokenisedText <- gsub("j-ĀLAYAN-tra", "-JĀL-a-YANTR-a", tokenisedText) tokenisedText <- gsub("-AKLĀNTAK--ĀYACIT-t", "-AKLĀNT-a-KĀY-a-CITT-", tokenisedText) tokenisedText <- gsub("k-ARH-i", " karhi ", tokenisedText) tokenisedText <- gsub("k-ĀMAK-", "-KĀM-ak", tokenisedText) tokenisedText <- gsub("k-ALPAK-oṭ", "-KALP-a-KOṬ-", tokenisedText) tokenisedText <- gsub("-KṚṢṆAŚ-uk-LASA-", "-KṚṢṆ-a-ŚUKL-asa", tokenisedText) tokenisedText <- gsub("k-ĀNTIM-", "-LĀNT-im", tokenisedText) tokenisedText <- gsub("k-ALPAK-", "-KALP-ak", tokenisedText) tokenisedText <- gsub("KARMAS-thā-NĀN", "KARMASTHĀN-ān", tokenisedText) tokenisedText <- gsub("tasyāṅk-URASY--", "tasy-AṄKUR-asy", tokenisedText) tokenisedText <- gsub("k-ARMAK-ar", "-KARM-a-KAR-", tokenisedText) tokenisedText <- gsub("k-ṚCCH-at-", "-KṚCCHAT-", tokenisedText) tokenisedText <- gsub("a-KĀL-a-S-evī", "ak-ĀLAS-evī", tokenisedText) tokenisedText <- gsub("ek-ĀGR-", "-EKĀGR-", tokenisedText) tokenisedText <- gsub("k-ĀRYĀṆ-", "-KĀRY-āṇ", tokenisedText) tokenisedText <- gsub("k-ATYĀKĀR-", "-KATY-ĀKĀR-", tokenisedText) tokenisedText <- gsub("ek-ĀNN-a-VIṂŚAT", "-EKĀNNAVIṂŚAT", tokenisedText) tokenisedText <- gsub("yuk-TAPAR-i-HĀS-", "-YUKT-a-PARIHĀS-", tokenisedText) tokenisedText <- gsub("-LAU-k-ĀNUGRAH-a", "-LAUK--ĀNUGRAH-a", tokenisedText) tokenisedText <- gsub("SAMPATT-ik-RAL--PAN-ā", "SAMPATT-i-KR--ALPAN-ā", tokenisedText) tokenisedText <- gsub("yuk-TAMAK-he-DAMAY-āc", "-YUKT-am-KHED-a-MAY-āc", tokenisedText) tokenisedText <- gsub("yathok-TĀY-", "-YATHOKT-āy", tokenisedText) tokenisedText <- gsub("k-ĀRYAK--RIY-", "-KĀRYAKRIY-", tokenisedText) tokenisedText <- gsub("-SARVAL-auk-Ā", "-SARV-a-LAUK--Ā", tokenisedText) tokenisedText <- gsub("-SARVAŚ-ak-RATV-", "-SARV-a-ŚAKR-atv", tokenisedText) tokenisedText <- gsub("k-ĀRASY-a", "-KĀR-asya", tokenisedText) tokenisedText <- gsub("suśuk-LADAN-t", "su-ŚUKL-a-DANT-", tokenisedText) tokenisedText <- gsub("-DEVAL-ok-", "-DEV-a-LOK-", tokenisedText) 
tokenisedText <- gsub("-āṅk-APĀD", "-ĀṄK-a-PĀD", tokenisedText) tokenisedText <- gsub("-āṅk-APĀD", "-ĀṄK-a-PĀD", tokenisedText) tokenisedText <- gsub("p-ARVA-", "-PARVA-", tokenisedText) tokenisedText <- gsub("-SARVAD-uḥkh-", "-SARV-a-DUḤKH-", tokenisedText) tokenisedText <- gsub("SAT--KUR-yurg-URŪK-uryur--", "-SATKUR-yur-GUR-u-KUR-yur", tokenisedText) tokenisedText <- gsub("a-RVĀ-g-VIDAR--ŚAṂY-ati", "-ARVĀG--VIDARŚAY-ati", tokenisedText) tokenisedText <- gsub("ag-RĀṆ-", "-AGR-āṇ", tokenisedText) tokenisedText <- gsub("g-URŪK-", "-GUR-ūk", tokenisedText) tokenisedText <- gsub("ch-ĀYĀS-", "-CHĀY-ās", tokenisedText) tokenisedText <- gsub("jñ-ĀNAPH-al", "-JÑĀN-a-PHAL-", tokenisedText) tokenisedText <- gsub("jñ-EYASY-", "-JÑEY-asy", tokenisedText) tokenisedText <- gsub("jñ-ĀTIVṚDDH-", "-JÑĀT-i-VṚDDH-", tokenisedText) tokenisedText <- gsub("jñ-ĀNAMAY-", "-JÑĀNAMAY-", tokenisedText) tokenisedText <- gsub("it-YAY-añ-", "it-YAYAÑ-", tokenisedText) tokenisedText <- gsub("cod-ĀRABH-og-", "c-ODĀR-a-BHOG-", tokenisedText) tokenisedText <- gsub("d-ĀSĀD-([iy])", "-DĀS--ĀD-\\1", tokenisedText) tokenisedText <- gsub("d-ĀNAMAY-", "-DĀNAMAY-", tokenisedText) tokenisedText <- gsub("od-AYA([MṂ])-", "-ODAY-a\\L\\1", tokenisedText, perl = TRUE) tokenisedText <- gsub("d-VĀV-", "-DV-āv", tokenisedText) tokenisedText <- gsub("od-ĀRASY-a", "-ODĀR-asya", tokenisedText) tokenisedText <- gsub("od-ĀRASY-a", "-ODĀR-asya", tokenisedText) tokenisedText <- gsub("-īñc", "-īñ c", tokenisedText) tokenisedText <- gsub("-([āa])ñce([dt])", "-\\1ñ ce\\2", tokenisedText) tokenisedText <- gsub("d-VIDH", "-DVIDH", tokenisedText) tokenisedText <- gsub("-PARAR-d-DHY-", "-PAR-a-RDDH-y", tokenisedText) tokenisedText <- gsub("d-URVAR-ṇa", "dur-VARṆ-a", tokenisedText) tokenisedText <- gsub("d-URVAR-ṇa", "dur-VARṆ-a", tokenisedText) tokenisedText <- gsub("-SARVAR-dd-HIM-at", "-SARV-a-RDDHIM-at", tokenisedText) tokenisedText <- gsub("ṛd-DHYĀT-ad-", "-ṚDDHY-ā tad-", tokenisedText) tokenisedText <- gsub("-VAST-u-NAST-ad-", "-VAST-unas tad-", tokenisedText) tokenisedText <- gsub("vid-YĀBH-", "-VIDHY-ābh", tokenisedText) tokenisedText <- gsub("vid-YĀBH-", "-VIDY-ābh", tokenisedText) tokenisedText <- gsub("vid-YĀDA-y", "-VIDY-ā-DAY-", tokenisedText) tokenisedText <- gsub("-ai-RAV-id-YĀDA-y", "-air a-VIDY-ā-DAY-", tokenisedText) tokenisedText <- gsub("od-DIŚAT-([iy])", "-ODDIŚ-at\\1", tokenisedText) tokenisedText <- gsub("ā-DASM-ād-", "ād asmād-", tokenisedText) tokenisedText <- gsub("ā-DASM-ād-", "ād asmād-", tokenisedText) tokenisedText <- gsub("d-ĀTĀR-", "-DĀTĀR-", tokenisedText) tokenisedText <- gsub("d-ĀPAY-", "-DĀPAY-", tokenisedText) tokenisedText <- gsub("d-ĀNĀY-", "-DĀNĀY-", tokenisedText) tokenisedText <- gsub("ś-ĪLAV-a", "-ŚĪL-ava", tokenisedText) tokenisedText <- gsub("d-ĀNAPH-al", "-DĀN-a-PHAL-", tokenisedText) tokenisedText <- gsub("a-MASM-ād-", "am asmād-", tokenisedText) tokenisedText <- gsub("d-ĀNAMIT-", "-DĀN-am it-", tokenisedText) tokenisedText <- gsub("d-ĀNAS-y", "-DĀN-asy", tokenisedText) tokenisedText <- gsub("s-AMYA-ṅ-", "-SAMYA-ṅ", tokenisedText) tokenisedText <- gsub("d-AṆḌAK-arm-", "-DAṆḌ-a-KARM-", tokenisedText) tokenisedText <- gsub("ā-MAS-ad-VĀC-aṃ", "-ām a-SAD-VĀC-aṃ", tokenisedText) tokenisedText <- gsub("d-AUR-ma-SYAM-", "-DAURMASY-am", tokenisedText) tokenisedText <- gsub("d-VIRAP-y-", "-DVI-r-APY-", tokenisedText) tokenisedText <- gsub("ṃd-IVA-m-", "ṃ-DIV-am-", tokenisedText) tokenisedText <- gsub("d-AṆḌ-", "-DAṆḌ-", tokenisedText) tokenisedText <- gsub("-ad-VYAP-a-ROPAY-", "-a-DV-y-APROPAY-", tokenisedText) 
tokenisedText <- gsub("-a-NYAD-d-ĀNAS-aha-", "-anya-DĀN-a-SAHA", tokenisedText) tokenisedText <- gsub("d-ĀSĪD-ā-SAKARMA-", "-DĀS-ī-DĀS-a-KARMA-", tokenisedText) tokenisedText <- gsub("-JĀTIK-ṣaṇ", "-JĀT-i-KṢAṆ-", tokenisedText) tokenisedText <- gsub("od-ĀRĀM-", "-ODĀR-ām", tokenisedText) tokenisedText <- gsub("as-YĀS-ad-VITARK-", "asyā-SAD--VITARK-", tokenisedText) tokenisedText <- gsub("-SUMA-had-", "su-MAH-ad", tokenisedText) tokenisedText <- gsub("-ā-NĀMANTIK-", "-ānām-ANTIK-", tokenisedText) tokenisedText <- gsub("d-AŚAG-uṇ", "-DAŚ-a-GUṆ-", tokenisedText) tokenisedText <- gsub("ād-ĀNAT--YĀG-", "-ĀDĀN-a-TYĀG-", tokenisedText) tokenisedText <- gsub("-CCHAN-d", "-CCHAND-", tokenisedText) tokenisedText <- gsub("-SARVAR-d-DHIK--ĀRY-", "-SARVA-RDDH-i-KĀRY-", tokenisedText) tokenisedText <- gsub("-SAV-id", "-SAVID-", tokenisedText) tokenisedText <- gsub("d-OṢASY-", "-DOṢ-asy", tokenisedText) tokenisedText <- gsub("-UPĀ-d-ĀNAMIT-", "--UPĀDĀN-am it", tokenisedText) tokenisedText <- gsub("d-AUṢṬH-alya", "-DAUṢṬHALY-a", tokenisedText) tokenisedText <- gsub("-UPĀ-d-ĀNAMIT-", "-UPĀDĀN-am it", tokenisedText) tokenisedText <- gsub("d-ĀNĀDAY-", "-DĀN-ĀDAY-", tokenisedText) tokenisedText <- gsub("d-ŪRAŚ-", "-DŪRAŚ-", tokenisedText) tokenisedText <- gsub("d-ANTAT-ā", "-DANT-atā", tokenisedText) tokenisedText <- gsub("-ā-DHIK-ār", "-ĀDHIKĀR-", tokenisedText) tokenisedText <- gsub("-ŚRADDH-adh-ĀNAT-", "-ŚRADDHADHĀN-at", tokenisedText) tokenisedText <- gsub("-yādh-YĀVAS-ati", "-y-ĀDHYĀVAS-ati", tokenisedText) tokenisedText <- gsub("rit-ĪYAMā-NAM-", "-RITĪY-yamānam", tokenisedText) tokenisedText <- gsub("p-HALA-", "-PHAL-a", tokenisedText) tokenisedText <- gsub("pu-TRASY-", "-PUTR-asy", tokenisedText) tokenisedText <- gsub("va-RAM-", "-VAR-am ", tokenisedText) tokenisedText <- gsub("uda-RAM-", "-UDAR-am", tokenisedText) tokenisedText <- gsub("-ANANT-a-RAM-", "-ANANTAR-am", tokenisedText) tokenisedText <- gsub("-PŪRVAT-a-RAM-", "-PŪRV-ataram ", tokenisedText) tokenisedText <- gsub("-PUNARAP-a-REṆ-", "-PUNARAPAR-eṇ", tokenisedText) tokenisedText <- gsub("-PUNARAP-i", "-PUNARAPI-", tokenisedText) tokenisedText <- gsub("-PUNARAP-ara", "-PUNARAPAR-a", tokenisedText) tokenisedText <- gsub("-PUNARAP-a-RAM-", "-PUNARAPAR-am", tokenisedText) tokenisedText <- gsub("-PUNARAP--RAMĀṆ-", "-PUNAR-a-PRAMĀṆ-", tokenisedText) tokenisedText <- gsub("-NĀV-a-TARANT-", "n-ĀVATAR-ant", tokenisedText) tokenisedText <- gsub("-(.*?)IṢYAT-([ei])", "\\1-iṣyat\\2", tokenisedText) tokenisedText <- gsub("ābh-AYAM-", "-ĀBHAY-am", tokenisedText) tokenisedText <- gsub("bh-AYAṂ-", "-BHAY-aṃ", tokenisedText) tokenisedText <- gsub("h-ETVA-", "-HET-va", tokenisedText) tokenisedText <- gsub("n-ĀNART-h-", "n^ān-ARTH-", tokenisedText) tokenisedText <- gsub("-ĀNART-h-", "ān-ARTH-", tokenisedText) tokenisedText <- gsub("-ISTR-ibh", "is-TRI-bh", tokenisedText) tokenisedText <- gsub("rā-JAMA-h-ĀMĀTY-", "-RĀJ-a-MAHĀMĀT-y", tokenisedText) tokenisedText <- gsub("-ā-YĀDA-ra-JĀT-ara-JĀT-a", "-āy-ĀDAR-a-JĀT-a", tokenisedText) tokenisedText <- gsub("āpah-ĀRIṆ-", "-APAHĀR-iṇ", tokenisedText) tokenisedText <- gsub("h-ĀSAY-itu", "-HĀSAY-itu", tokenisedText) tokenisedText <- gsub("apah-ĀPAYIT-u", "-APAHĀPAY-itu", tokenisedText) tokenisedText <- gsub("-SAMANV-āh-RIY-", "-SAMANVAHRIY-", tokenisedText) tokenisedText <- gsub("h-ASITAK-r-ĪḌIT-", "-HASIT-a-KRĪḌIT-", tokenisedText) tokenisedText <- gsub("bh-ĀNAS-ya", "-BHĀN-asya", tokenisedText) tokenisedText <- gsub("h-ĪNĀN-ā([ṃmñ])", "-HĪN-ānā\\1", tokenisedText) tokenisedText <- gsub("h-ETVA-", "-HET-va", 
    tokenisedText <- gsub("bh-AVABANDH-a-NĀN-", "-BHAV-a-BANDHAN-ān", tokenisedText)
    tokenisedText <- gsub("bh-ĀṆAK-", "-BHĀṆAK-", tokenisedText)
    tokenisedText <- gsub("bh-RAMAT-", "-BHRAM-at", tokenisedText)
    tokenisedText <- gsub("bh-RĀJAT-", "-BHRĀJ-at", tokenisedText)
    tokenisedText <- gsub("v-ARṆAV-ādī", "-VARṆA-VĀD-ī", tokenisedText)
    tokenisedText <- gsub("h-ART-", "-HART-", tokenisedText)
    tokenisedText <- gsub("āh-VĀNĀY-", "-ĀHVĀN-āy", tokenisedText)
    tokenisedText <- gsub("āh-ĀRAM-", "-ĀHĀR-am", tokenisedText)
    tokenisedText <- gsub("-PRAJÑAPT-ivā-DASY-", "-PRAJÑAPT-i-VĀD-asy", tokenisedText)
    tokenisedText <- gsub("-?eṣ-VASY-", "-eṣv asy", tokenisedText)
    tokenisedText <- gsub("-STH-ā-NAMASY-", "-STHĀN-am asy", tokenisedText)
    tokenisedText <- gsub("-?at-VASY-", "-atvasy", tokenisedText)
    tokenisedText <- gsub("Y-a-TYASY-", "Y-aty asy", tokenisedText)
    tokenisedText <- gsub("--IRASY-", "-ir asy", tokenisedText)
    tokenisedText <- gsub("ya-VĀṄK--URASY-", "-YAVĀ-ṅ-KUR-asy", tokenisedText)
    tokenisedText <- gsub("nā-NYASY-", "n^ānyasy", tokenisedText)
    tokenisedText <- gsub("a-NYASY-", " anyasy", tokenisedText)
    tokenisedText <- gsub("-i-RAP-ūr-VASY", "-ir-a-PŪRV-asy", tokenisedText)
    tokenisedText <- gsub("-NID-ā-NAMASY-", "-NIDĀNA-m asy", tokenisedText)
    tokenisedText <- gsub("-?sū-TRASY-", "-SŪTR-asy", tokenisedText)
    tokenisedText <- gsub("ya-IRASY-", "yair asy", tokenisedText)
    tokenisedText <- gsub("-āk-ĀRASY-", "-ĀKĀR-asy", tokenisedText)
    tokenisedText <- gsub("k-ĀRASY-", "-KĀR-asy", tokenisedText)
    tokenisedText <- gsub("-ETADAVOCA-", "-ETAD- -AVOCA-", tokenisedText)
    tokenisedText <- gsub("-AITADAVOCA-", "-AITAD- -AVOCA-", tokenisedText)
    tokenisedText <- gsub("-STRI-bh", "s-TRI-bh", tokenisedText)
    tokenisedText <- gsub("an-YĀN-([iy])", "anyān\\1", tokenisedText)
    tokenisedText <- gsub("-YĀN-([iy])", "yān\\1", tokenisedText)
    tokenisedText <- gsub("([-| ])-YĀN-([-| ])", "yān", tokenisedText)
    #tokenisedText <- gsub("([yt])-AVĀ", "\\1av-Ā", tokenisedText)
    tokenisedText <- gsub("K-ar-MAṆ-", "-KARM-aṇ", tokenisedText)
    tokenisedText <- gsub("itye", "ity e", tokenisedText)
    tokenisedText <- gsub("asyet([iy])", "asya et\\1 ", tokenisedText)
    tokenisedText <- gsub("itīda([mṃ])", "it^īda\\1 ", tokenisedText)
    tokenisedText <- gsub("itīme", "it^īme ", tokenisedText)
    tokenisedText <- gsub("itīmau", "it^īmau ", tokenisedText)
    tokenisedText <- gsub("itīmān([īyi])", "it^īmān\\1", tokenisedText)
    #tokenisedText <- gsub("(an?t)īt([īyi])", "\\1^īt\\2", tokenisedText)
    tokenisedText <- gsub("-([āa])mih", "-\\1m ih", tokenisedText)
    tokenisedText <- gsub("([āa])ñc", "\\1ñ c", tokenisedText)
    tokenisedText <- gsub("asyeh-Ā", "asy^eh-Ā", tokenisedText)
    tokenisedText <- gsub("asyeha", "asy^eha", tokenisedText)
    tokenisedText <- gsub("s-ŪCYAT-", "-SŪCY-at", tokenisedText)
    tokenisedText <- gsub("-CĀP-([iy])", " c^āp\\1 ", tokenisedText)
    tokenisedText <- gsub("-CĀP-ī", "cāpī", tokenisedText)
    tokenisedText <- gsub("p-ARI-", "-PARI-", tokenisedText)
    tokenisedText <- gsub("-PARI--", "-PARI", tokenisedText)
    tokenisedText <- gsub("-DHARMAPĀRYAY-", "-DHARM-a-PĀRYAY-", tokenisedText)
    tokenisedText <- gsub("-([iī])-TAVY-", "-\\1tavy", tokenisedText)
    tokenisedText <- gsub("t-VĀY-", "tvāy", tokenisedText)
    tokenisedText <- gsub("Y--ATĪT-([iyī])", "Y-atīt\\1", tokenisedText)
    tokenisedText <- gsub("-KARTUKĀM-", "-KART-u-KĀM-", tokenisedText)
    tokenisedText <- gsub("ramit([oaā])", "-RAMIT-\\1", tokenisedText)
tokenisedText <- gsub("([yv])aman([oaā])", "-\\1AMAN-\\1", tokenisedText) tokenisedText <- gsub("vacar([oaā])", "-VACAR-\\1", tokenisedText) tokenisedText <- gsub("avan([oaā])", "-AVAN-\\1", tokenisedText) tokenisedText <- gsub("tamar([oaā])", "-TAMAR-\\1", tokenisedText) tokenisedText <- gsub("tāmar([oaā])", "-TĀMAR-\\1", tokenisedText) tokenisedText <- gsub("tasar([oaā])", "-TASAR-\\1", tokenisedText) tokenisedText <- gsub("-APADĀN-([iyī])", "a-PAD-ān\\1", tokenisedText) tokenisedText <- gsub("pa-RAS-y", "-PAR-asy", tokenisedText) tokenisedText <- gsub(" t-VṚDDH-", " tv-ṚDDH-", tokenisedText) tokenisedText <- gsub("t-RIB-h", " -TRI-bh", tokenisedText) tokenisedText <- gsub("-VĪTARĀG-", "-VĪT-a-RĀG-", tokenisedText) tokenisedText <- gsub("-TRIVID-", "-TRI--VIDH-", tokenisedText) tokenisedText <- gsub("-DVIVID-", "-DV-i-VIDH-", tokenisedText) tokenisedText <- gsub("-BHAYABHĪT-", "-BHAY-a-BHĪT-", tokenisedText) tokenisedText <- gsub("-GOTRASTH-", "-GOTR-a-STH-", tokenisedText) tokenisedText <- gsub("-GOTRASTHĀN-", "-GOTR-a-STHĀN-", tokenisedText) tokenisedText <- gsub("-BHAGAVĀNĀHA-", "-BHAGAV-ān-ĀH-a", tokenisedText) tokenisedText <- gsub("-TADDHET-", "-TAD-d-HET-", tokenisedText) #tokenisedText <- gsub("-ABDHĀT-", "-AB-DHĀT-", tokenisedText) tokenisedText <- gsub("-VĀYUDHĀT-", "-VĀY-u-DHĀT-", tokenisedText) tokenisedText <- gsub("-TEJODHĀT-", "-TEJ-o-DHĀT-", tokenisedText) tokenisedText <- gsub("-APTEJ-", "-AP-TEJ-", tokenisedText) tokenisedText <- gsub("-TEJOJAL-", "-TEJ-o-JAL-", tokenisedText) tokenisedText <- gsub("-ITYUCYAT-", "ity -UCYAT-", tokenisedText) tokenisedText <- gsub("-ITYUCYANT-", "ity -UCYANT-", tokenisedText) tokenisedText <- gsub("([-| ])nat([uv])", "\\1na t\\2", tokenisedText) tokenisedText <- gsub("itye", "ity e", tokenisedText) tokenisedText <- gsub("-MIT-([iīy])", "m it\\1", tokenisedText) tokenisedText <- gsub("([ou])rit([iy])", "\\1r it\\2", tokenisedText) tokenisedText <- gsub("-(a?ir)it([iy])", "\\1 it\\2", tokenisedText) tokenisedText <- gsub("-([ui])rit([iy])", "\\1r it\\2", tokenisedText) tokenisedText <- gsub("asyet([iy])", "asya et\\1 ", tokenisedText) tokenisedText <- gsub("-([aā])vima([uv])", "-\\1v ima\\2", tokenisedText) tokenisedText <- gsub("itīda([mṃ])", "it^īda\\1 ", tokenisedText) tokenisedText <- gsub("itīm([āe])", "it^īme ", tokenisedText) tokenisedText <- gsub("itīmau", "it^īmau ", tokenisedText) tokenisedText <- gsub("itīmān([īyi])", "it^īmān\\1", tokenisedText) tokenisedText <- gsub("(an?t)īt([īyi])", "\\1^īt\\2", tokenisedText) tokenisedText <- gsub("-([āa])mih", "-\\1m ih", tokenisedText) tokenisedText <- gsub("([āa])ñc", "\\1ñ c", tokenisedText) tokenisedText <- gsub("-([āa])nn-Ā", "-\\1n n-Ā", tokenisedText) tokenisedText <- gsub("at-VĀD-", "atvād ", tokenisedText) tokenisedText <- gsub("-eṣv([āaeiī])", "-eṣv \\1", tokenisedText) tokenisedText <- gsub("-eṣv([āaeiī])", "-eṣv \\1", tokenisedText) tokenisedText <- gsub("[-| ]d-VAYA([MṂ])-", "-DV-aya\\L\\1", tokenisedText, ignore.case=FALSE, perl=TRUE) tokenisedText <- gsub("-DVAY-o", "-DV-ayo", tokenisedText) #tokenisedText <- gsub("d-V([IY])", "-DV-\\L\\1-", tokenisedText, ignore.case=FALSE, perl=TRUE) #tokenisedText <- gsub("d-VĪ", "-DV--Ī", tokenisedText) tokenisedText <- gsub("dv-ĀBHY-", "-DV-ābhy", tokenisedText) tokenisedText <- gsub("-APYUKT-", "-APY--UKT-", tokenisedText) tokenisedText <- gsub("p-ĀDĀN-ān", "-PĀDĀN-ān", tokenisedText) tokenisedText <- gsub("p-ĀPAMITY-", "-PĀP-am ity", tokenisedText) tokenisedText <- gsub("ap-ETA([MṂ])-", "-APET-a\\L\\1", tokenisedText, 
ignore.case=FALSE, perl=TRUE) tokenisedText <- gsub("ś-EṢĀṆ-", "-ŚEṢ-āṇ", tokenisedText) tokenisedText <- gsub("ś-ĀNTAT-v", "-ŚĀNT-atv", tokenisedText) tokenisedText <- gsub("ś-UBHAY-", "-ŚUBH-ay", tokenisedText) tokenisedText <- gsub("ś-UDDHĀN--", "-ŚUDDH-ān-", tokenisedText) tokenisedText <- gsub("kṣ-AYA([MṂ])-", "-KṢAY-a\\L\\1", tokenisedText, ignore.case=FALSE, perl=TRUE) tokenisedText <- gsub("-ṣ-VEV-", "-ṣv ev", tokenisedText) tokenisedText <- gsub("[-| ]se-NĀD-", "-SEN-ād", tokenisedText) tokenisedText <- gsub("iṣ-VEV-", "iṣv ev", tokenisedText) tokenisedText <- gsub("b-UDDHIT-v", "-BUDDH-itv", tokenisedText) tokenisedText <- gsub("b-UDDHIT--VĀC-", "-BUDDH-itvāc", tokenisedText) tokenisedText <- gsub("b-UDDHIT--VĀD-", "-BUDDH-itvād", tokenisedText) tokenisedText <- gsub("b-ĀLAS-", "-BĀL-as", tokenisedText) tokenisedText <- gsub("b-HĪK-ṣ", "-BHĪKṢ-", tokenisedText) tokenisedText <- gsub("b-UDDHAV-", "-BUDDH-av", tokenisedText) tokenisedText <- gsub("b-HAS-m", "-BHASM-", tokenisedText) tokenisedText <- gsub("bh-AVAŚ-", "-BHAV-aś", tokenisedText) tokenisedText <- gsub("bh-AVAY-", "-BHAV-ay", tokenisedText) tokenisedText <- gsub("bh-AVABH-og", "-BHAV-a-BHOG-", tokenisedText) tokenisedText <- gsub("-SADVĀC-", "-SAD--VĀC-", tokenisedText) tokenisedText <- gsub("y-UṢṂ-", "-YUṢṂ-", tokenisedText) tokenisedText <- gsub("([kty])-(ASMIN)-", " \\1\\L\\2 ", tokenisedText, perl = TRUE) tokenisedText <- gsub("i-MĀNY-", " imāny ", tokenisedText) tokenisedText <- gsub("-EKAPHAL-", " eka-PHAL-", tokenisedText) tokenisedText <- gsub("ma-HANT-", "-MAH-ant", tokenisedText) tokenisedText <- gsub("([ty])-ĀVATSAR-v", "\\1āvat -SARV-", tokenisedText) tokenisedText <- gsub("-i-RAS-m([iīy])", "-ir asm\\1", tokenisedText) tokenisedText <- gsub("-i-RAS-ast([uv])", "-ir ast\\1", tokenisedText) tokenisedText <-gsub("-ĀN-([^a])", "-ān\\1", tokenisedText, perl=TRUE) tokenisedText <- gsub("n-AVABH-ū-MIK-", "-NAV-a-BHŪM-ik", tokenisedText) tokenisedText <- gsub("-BHAVAPHAL-", "-BHAV-a-PHAL-", tokenisedText) tokenisedText <- gsub("-BHĀVALAKṢAṆ-", "-BHĀV-a-LAKṢAṆ-", tokenisedText) tokenisedText <- gsub("-BHĀVĀNTAR-", "-BHĀV-ĀNTAR-", tokenisedText) tokenisedText <- gsub("-BHĀVĀKṢAY-", "-BHĀV-ĀKṢAY-", tokenisedText) tokenisedText <- gsub("bh-AVASTR-i-", "-BHAV-as-TRI-", tokenisedText) tokenisedText <- gsub("bh-AVASTH-", "-BHAV-a-STH-", tokenisedText) tokenisedText <- gsub("bh-AVAGRAHAṆ-", "-BHAV-a-GRAHAṆ-", tokenisedText) tokenisedText <- gsub("bh-AVAGRAH-", "-BHAV-a-GRAH-", tokenisedText) tokenisedText <- gsub("ph-ĀLĀN-", "-PHĀL-ān", tokenisedText) tokenisedText <- gsub("-SARVATRAGAHET-", "-SARVATRAG-a-HET", tokenisedText) tokenisedText <- gsub("-HETUR-", "-HET-ur ", tokenisedText) tokenisedText <- gsub("-ARHAN-", "-ARH-an", tokenisedText) tokenisedText <- gsub("-SARVATR-ag", "-SARVATRAG-", tokenisedText) tokenisedText <- gsub("-PUNARMAYĀ-", "-PUNAR--MAY-ā", tokenisedText) tokenisedText <- gsub("-ALP-ā-NAL-p", "-ALP-ān-ALP-", tokenisedText) tokenisedText <- gsub("pa-RAS-y", "-PAR-asy", tokenisedText) tokenisedText <- gsub("ta-DĀSY-", "-TAD-āsy", tokenisedText) tokenisedText <- gsub("-a-MADH-ar", "-am-ADHAR-", tokenisedText) tokenisedText <- gsub("-KAT-v", "katv", tokenisedText) tokenisedText <- gsub("-BHEṢYAT-", "-BHEṢ-yat", tokenisedText) tokenisedText <- gsub("-BHAVAMARD([AN])?-", "-BHAV-a-MARD\\1-", tokenisedText) tokenisedText <- gsub("-BHAVARĀG-", "-BHAV-a-RĀG-", tokenisedText) tokenisedText <- gsub("-BHAVAGAT-", "-BHAV-a-GAT-", tokenisedText) tokenisedText <- gsub("-BHAVAGAMAN-", "-BHAV-a-GAMAN-", tokenisedText) 
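# The compound rules above insert stem boundaries into fused compounds,
# e.g. "-BHAVARĀG-" becomes "-BHAV-a-RĀG-". Any single rule can be tried in
# isolation on a toy string (illustrative check only, not part of the pipeline):
# gsub("-BHAVARĀG-", "-BHAV-a-RĀG-", "-BHAVARĀG-eṣu") # "-BHAV-a-RĀG-eṣu"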
tokenisedText <- gsub("bh-ĀGAM-", "-BHĀG-am", tokenisedText) # tokenisedText <- gsub("bh-([ĀA])VA(.*?)-", "-BH\\1V-a-\\2", tokenisedText) tokenisedText <- gsub("-RAM-it([iīy])", "ram it\\1", tokenisedText) tokenisedText <- gsub("dū-RAM-it", "-DŪR-am", tokenisedText) tokenisedText <- gsub("ta-RAM-", "taram", tokenisedText) tokenisedText <- gsub("na-RAM-", "-NAR-am", tokenisedText) tokenisedText <- gsub("-([ĀA])ṂŚ-([^āaeu])", "\\L\\1ṃś \\2", tokenisedText, ignore.case=FALSE, perl=TRUE) tokenisedText <- gsub("-a-MĀT--MĀN-", "-am-ĀTM-ān", tokenisedText) tokenisedText <- gsub("-a-MĀT--MAN-", "-am-ĀTM-an", tokenisedText) tokenisedText <- gsub("-ĀTMĀNA([MṂ])-", "-ĀTM-āna\\L\\1", tokenisedText, ignore.case=FALSE, perl=TRUE) tokenisedText <- gsub("-ĀTM([ĀA]N)-", "-ĀTM-\\L\\1", tokenisedText, ignore.case=FALSE, perl=TRUE) tokenisedText <- gsub("([iīe])-MĀN-", "\\1mān", tokenisedText) tokenisedText <- gsub("āt-MĀN-", "-ĀTM-ān", tokenisedText) tokenisedText <- gsub("Y-a-MĀN-", "Y-amān", tokenisedText) tokenisedText <- gsub("ya-MĀN-", "yamān", tokenisedText) tokenisedText <- gsub("kā-MĀN-", "-KĀM-ān", tokenisedText) tokenisedText <- gsub("-NĀR-HANT-", "n-ĀRH-ant", tokenisedText) tokenisedText <- gsub("-PAṬ--HANT-", "-PAṬH-ant", tokenisedText) tokenisedText <- gsub("--AYAN- ", "-ayan ", tokenisedText) tokenisedText <- gsub("-SARV-āv([aā]n?t) ", "-SARVĀV-\\1", tokenisedText) tokenisedText <- gsub("([ĪIA])K--ALP-", "-\\L\\1-KALP-", tokenisedText, ignore.case=FALSE, perl=TRUE) tokenisedText <- gsub("-TARU-", "-TAR-u", tokenisedText) tokenisedText <- gsub("VART-T(.*?)-", "VART-t\\L\\1", tokenisedText, ignore.case=FALSE, perl=TRUE) tokenisedText <- gsub("--ĀPAY-", "ĀPAY", tokenisedText) tokenisedText <- gsub("-YATTARH([IY])-", "yat -TARH\\1-", tokenisedText) tokenisedText <- gsub("-YATT([UV])-", "yat t\\L\\1 ", tokenisedText, ignore.case=FALSE, perl=TRUE) tokenisedText <- gsub("-YATTA([TD])", "yat -TA\\1-", tokenisedText) tokenisedText <- gsub("-YATTŪKT-", "yat t-ŪKT-", tokenisedText) tokenisedText <- gsub("([^aā])sy-ĀDIT-", "\\1 syād it", tokenisedText) tokenisedText <- gsub("j-ĀYAN-", "-JĀY-an", tokenisedText) tokenisedText <- gsub("([āa])-NYĀS-([uv])", "\\U\\1NY-as\\L\\2", tokenisedText, ignore.case=FALSE, perl=TRUE) tokenisedText <- gsub("va-DAT-([iīe])", "-VAD-at\\1", tokenisedText, ignore.case=FALSE, perl=TRUE) tokenisedText <- gsub("sū-TRAMID-a([mṃ])", "-SŪTR-am -IDA\\U\\1", tokenisedText, ignore.case=FALSE, perl=TRUE) tokenisedText <- gsub("j-ĀTIṢ-([uv])", "-JĀT-iṣ\\1", tokenisedText) tokenisedText <- gsub("ta-TRAST-h", "-TATRA- -STH-", tokenisedText) tokenisedText <- gsub("-ATĪT-([iīy])", "at^īt\\1", tokenisedText) tokenisedText <- gsub("yat-HĀN-y", "yath-ĀNY-", tokenisedText) tokenisedText <- gsub("([āa])-DHVAS-u", "-\\U1DHV-asu", tokenisedText, ignore.case=FALSE, perl=TRUE) tokenisedText <- gsub("ra-MAN-ti", "-RAM-anti", tokenisedText, ignore.case=FALSE, perl=TRUE) tokenisedText <- gsub("ka-ROṢ-i", "-KAR-oṣi", tokenisedText, ignore.case=FALSE, perl=TRUE) tokenisedText <- gsub("ek([aāe])", "-EK-\\1", tokenisedText) tokenisedText <- gsub("e-KĀM-", "-EK-ām", tokenisedText) tokenisedText <- gsub("-MAN-([^aoā])", "man\\1", tokenisedText) tokenisedText <- gsub("-NAHY-", " na hy ", tokenisedText) #tokenisedText <- gsub("-NAM-([^o|as])", "nam\\1", tokenisedText) tokenisedText <- gsub("([ |-])nam([aḥ[o|as|aś])", "\\1-NAM-\\2", tokenisedText) tokenisedText <- gsub("syāda-", "syād a-", tokenisedText) tokenisedText <- gsub("([AĀIĪ])N-ir-HĀR-a-", "-\\L\\1-NIRHĀR-a", tokenisedText) tokenisedText <- 
gsub("-SAN-t([iy])", "sant\\1", tokenisedText) tokenisedText <- gsub("(it[iy])", " \\1 ", tokenisedText, ignore.case=FALSE, perl=TRUE) # tokenisedText <- gsub("([^\\^]īt[iy])", "^\\1 ", tokenisedText, ignore.case=FALSE, perl=TRUE) #tokenisedText <- gsub("(ast)([^iīyuūv-])", "\\1 \\2", tokenisedText, ignore.case=FALSE, perl=TRUE) #tokenisedText <- gsub("-(it)?([āa])([mṃ])(?!i |[śsṣ]|ī[th])", "-\\1\\2\\3 \\4", tokenisedText, ignore.case=FALSE, perl=TRUE) tokenisedText <- gsub("--([ĀA])Ṃ([ŚS])-", "-\\L\\1ṃ\\L\\2 ", tokenisedText, ignore.case=FALSE, perl=TRUE) # check tokenisedText <- gsub("([a-z]|[āīūṛḷṇḍñṅḥśṣṭḍṃ])īt([iy])", "\\1^īt\\2 ", tokenisedText, ignore.case=FALSE, perl=TRUE) tokenisedText <- gsub("([āaiīuūmṃ])([āaiīuūmṃ])([mṃ])? st", "\\1\\2s t", tokenisedText,perl=TRUE) tokenisedText <- gsub("([āaiīuūmṃ])([āaiīuūmṃ])([mṃ])?st", "\\1\\2s t", tokenisedText,perl=TRUE) tokenisedText <- gsub("upa iti", "-UPAI-ti", tokenisedText,perl=TRUE) tokenisedText <- gsub("upaiti", "-UPAI-ti", tokenisedText,perl=TRUE) tokenisedText <- gsub("AT-v([aāe])", "-atv\\1", tokenisedText,perl=TRUE) tokenisedText <- gsub("-āhuḥ", "-ĀH-uḥ", tokenisedText,perl=TRUE) tokenisedText <- gsub("-an-AYA([MṂ])", "-a-NAY-a\\L\\1", tokenisedText,perl=TRUE) tokenisedText <- gsub("āl-AYA([MṂ])", "-ĀLAY-a\\L\\1", tokenisedText,perl=TRUE) tokenisedText <- gsub("d-VIPRAKĀR-", "-DV-i-PRAKĀR-", tokenisedText,perl=TRUE) tokenisedText <- gsub("d-VAYA([MṂ])-", "-DVAY-a\\L\\1", tokenisedText,perl=TRUE) tokenisedText <- gsub("ad-VAYA([MṂ])-", "a-DVAY-a\\L\\1", tokenisedText,perl=TRUE) tokenisedText <- gsub("t-VIH-a", " tv iha ", tokenisedText,perl=TRUE) tokenisedText <- gsub("t-VAN-a-", " tv an-a", tokenisedText,perl=TRUE) tokenisedText <- gsub("t-VAN--", " tv an-", tokenisedText,perl=TRUE) tokenisedText <- gsub("([ |-])e-KANAY-a-", "\\1-EK-a-NAY-a", tokenisedText,perl=TRUE) tokenisedText <- gsub("-KĀŚ-([ |-])", "kaś\\1", tokenisedText,perl=TRUE) tokenisedText <- gsub("āl-AYAM-ano", "-ĀLAY-a-MAN-o", tokenisedText,perl=TRUE) tokenisedText <- gsub("d-VIPRAKĀR-", "-DV-i-PRAKĀR-", tokenisedText,perl=TRUE) tokenisedText <- gsub("-ARHAN-t", "-ARH-ant", tokenisedText,perl=TRUE) tokenisedText <- gsub("y-ADY-", "yady", tokenisedText,perl=TRUE) tokenisedText <- gsub("-MANO-", "-MAN-o", tokenisedText,perl=TRUE) tokenisedText <- gsub("([^Ā])(YAN?T)-([iyev])", "\\1-\\L\\2\\3", tokenisedText,perl=TRUE) tokenisedText <- gsub("([^Ā])(YAN?T)-a ([iae])", "\\1-\\L\\2 \\3", tokenisedText,perl=TRUE) tokenisedText <- gsub("AN-(t[iyeoau])", "-an\\1", tokenisedText) #tokenisedText <- gsub("-UPAYĀT-([iyeoau])", "APAY-āt\\1", tokenisedText) tokenisedText <- gsub("-SUKH-adā([mṃ])", "-SUKH-a-D-ā\\1", tokenisedText) tokenisedText <- gsub(" anyaḥ ", " -ANY-aḥ ", tokenisedText) tokenisedText <- gsub(" anyo ", " -ANY-o ", tokenisedText) tokenisedText <- gsub("-YĀBH-yā([ṃm])", " -yābhyā\\1 ", tokenisedText) tokenisedText <- gsub("-YĀBH-yo", " -yābhyo ", tokenisedText) tokenisedText <- gsub("-YĀBH-ya([ḥsśs])", " -yābhy\\1 ", tokenisedText) tokenisedText <- gsub("-ĀRYĀṆ-āṃ","-ĀRY-āṇāṃ", tokenisedText) tokenisedText <- gsub("-YĀNAT-r-A", "-YĀN-a-TRA", tokenisedText) tokenisedText <- gsub("nai-VĀS-t([īyi])", "n^aiv^āst\\1", tokenisedText) tokenisedText <- gsub("-VĀP-([iīy])", "v^āp\\1", tokenisedText) tokenisedText <- gsub("-ĀTMĀNA([ṂM])-", "-ĀTM-āṇa\\L\\1 ", tokenisedText, perl=TRUE) tokenisedText <- gsub("-ād(an?)-", "-ād \\1-", tokenisedText, perl=TRUE) tokenisedText <- gsub("-ād-", "-ād -", tokenisedText, perl=TRUE) tokenisedText <- gsub("-ANĀDIKĀL-", 
"an-ĀD-i-KĀL-", tokenisedText, perl=TRUE) tokenisedText <- gsub("-DI([GK])--", "-DI-\\L\\1-", tokenisedText, perl=TRUE) tokenisedText <- gsub("-DAŚADI-([gkṅ])", "-DAŚ-a-DI-\\1", tokenisedText, perl=TRUE) tokenisedText <- gsub("ma-NY-(a?nt)", "-MAN-y\\1", tokenisedText, perl=TRUE) tokenisedText <- gsub("hrī", "-HR-ī", tokenisedText, perl=TRUE) tokenisedText <- gsub("iy-AMATR-([aā])", "iyam atr\\1", tokenisedText, perl=TRUE) tokenisedText <- gsub("-EKATV-", "-EK-atv", tokenisedText, perl=TRUE) tokenisedText <- gsub("ITV-([aā])", "-itv\\1", tokenisedText, perl=TRUE) tokenisedText <- gsub("-SADASA([TDNC])-", "-SAD-a-SA\\1-", tokenisedText, perl=TRUE) tokenisedText <- gsub("([ |-])tri-", "\\1-TR-i-", tokenisedText, perl=TRUE) tokenisedText <- gsub("-STR-ibh", "s-TR-ibh", tokenisedText, perl=TRUE) tokenisedText <- gsub("tri-", "-TR-i-", tokenisedText, perl=TRUE) tokenisedText <- gsub("triṣ([uv])", "-TR-iṣ\\1", tokenisedText, perl=TRUE) tokenisedText <- gsub("-ādau", "-ĀD-au", tokenisedText) tokenisedText <- gsub("-ādi", "-ĀD-i", tokenisedText) tokenisedText <- gsub("-SVA-", "-SV-a-", tokenisedText) tokenisedText <- gsub("-NAMAS-", "-NAM-as", tokenisedText) tokenisedText <- gsub("-NĀN--([^Ā|O|E])", "nān-\\1", tokenisedText, perl=TRUE) tokenisedText <- gsub("antav([āa][nṃ])", "-ANT-av\\1", tokenisedText, perl=TRUE) tokenisedText <- gsub("-ŚĀŚVAT-ānt([āae])", "-ŚĀŚVAT--ĀNT-\\1", tokenisedText, perl=TRUE) tokenisedText <- gsub("-UCCHED-ānt([āae])", "-UCCHED--ĀNT-\\1", tokenisedText, perl=TRUE) tokenisedText <- gsub("([a-z]|[āīūṛḷṇḍñṅḥśṣṭḍṃ])([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "\\1-\\2", tokenisedText) tokenisedText <- gsub("([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])([a-z]|[āīūṛḷṇḍñṅḥśṣṭḍṃ])", "\\1-\\2", tokenisedText) tokenisedText <- gsub("([a-z]|[āīūṛḷṇḍñṅḥśṣṭḍṃ])([a-z]|[āīūṛḷṇḍñṅḥśṣṭḍṃ])", "\\1\\2", tokenisedText) tokenisedText <- gsub("---", "--", tokenisedText) tokenisedText <- gsub(" --", " -", tokenisedText) tokenisedText <- gsub("-- ", "- ", tokenisedText) tokenisedText <- gsub("-- ", "- ", tokenisedText) tokenisedText <- gsub("-- ", "- ", tokenisedText) tokenisedText <- gsub("--([a-z]|[āīūṛḷṇḍñṅḥśṣṭḍṃ])", "-\\1", tokenisedText) tokenisedText <- gsub("\\^\\^", "^", tokenisedText) tokenisedText <- gsub("\\^-", "-^", tokenisedText) ##tokenisedText <- gsub("(<.*?>)", "\\L\\1", tokenisedText, ignore.case=FALSE, perl=TRUE) # move this to gramrel&humans cleaner tokenisedText <- gsub(" +", " ", tokenisedText) # write(tokenisedText, file= paste0(TextName,"_Cleaned.txt")) return(tokenisedText) } #### this version if to be used just before gramrel&humans FinalCleaner <- function(tokenisedText, TextName){ #tokenisedTextname <- deparse(substitute(tokenisedText)) ## I comment the following lines for the evaluation test, so that it harmonises with conventions I used for 'gold' # remove this lines after done with evaluation tokenisedText <- gsub("-(AP[IY])- ", " \\L\\1 ", tokenisedText, perl=TRUE) tokenisedText <- gsub("-(ĀP[IY])- ", "^\\L\\1 ", tokenisedText, perl=TRUE) tokenisedText <- gsub("api", " api ", tokenisedText, perl=TRUE) tokenisedText <- gsub("([āa])py([^a])","\\1py \\2", tokenisedText) tokenisedText <- gsub("([āao]n?)ty(?![aā][imṃnñḥśs]|[āao][ |-]|[ie][rḥ])([aeiouāīū])","\\1ty \\2", tokenisedText, perl=TRUE) tokenisedText <- gsub("-([iīa]t)-(V[AĀ]D)-(?![aāe])", "-\\1\\L\\2", tokenisedText,perl=TRUE) tokenisedText <- gsub("ta-RAT-", "tarat", tokenisedText) tokenisedText <- gsub("([aāiī])-VĀDIT-([iīy])", "\\1tvād it\\2 ", tokenisedText) tokenisedText <- gsub("t-VĀY-", "tvāy", tokenisedText) tokenisedText <- 
gsub("-([aāiī])t-VĀC-(\\s?c)", "-\\1tvāc\\2", tokenisedText) tokenisedText <- gsub("-EVA-", " eva ", tokenisedText, perl=TRUE) tokenisedText <- gsub("-(EVA[MṂ])-", " \\L\\1 ", tokenisedText, perl=TRUE) tokenisedText <- gsub("-(AIVA[MṂ]?)-", "^\\L\\1", tokenisedText, perl=TRUE) tokenisedText <- gsub("-AIV-", "^aiv", tokenisedText, perl=TRUE) tokenisedText <- gsub("-TATRAIV-(a[mṃ]?)", " tatr^aiv\\1 ", tokenisedText, perl=TRUE) tokenisedText <- gsub(" e-VĀNY-", " ev-ĀNY-", tokenisedText, perl=TRUE) tokenisedText <- gsub(" e-VĀSY-", " ev^āsy", tokenisedText, perl=TRUE) tokenisedText <- gsub("e-VĀS-aṃ-([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", " ev-ĀSAṂ\\1", tokenisedText, perl=TRUE) tokenisedText <- gsub("-SAC-(e[tdn][ |-])", " sac\\1", tokenisedText, perl=TRUE) tokenisedText <- gsub("-TATR-", "tatr", tokenisedText, perl=TRUE) tokenisedText <- gsub("-ITYATR-", "ity atr", tokenisedText, perl=TRUE) tokenisedText <- gsub("-TATR--EDA([MṂ])-", " tatr^eda\\L\\1 ", tokenisedText, perl=TRUE) tokenisedText <- gsub("c(ai[vn]a[mṃ]?)", " c^\\1 ", tokenisedText, perl=TRUE) tokenisedText <- gsub("c(ān-[AĀIĪUŪOE])", " c^\\1", tokenisedText, perl=TRUE) tokenisedText <- gsub("([a-z]|[āīūṛḷṇḍñṅḥśṣṭḍṃ])evā", "\\1 evā", tokenisedText, perl=TRUE) tokenisedText <- gsub("-EV-", "ev^", tokenisedText, perl=TRUE) tokenisedText <- gsub("vet([iy])", "v^et\\1", tokenisedText, perl=TRUE) tokenisedText <- gsub("-ast([uv])", "-as t\\1", tokenisedText, perl=TRUE) tokenisedText <- gsub(" yast([uv])", "yas t\\1", tokenisedText, perl=TRUE) tokenisedText <- gsub(" as t", " ast", tokenisedText, perl=TRUE) tokenisedText <- gsub(" ast([aāoe][ḥśsmṃnñ]?)", " -AST-\\1", tokenisedText, perl=TRUE) tokenisedText <- gsub("tv(?!a|ā|e[nṇ]|-)(.)", "tv \\1", tokenisedText, perl=TRUE) tokenisedText <- gsub("([IĪ]?[TṬ])AVY-", "-\\L\\1avy", tokenisedText, perl=TRUE) tokenisedText <- gsub("-PUNA([RḤSŚ])-", " puna\\L\\1 ", tokenisedText, perl=TRUE) tokenisedText <- gsub("-PUNARAP([IĪY])-", " punar ap\\L\\1 ", tokenisedText, perl=TRUE) tokenisedText <- gsub("-PUNARAPARA([MṂ]?)-", " punar apara\\L\\1 ", tokenisedText, perl=TRUE) tokenisedText <- gsub("-BHAGAVĀNĀH-a", "-BHAGAV-ān-ĀH-a", tokenisedText, perl=TRUE) tokenisedText <- gsub("-TATRAIV--", " tatr^aiv^-", tokenisedText, perl=TRUE) tokenisedText <- gsub("apīha", " ap^iha ", tokenisedText, perl=TRUE) tokenisedText <- gsub("-([TY]ATHĀ)-", " \\L\\1 ", tokenisedText, perl=TRUE) tokenisedText <- gsub("-TADYATH-", " tadyath ", tokenisedText, perl=TRUE) tokenisedText <- gsub("-TADYATHĀP-", " tadyathāp ", tokenisedText, perl=TRUE) tokenisedText <- gsub("-at-VĀC-c", "-atvāc c", tokenisedText, perl=TRUE) tokenisedText <- gsub("-at-VĀC--C", "-atvāc -C", tokenisedText, perl=TRUE) tokenisedText <- gsub("-a-YAV-at", "ayavat", tokenisedText, perl=TRUE) tokenisedText <- gsub("-UPĀYA([ṂM])-", "-UPĀY-a\\L\\1", tokenisedText, perl=TRUE) tokenisedText <- gsub("-ĀNT-([eiy])", "-ānt\\1", tokenisedText, perl=TRUE) tokenisedText <- gsub(" āyu[ḥsṣ]", " -ĀY-u\\1 ", tokenisedText) tokenisedText <- gsub("ojo", " -OJ-o", tokenisedText) tokenisedText <- gsub("oja[sḥś]", " -OJ-a\\1 ", tokenisedText) tokenisedText <- gsub("-CATU([RḤSŚṢ])-", "-CATU-\\L\\1", tokenisedText, perl=TRUE) tokenisedText <- gsub("-atet([yi])", " -at^et\\1", tokenisedText) tokenisedText <- gsub("IT--VĀC-c", "-itvāc c", tokenisedText) tokenisedText <- gsub("p-RAṆ-", "-PRAṆ-", tokenisedText) tokenisedText <- gsub("p-ĀPAJ-", "-PĀP-a-J-", tokenisedText) tokenisedText <- gsub(" t-VAN-ena ", " tv anena ", tokenisedText) tokenisedText <- gsub("-ā-RAM-([ |-])", "-āram\\1", 
tokenisedText) tokenisedText <- gsub("-āṃś([aāouū][mṃnñḥśsr]? )", "--ĀṂŚ-\\1", tokenisedText, perl=TRUE) tokenisedText <- gsub(" u-PA", "-UPA", tokenisedText) tokenisedText <- gsub(" e-([TASMĀT|TASMIN])", " -E\\1", tokenisedText) tokenisedText <- gsub("AN-i-ROD-h","-a-NIRODH-", tokenisedText) tokenisedText <- gsub("-ĀVAP-([iy])", "-āv ap\\1", tokenisedText, perl=TRUE) tokenisedText <- gsub("-(AP[IY])-", "\\L\\1", tokenisedText, perl=TRUE) tokenisedText <- gsub(" as-MAT-", " asmat ", tokenisedText, perl=TRUE) tokenisedText <- gsub(" ā-SAM-([-| ])", " āsam\\1 ", tokenisedText, perl=TRUE) tokenisedText <- gsub("ap-YASM-a([dt])-", " apy asma\\1 ", tokenisedText, perl=TRUE) tokenisedText <- gsub("āś-AYA([MṂ])-", " -ĀŚAY-a\\L\\1 ", tokenisedText, perl=TRUE) tokenisedText <- gsub(" i-HĀN-y", " ih-ĀNY-", tokenisedText, perl=TRUE) tokenisedText <- gsub("-([iīāauū])?j([āa][śḥṃmn]?)([ |-])", "-\\1-J-\\2\\3", tokenisedText, perl=TRUE) tokenisedText <- gsub("-BHĀṢAMĀṆ-", "-BHĀṢ-amāṇ", tokenisedText, perl=TRUE) tokenisedText <- gsub("--APITV-([aā])", "-api tv\\1", tokenisedText, perl=TRUE) # tokenisedText <- gsub("--AJÑ-", "-a-JÑ-", tokenisedText, perl=TRUE) tokenisedText <- gsub("āha", "-ĀH-a", tokenisedText, perl=TRUE) tokenisedText <- gsub(" cān-", " c^ān-", tokenisedText, perl=TRUE) tokenisedText <- gsub("eka", "-EK-a", tokenisedText, perl=TRUE) tokenisedText <- gsub("ek-Ā", "-EK--Ā", tokenisedText, perl=TRUE) tokenisedText <- gsub("hyasau", " hy asau ", tokenisedText, perl=TRUE) tokenisedText <- gsub(" hy([aāiīuūeo])", " hy \\1", tokenisedText, perl=TRUE) tokenisedText <- gsub("(ai[vn]a[mṃñ]) ", "^\\1 ", tokenisedText, perl=TRUE) tokenisedText <- gsub("t-VAŚAK-t", "tv a-ŚAKT-", tokenisedText) tokenisedText <- gsub("-HETU([RŚ])-", "-HET-u\\L\\1 ", tokenisedText, perl=TRUE) #tokenisedText <- gsub("([ |-])hya-", "\\1hy a-", tokenisedText, perl=TRUE) tokenisedText <- gsub("hya-", "hy a-", tokenisedText, perl=TRUE) tokenisedText <- gsub("iha", " iha ", tokenisedText, perl=TRUE) tokenisedText <- gsub("īha", "^īha ", tokenisedText, perl=TRUE) tokenisedText <- gsub("-enaiv-([ĀAEO])", "-en^aiv-\\1", tokenisedText, perl=TRUE) tokenisedText <- gsub("(ev-[ĀAEO])", " \\1", tokenisedText, perl=TRUE) tokenisedText <- gsub("yadap-([ĪE])", " yad ap-\\1", tokenisedText, perl=TRUE) tokenisedText <- gsub("yad([ei][vtd][āa][mṃñ])", " yad \\1", tokenisedText, perl=TRUE) tokenisedText <- gsub("anek-", "an-EK--", tokenisedText, perl=TRUE) tokenisedText <- gsub("([-| ])ek-", "\\1-EK-", tokenisedText, perl=TRUE) # tokenisedText <- gsub("cemāni", " c^emāni ", tokenisedText, perl=TRUE) tokenisedText <- gsub("([ytk]en)ā", " \\1^ā", tokenisedText, perl=TRUE) tokenisedText <- gsub("-NĀST-([yi])", "n^āst\\1", tokenisedText) tokenisedText <- gsub("([aā]st[yi])(?!tv[āae])(.)", "\\1 \\2", tokenisedText, perl=TRUE) tokenisedText <- gsub("-([uūaāiī])śc([āa])", "-\\1ś c\\2", tokenisedText, perl=TRUE) tokenisedText <- gsub("-([uūaāiī])śc([āa])", "-\\1ś c\\2", tokenisedText, perl=TRUE) tokenisedText <- gsub("(-ā[mṃ])([ṣsś])-", "\\1\\2 -", tokenisedText) tokenisedText <- gsub("(-[āa][mṃ])ev-", "\\1 ev-", tokenisedText) tokenisedText <- gsub("-āmim([eā])", "-ām im\\1", tokenisedText) tokenisedText <- gsub("([cv])ās([tm][iy])", "\\1^ās\\2 ", tokenisedText) # tokenisedText <- gsub("(-[āa][mṃ])(?![āa]n? 
|ā[nṇ][āa][ṃmñ]?|[aā]n-Ā)([abcdghlmnprtuvāūṛḷṇḍḥṭḍ-])", "\\1 \\2", tokenisedText, perl=TRUE)# I had only long ā; check if a causes problems tokenisedText <- gsub("(-ā[mṃ]) ([ṣsś]) ", "\\1\\2 ", tokenisedText) tokenisedText <- gsub("-aira-", "-air a-", tokenisedText) tokenisedText <- gsub("-(.?bhir)a-", "-\\1 a-", tokenisedText, perl=TRUE) tokenisedText <- gsub("-yora-", "-yor a-", tokenisedText) tokenisedText <- gsub("-eṣva-", "-eṣv a-", tokenisedText) tokenisedText <- gsub("-ānāma-", "-ānām a-", tokenisedText) tokenisedText <- gsub("-āda-", "-ād a-", tokenisedText) tokenisedText <- gsub("-ānya-", "-āny a-", tokenisedText) tokenisedText <- gsub("-(y?an?ty)a-", "-\\1 a-", tokenisedText, perl=TRUE) tokenisedText <- gsub("-(ī[nṇ]ām)a-", "-\\1 a-", tokenisedText, perl=TRUE) tokenisedText <- gsub("(-ā[nd])(?![aā][mṃnñḥśs][ |-]|o[ |-])(aābcdeghlmnoprstuvūṛḷṇḍñṅḥśṣṭḍṃ)", "\\1 \\2", tokenisedText, perl=TRUE) tokenisedText <- gsub("(-ā[td])(?![aā][mṃnñḥśs][ |-]|o[ |-])([abcdghlmnoprstāūṛḷṇḍñṅḥśṣṭḍ-])", "\\1 \\2", tokenisedText, perl=TRUE) tokenisedText <- gsub("-āc-C", "-āc -C", tokenisedText) tokenisedText <- gsub("-āj-J", "-āj -J", tokenisedText) tokenisedText <- gsub("-āl-L", "-āl -L", tokenisedText) tokenisedText <- gsub("-ān-([MN])", "-ān -\\1", tokenisedText) tokenisedText <- gsub("-āda-", "-ād a-", tokenisedText) tokenisedText <- gsub("-([aā]n)(na[-| ])", "-\\1 \\2", tokenisedText) tokenisedText <- gsub("(-ena)([abcdghlnoprstuvṛḷṇḍñṅḥśṣṭḍ])", "\\1 \\2", tokenisedText) tokenisedText <- gsub("(-ānā[mṃ])([a-z]|[āīūṛḷṇḍñṅḥśṣṭḍṃ])", "\\1 \\2", tokenisedText) tokenisedText <- gsub("(-ai[rḥś])([a-z]|[āīūṛḷṇḍñṅḥśṣṭḍṃ])", "\\1 \\2", tokenisedText) tokenisedText <- gsub("(-asya)([a-z]|[āīūṛḷṇḍñṅḥśṣṭḍṃ])", "\\1 \\2", tokenisedText) tokenisedText <- gsub("(-[oei][ḥr])([a-z]|[āīūṛḷṇḍñṅḥśṣṭḍṃ])", "\\1 \\2", tokenisedText) tokenisedText <- gsub("-er (an|uḥ)", "-er\\1", tokenisedText) tokenisedText <- gsub("(-āni)([a-z]|[āīūṛḷṇḍñṅḥśṣṭḍṃ])", "\\1 \\2", tokenisedText) tokenisedText <- gsub("IṢYAT-([ieyu])", "-iṣyat\\1 \\2", tokenisedText) tokenisedText <- gsub("([^Ā])YAT-([iyev])(an?-)", "\\1-yat\\2 \\3", tokenisedText) tokenisedText <- gsub("AN-(t[iyv])(an?-)", "-an\\1 \\2", tokenisedText) #tokenisedText <- gsub("-(K([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ]){1,6}CI[TD])-", " \\L\\1 ", tokenisedText, perl=TRUE) tokenisedText <- gsub("IṢY-([āa]n?[mst][iy])", "-iṣy\\1", tokenisedText) tokenisedText <- gsub("IṢ-y([āa]n?[mst][iy])", "-iṣy\\1", tokenisedText) tokenisedText <- gsub("([yt]ac)ca", "\\1 ca", tokenisedText) tokenisedText <- gsub("eṣ(an?t[iy])", "-EṢ-\\1", tokenisedText) tokenisedText <- gsub("idañca", " -IDAÑ- ca ", tokenisedText) tokenisedText <- gsub("idañ ca", " -IDAÑ- ca ", tokenisedText) # tokenisedText <- gsub("-ād ([iy]) ", "-ād\\1 ", tokenisedText, perl=TRUE) ##tokenisedText <- gsub("(EVA[MṂ]?|SA[TDC]|AIVA?|[AĪ]YA[MṂ]|E?TA[TD]|[EI]DA[MṂ]|PUNA[SRḤ]|[AĀ]HA[MṂ]|KI[MṂ]|SMA|AP[IY]|[IĪ]HA|NI[RḤŚṢ]|[YT]ĀVA[TD]|KHALV|KATHA[MṂ]|[YT]ATHĀ|ASMI[MNṂ][MNṂ]?|[KYT]?ASMĀ[TD]|[KYT]?ASMAI)-([a-z]|[āīūṛḷṇḍñṅḥśṣṭḍṃ])","\\1 \\2", tokenisedText) # the following transformation attempts to standardise the stemming of passives and causatives. 
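# Illustrative traces of the rule below (hypothetical token shapes): a present
# passive such as "-UCY-ate " is restemmed as "-UC-yate ", and a causative such
# as "-KĀRAY-ati " as "-KĀR-ayati "; the optional "(A)" group is what keeps the
# -aya- of causatives out of the stem.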
tokenisedText <- gsub("(A)?Y-([aā]m[yi][ |-]|[aā]s[ie][ |-]|[aā]n?t[iyeu][ |-]|ann?[ |-]|et[ea][ |-]|[aā]mān[auāo][ |-]|.yu[ḥr][ |-]|[īe]ran[ |-]|.mas[ |-]|.[vm]ah[ei][ |-]|.dhve[ |-]|.dhva[mṃ][ |-])", "-\\L\\1y\\2", tokenisedText, perl=TRUE)
tokenisedText <- gsub("-((?!J)[A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])-", "\\L\\1", tokenisedText, perl=TRUE)
tokenisedText <- gsub(' % ', '"', tokenisedText)
tokenisedText <- gsub('\\^\\^', '\\^', tokenisedText)
tokenisedText <- gsub("([a-z]|[āīūṛḷṇḍñṅḥśṣṭḍṃ])([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "\\1-\\2", tokenisedText)
tokenisedText <- gsub("([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])([a-z]|[āīūṛḷṇḍñṅḥśṣṭḍṃ])", "\\1-\\2", tokenisedText)
tokenisedText <- gsub("([a-z]|[āīūṛḷṇḍñṅḥśṣṭḍṃ])([a-z]|[āīūṛḷṇḍñṅḥśṣṭḍṃ])", "\\1\\2", tokenisedText)
tokenisedText <- gsub(" ([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "-\\1", tokenisedText)
tokenisedText <- gsub("([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ]) ", "\\1-", tokenisedText)
tokenisedText <- gsub("([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])-Ā", "\\1--Ā", tokenisedText, perl=TRUE)
tokenisedText <- gsub(" -([a-z]|[āīūṛḷṇḍñṅḥśṣṭḍṃ])", " \\1", tokenisedText)
tokenisedText <- gsub("---", "--", tokenisedText)
tokenisedText <- gsub(" --", " -", tokenisedText)
tokenisedText <- gsub("-- ", "- ", tokenisedText)
tokenisedText <- gsub("-- ", "- ", tokenisedText)
tokenisedText <- gsub("-- ", "- ", tokenisedText)
tokenisedText <- gsub(" ([,|\\.|:|;|!|\\?|\\[|\\]|\\(|\\)]) ", "\\1 ", tokenisedText)
tokenisedText <- gsub("--([a-z]|[āīūṛḷṇḍñṅḥśṣṭḍṃ])", "-\\1", tokenisedText)
tokenisedText <- gsub('-=', ' =', tokenisedText)
tokenisedText <- gsub('- =', ' =', tokenisedText)
tokenisedText <- gsub(' ([āaiouūī][ḥśsrṃm]?) ', '\\1 ', tokenisedText)
tokenisedText <- gsub(' aḥ', 'aḥ', tokenisedText)
tokenisedText <- gsub(' o', 'o', tokenisedText)
tokenisedText <- gsub(' añ c', ' -AÑC-', tokenisedText)
tokenisedText <- gsub(" "," ", tokenisedText)
tokenisedText <- gsub("(<.*?>)", "\\L\\1", tokenisedText, perl=TRUE)
tokenisedText <- gsub("doc-genre", "doc genre", tokenisedText)
tokenisedText <- gsub("page-number", "page number", tokenisedText)
write(tokenisedText, file= paste0(TextName,"_Final.txt"))
return(tokenisedText)
}
TransformForGramrelsAndHumans <- function(SegmentedText,TextName){
#TextName <- deparse(substitute(SegmentedText))
TextResegmented <- gsub('%', '"', SegmentedText)
TextResegmented <- gsub(" "," ", TextResegmented)
TextResegmented <- gsub("(<.*?>)", "\\L\\1", TextResegmented, perl=TRUE)
write(TextResegmented, file= paste0(TextName,"_withXML.txt"))
TextforGramrels <- SegmentedText
TextforGramrels <- gsub("-(en|[aā]sy)ā-","-\\1a^ a-", TextforGramrels)
TextforGramrels <- gsub("-(en|[aā]sy)-O","-\\1a^ -U", TextforGramrels)
TextforGramrels <- gsub("-([aāuūiī]m|[ie]r|.?nām|eṣām|[āū]?ny|av|.?yām|āsv|[ieu]ṣv|.?yor|.?bhir|.?bhyām)a-", "-\\1 a-",TextforGramrels)
TextforGramrels <-
gsub("(-[YKT]ASMĀ[TD]|-EVA[MṂÑ]?|-[SṢ]A[TDC]|-AIVA?|-PUNA[SḤR]|-E?TA[TDNM]|-AITA[TDN]|-[AĀ]NTAR|-[ṢS]A[DḌT]|-SVA|-ĀYU[ḤSṢR]|-[IĪE]DA[MṂÑ]|-V?AYA[MṂÑ]|-E[VD]A[MṂÑ]?|-AIVA[MṂÑ]]|-[AĀ]HA[MṂÑ]|-K..?.?.?.?.?CI[TDN]|-KHAL[UV]|-IM[AĀ]M[MṂÑ]|-[IAĀ]YA[MṂÑ]|-AVOCA[TD]|-VĀ[KṄG]|-[TKY]?ASMI[ṂMN]N?|-[YKT]ATHĀ|-KATHA[MṂÑ]|-YATHĀVA[TDCN]|-ŚR[IĪ]|-JAGAT|-CATU[RḤŚSṢ]|-E?[YT]ĀVA[DTCN]|-PṚTHA[KG]|-PRĀ[GK]|-[AĀ]P[IY]|-DṚG|-SĀKṢĀ[TDN]|-PAŚCĀ[TDN]|-MAHA[TD]|-ARHA[TN]|-PARṢAD|-SA[MṂ]PA[TD]|-DHI[KG]|-BH[UŪ][TD]|-EVA[MṂ]?|-SA[DC]|-AIVA?|-[AĪ]YA[MṂ]|-E?TA[TD]|-[EI]DA[MṂ]|-PUNA[SRḤ]|-[AĀ]HA[MṂ]|-KI[MṂ]|-SMA|-AP[IY]|-[IĪ]HA|-NI[RḤŚṢ]|-KHALV|-KATHA[MṂ]|-[YT]ATHĀ|-[KYT]?ASMI[MNṂ][MNṂ]?|-[KYT]?ASMĀ[TD]|-[KYT]?ASMAI|-PṚTHA[KG]|-DṚG)-([a-z]|[āīūṛḷṇḍñṅḥśṣṭḍṃ])","\\1 \\2", TextforGramrels) TextforGramrels <- gsub("-TAT ([ou]|a[sśḥ])"," tat\\1 ", TextforGramrels) TextforGramrels <- gsub("-([ST])AD ([āa]) "," \\L\\1ad\\2 ", TextforGramrels, perl=TRUE) TextforGramrels <- gsub("-NIŚ (ā[mṃ]?|ās[uv]|āyā[mṃḥsś]|ayā|āyai)","-NIŚ -\\1", TextforGramrels) #TextforGramrels <- gsub("(<.*?>)", "\\L\\1", TextforGramrels, perl=TRUE) #TextforGramrels <- gsub("([-| ])([sd][uū][ḥsr]?)-([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "\\1\\2 @ \\3", TextforGramrels) #TextforGramrels <- gsub("([-| ])([ni][ḥsṣr])-([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "\\1\\2 @ \\3", TextforGramrels) #TextforGramrels <- gsub("([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])-(.)?-([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "\\1 -\\2 @ \\3", TextforGramrels) # for compounds where two words are separated by single letter: -BUDDH-a-DHARMA-a TextforGramrels <- gsub("([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])-([aiāīgkṅyo])-([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "\\1 -\\2 @ \\3", TextforGramrels) # for compounds where two words are separated by single letter: -BUDDH-a-DHARMA-a TextforGramrels <- gsub("([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])-(ika|aka|ya|[īie]ta|a?[gkñk])?-([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "\\1 -\\2 @ \\3", TextforGramrels) # for compounds with multiple letters intervening between the two compounded lemmas TextforGramrels <- gsub(" (an?)-([A-Z]|[A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", " \\1 @ \\2", TextforGramrels) # alpha/an privativum are annotated as in a compound with following lemma TextforGramrels <- gsub("([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])--([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "\\1 @ \\2", TextforGramrels) # compounds with no intervening lowercase : -SATTV--ĀŚAY-a TextforGramrels <- gsub("([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])-([a-z]|[āīūṛḷṇḍñṅḥśṣṭḍṃ])", "\\1 -\\2", TextforGramrels) # lemma + morphological ending TextforGramrels <- gsub("([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])-", "\\1 ", TextforGramrels) TextforGramrels <- gsub("-([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", " \\1", TextforGramrels) TextforGramrels <- gsub("([AĪ]YA[MṂ]|ETA[TD]|[EI]DA[MṂ]|[AĀ]HA[MṂ]|KI[MṂ]|SMA|AP[IY]|[IĪ]HA|NI[RḤŚṢ]|[YT]ĀVA[TD]|KHALV|KATHA[MṂ]|[YT]ATHĀ|ASMI[MNṂ][MNṂ]?|[KYT]?ASMĀ[TD]|[KYT]?ASMAI|[YKT]ASMĀ[TD]|EVA[MṂÑ]?|[SṢ]A[TDC]|AIVA?|PUNA[SḤR]|E?TA[TDNM]|AITA[TDN]|[AĀ]NTAR|[ṢS]A[DḌT]|SVA|ĀYU[ḤSṢR]|[IĪE]DA[MṂÑ]|V?AYA[MṂÑ]|E[VD]A[MṂÑ]?|AIVA[MṂÑ]]|[AĀ]HA[MṂÑ]|K..?.?.?.?.?CI[TDN]|KHAL[UV]|IM[AĀ]M[MṂÑ]|[IAĀ]YA[MṂÑ]|AVOCA[TD]|VĀ[KṄG]|[TKY]?ASMI[ṂMN]N?|[YKT]ATHĀ|KATHA[MṂÑ]|YATHĀVA[TDCN]|ŚR[IĪ]|JAGAT|CATU[RḤŚSṢ]|E?[YT]ĀVA[DTCN]|PṚTHA[KG]|PRĀ[GK]|[AĀ]P[IY]|DṚG|SĀKṢĀ[TDN]|PAŚCĀ[TDN]|MAHA[TD]|ARHA[TN]|PARṢAD|SA[MṂ]PA[TD]|DHI[KG]|BH[UŪ][TD]|EVA[MṂ]?|SA[DC]|E?TA[TD]|PUNA[SRḤ]|[KYT]?ASMI[MNṂ][MNṂ]?) 
- @","\\1 ", TextforGramrels) TextforGramrels <- gsub("([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ]) -(du[ḥśsṣ]|su) ([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])","\\1 @ \\2 @ \\3", TextforGramrels) TextforGramrels <- gsub("-(.?)(du[ḥśsṣ]) ([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])","-\\1 @ \\2 \\3", TextforGramrels) TextforGramrels <- gsub("(du[ḥśsṣ]|su) ([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])","\\1 @ \\2", TextforGramrels) TextforGramrels <- gsub(" "," ", TextforGramrels) TextforGramrels <- gsub('%','"', TextforGramrels) TextforGramrels <- gsub("(<.*?>)", "\\L\\1", TextforGramrels, ignore.case=FALSE, perl=TRUE) write(TextforGramrels, file= paste0(TextName,"_ForGramrels.txt")) TextForUniversalTags <- TextforGramrels TextForUniversalTags <- gsub("@", "", TextForUniversalTags) TextForUniversalTags <- gsub("(<.*?>)", "\\U\\1", TextforGramrels, perl=TRUE) TextForUniversalTags <- gsub("-?([a-z]|[āīūṛḷṇḍñṅḥśṣṭḍṃ])", "", TextForUniversalTags) #TextForUniversalTags <- gsub("([a-z]|[āīūṛḷṇḍñṅḥśṣṭḍṃ])", "", TextForUniversalTags) TextForUniversalTags <- gsub("\\s+", " ", TextForUniversalTags) TextForUniversalTags <- gsub("(<.*?>)", "\\L\\1", TextforGramrels, perl=TRUE) write(TextForUniversalTags, file= paste0(TextName,"_ForUniversalTags.txt")) HumanReadabletext <- TextforGramrels #print(TextforGramrels) HumanReadabletext <- gsub("\\s+-([a-z]|[āīūṛḷṇḍñṅḥśṣṭḍṃ])", "\\1", HumanReadabletext) HumanReadabletext <- gsub(" ([ncvs]) ([Ā|O|E|AI|AU])", " \\1\\^\\2", HumanReadabletext) HumanReadabletext <- gsub(" ([tns]) ([Ū|O])", " \\1\\^\\2", HumanReadabletext) HumanReadabletext <- gsub(" - @ ([Ā|O|E|AI|AU])", "\\^\\1", HumanReadabletext) HumanReadabletext <- gsub(" - @ ", "-", HumanReadabletext) HumanReadabletext <- gsub(" @ ", "-", HumanReadabletext) HumanReadabletext <- tolower(HumanReadabletext) write(HumanReadabletext, file= paste0(TextName,"_ForHumans.txt")) TextForEvaluation <- HumanReadabletext TextForEvaluation <- gsub("-", " - ", TextForEvaluation) TextForEvaluation <- gsub("\\^", " \\^ ", TextForEvaluation) write(TextForEvaluation, file= paste0(TextName,"_ForEvaluation.txt")) return(TextforGramrels) } #### functions for resegmentation DataFrameTokens_CAPS <- function(Cleaned_Segmented_Text,TextName){ #TextName <- deparse(substitute(Cleaned_Segmented_Text)) Text_TokensDF <- as.data.frame(table(unlist(str_extract_all(Cleaned_Segmented_Text,"-([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])+-")))) #write.table(Text_TokensDF,file= paste(TextName,"TokensDF.csv", sep=""), quote=F,sep=",",row.names=F) return(Text_TokensDF) } PseudoTokenizer1 <- function(StreamlidedNgrams_WithCorpusFreq, NgrammedText){ #StreamlidedNgrams_WithCorpusFreq is a DF with the ngrams+freq of the text to be tokenised # + the freq of those ngrams in the wider context of the text to be tokenised # (e.g if you tokenise a paragraph it will be the chapter it comes from) StreamlidedNgrams_WithCorpusFreq <- StreamlidedNgrams_WithCorpusFreq[order(nchar(as.character(StreamlidedNgrams_WithCorpusFreq$ngrams)),decreasing = TRUE), ] #print(StreamlidedNgrams_WithCorpusFreq) StreamlidedNgrams_WithCorpusFreq$nchar <- nchar(as.character(StreamlidedNgrams_WithCorpusFreq$ngrams)) # prioritises longer ngrams StreamlidedNgrams_WithCorpusFreq$nchar <- factor(StreamlidedNgrams_WithCorpusFreq$nchar) for (i in levels(StreamlidedNgrams_WithCorpusFreq$nchar)){ # FOR ngrams of equal length SameNcharDF <- StreamlidedNgrams_WithCorpusFreq[StreamlidedNgrams_WithCorpusFreq$nchar== i ,] SameNcharDF <- SameNcharDF[order(nchar(as.character(SameNcharDF$ngrams)),decreasing = TRUE), ] if 
(nrow(StreamlidedNgrams_WithCorpusFreq[StreamlidedNgrams_WithCorpusFreq$nchar== i ,]) >1){ # equal nchar and differen freq => prioritise higher freq OrderedSameNcharDF <- SameNcharDF[order(SameNcharDF$WeightedFreq, decreasing = TRUE),] SameNcharDF <- OrderedSameNcharDF #print("First For Loop: in this iteration i and SameNchar are: ") #print(i) #print(SameNcharDF) } for (f in SameNcharDF$WeightedFreq){ #for ngrams of eqaul length AND equal freq in the text to be tokenized SameNcharDF <- SameNcharDF[order(nchar(as.character(SameNcharDF$ngrams)),decreasing = TRUE), ] if (nrow(SameNcharDF[SameNcharDF$WeightedFreq == f ,]) >1) { OrderedSameNcharDF <- SameNcharDF[order(SameNcharDF$WeightedFreq2, decreasing = TRUE),] SameNcharDF <- OrderedSameNcharDF } } StreamlidedNgrams_WithCorpusFreq <- rbind(SameNcharDF, StreamlidedNgrams_WithCorpusFreq) StreamlidedNgrams_WithCorpusFreq <- unique(StreamlidedNgrams_WithCorpusFreq) #print(SameNcharDF) } #print("** these are the Ngrams used for tokenizing the text **") #print(StreamlidedNgrams_WithCorpusFreq) #StreamlidedNgrams_WithCorpusFreq <- StreamlidedNgrams_WithCorpusFreq[StreamlidedNgrams_WithCorpusFreq$contextFreq > 0,] #Text <- NgrammedText Text <- gsub("(<.*?>)", "\\U\\1", NgrammedText, ignore.case=FALSE, perl=TRUE) for (i in StreamlidedNgrams_WithCorpusFreq$ngrams){ TokenisedText <- str_replace_all(Text, i, paste0("-" ,str_to_upper(i), "-")) #print(i) #print(TokenisedText) # TokenisedText <- str_replace_all(Text, i, paste0("-",i,"-")) Text <- TokenisedText #print(Text) } ## TextName = deparse(substitute(NgrammedText)) #write(Text, file= paste0(TextName,"_Tokenized1.txt")) # # system("say Segmented") return(Text) } ## USE THIS Resegment_Initial_Flagged <- function(StreamlidedNgrams_WithCorpusFreq, Text_Tokenized_Cleaned,TextName){ #TextName = deparse(substitute(Text_Tokenized_Cleaned)) InterSpacesFlagger_Initial <- function(TokenizedText){ FlagSingleLowercase1 <- unlist(regmatches(TokenizedText, gregexpr("[\\n|\\s](?!a)([a-z|āīūṛḷṇḍñṅḥśṣṭḍṃ])-([A-Z|ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])+--?([A-Z|ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])*?-?([a-z|āīūṛḷṇḍñṅḥśṣṭḍṃ])*?[-| |\\||/|,|;|:|\\.]", TokenizedText, perl=TRUE))) FlagLowercase1 <- unlist(regmatches(TokenizedText, gregexpr("[\\n|\\s]((?!ya[td]|[yt]en[āa]?|ast[iyu]|an|it[iy]|syā[dt]|asm[iy]|h[iy]|[ts][uv]|[cvstmn]a|[kyt]?asya?|[kyt]eṣā[mṃñ]|so|[ty]ān[iy]|[yt]athā?|ni[ṣḥrś]|ih|ev|ek|iya[ṃṃ]|.?ebhi[rḥś]|[ykt]eṣā[mṃñ]|ki[mṃñ]|tatr|eva?[mṃ]?|[kty]asma[iy]|[kty]asmā.?|[kty]asmi[nmṃ][nmṃ]?|[tky]ena?|c[ie][dtj])[a-z|āīūṛḷṇḍñṅḥśṣṭḍṃ]){2,7}-([A-Z|ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])+--?([A-Z|ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])*?-?([a-z|āīūṛḷṇḍñṅḥśṣṭḍṃ])*?[-| |\\||/|,|;|:|\\.]", TokenizedText, perl=TRUE))) LowercaseInterspace<- unlist(regmatches(TokenizedText, gregexpr("[\\n|\\s]((?![tky]?asya|[tky]?eṣā[mṃ]?|[tky]ena|[tky]eṣ[uv]|n?[āa]sau|imau|imān[iy]|etā[mṃ]|atha|[ty]athā|[tys]adā|syā[td]|as[mt][iuy]|^|yad[iy]|sace[td]|[ia]tas|yad[iy]|cāsya|tāmeva|yacca)[a-z|āīūṛḷṇḍñṅḥśṣṭḍṃ]){4,10}[ |\\||/|,|;|:|\\.]", TokenizedText, perl=TRUE))) ExcludeFlagSingleLowercase1 <- FlagSingleLowercase1[grep("[cns]-[Ā|A|O|E|AI|AU]|v-[Ā|O|E|AI|AU]|t-[UŪ]|h-[ĪE]|h-[YĪ]|e-TASM", FlagSingleLowercase1)] FlagSingleLowercase1 <- FlagSingleLowercase1[!FlagSingleLowercase1 %in% ExcludeFlagSingleLowercase1] AllFLagged <- c(FlagSingleLowercase1,FlagLowercase1,LowercaseInterspace) # TextName <- deparse(substitute(TokenizedText)) #write.table(as.data.frame(AllFLagged),file= paste(TextName, "Flagged.csv", sep=""), quote=F, sep=",", row.names=F) ## system("say flagged") return(AllFLagged) } # slice text in small chunks for 
faster processing ChunkText <- function(x, chunk_size = 100, doc_id = names(x), ...) { chunk_individual_text <- function(x, chunk_size, doc_id, ...) { stopifnot(is.character(x), length(x) == 1) words <- tokenize_regex(x, pattern = "\\s+", simplify = TRUE) if(length(words) <= chunk_size) { chunks <- x } chunks <- split(words, ceiling(seq_along(words)/chunk_size)) if (!is.null(doc_id)) { num_chars <- stringi::stri_length(length(chunks)) chunk_ids <- stringi::stri_pad_left(seq(length(chunks)), width = num_chars, pad = "0") names(chunks) <- stringi::stri_c(doc_id, chunk_ids, sep = "-") } else { names(chunks) <- NULL } out <- lapply(chunks, stringi::stri_c, collapse = " ") out } # check_input(x) stopifnot(chunk_size > 1) if (is.character(x) && length(x) == 1) { out <- chunk_individual_text(x = x, chunk_size = chunk_size, doc_id = doc_id, ...) } else { out <- lapply(seq_along(x), function(i) { chunk_individual_text(x = x[[i]], chunk_size = chunk_size, doc_id = doc_id[[i]], ...) }) out <- unlist(out, recursive = FALSE, use.names = TRUE) } out } chunkedTextList <- ChunkText(Text_Tokenized_Cleaned, chunk_size = 200, lowercase = FALSE, strip_punct = FALSE) chunkedText <- unlist(chunkedTextList) # gather flagged tokens from each chunk AllFlagged <- c() for (chunk in chunkedText){ # extract space-separated strings and apply flagger ot each of them SpaceTokenised <- tokenize_regex(chunk, pattern="\\s+") #SentenceFlagged <- c() for (i in unlist(SpaceTokenised)){ iSpaced <- paste0(" ", i, " ") Flagged <- InterSpacesFlagger_Initial(iSpaced) if (length(Flagged) > 0 ){ FlaggedTokens <- Flagged[order(nchar(as.character(Flagged)),decreasing = TRUE)] for (i in FlaggedTokens){ iLower <- tolower(i) iNoDash <- gsub("[ |-]", "",iLower) #print(iNoDash) if (length(grep("^[cnvs][-|āeo|ai|au]", iNoDash)==1)==FALSE){ #print(iNoDash) Start_iNoDash <- unlist(regmatches(iNoDash, gregexpr("^...",iNoDash))) MatchesDF <- StreamlidedNgrams_WithCorpusFreq[grep(paste0("^",Start_iNoDash), StreamlidedNgrams_WithCorpusFreq$ngrams),] #OrderMatches <- MatchesDF[order(MatchesDF$CleanFreq, decreasing = TRUE),] OrderMatches <- MatchesDF[order(nchar(as.character(MatchesDF$ngrams)), decreasing = TRUE),] #print(OrderMatches) possibleMatches <- as.character(OrderMatches$ngrams) for (p in possibleMatches){ if (length(grep(p, paste0("^",iNoDash)))==1){ iNoDash <- str_replace(iNoDash, p, paste0("-" ,str_to_upper(p), "-")) ResegmentediNoDash <- gsub(iNoDash, paste0(" ", iNoDash, " "), iNoDash) if(length(InterSpacesFlagger_Initial(iNoDash))==0){ # print(paste0("this is iNoDash: ", iNoDash)) # print(paste0("ResegmentediNoDash: ", ResegmentediNoDash)) if(length(SlimFlagger(ResegmentediNoDash)) > 0){ ResegString <- PseudoTokenizer1(StreamlidedNgrams_WithCorpusFreq, ResegmentediNoDash) # print(ResegString) Text_Tokenized_Cleaned <- gsub(i,ResegString, Text_Tokenized_Cleaned) } else{ Text_Tokenized_Cleaned <- gsub(i,ResegmentediNoDash, Text_Tokenized_Cleaned) } } } } } } } } #chunk <- UnsegmentFlagged(SentenceFlagged, chunk) } # write(Text_Tokenized_Cleaned, file=paste0(TextName, "_ReSegInit.txt")) return(Text_Tokenized_Cleaned) } Flagger <- function(TokenizedText){ TokenizedText <- str_remove_all(TokenizedText,"[,|\\.|:|;|!|\\?|\\[|\\]|\\(|\\)|/]") FlagSingleLowercase1 <- unlist(regmatches(TokenizedText, gregexpr("[\\n|\\s](?!a)([a-z|āīūṛḷṇḍñṅḥśṣṭḍṃ])-([A-Z|ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])+--?([A-Z|ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])*?-?([a-z|āīūṛḷṇḍñṅḥśṣṭḍṃ])*?[ |\\||/|,|;|:|\\.]", TokenizedText, perl=TRUE))) FlagSingleLowercase2 <- unlist(regmatches(TokenizedText, 
gregexpr("-([A-Z|ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])+-((?![aāiīyeuov])[a-z|āīūṛḷṇḍñṅḥśṣṭḍṃ])-([A-Z|ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])+-([a-z|āīūṛḷṇḍñṅḥśṣṭḍṃ])*?[ |\\||/|,|;|:|\\.]", TokenizedText, perl=TRUE))) # FlagDoubleLowercase2 <- unlist(regmatches(TokenizedText, gregexpr("[\\n|\\s](?![csn]a|[cvn]ā|an|[ts]u|h[iy])([a-z|āīūṛḷṇḍñṅḥśṣṭḍṃ])([a-z|āīūṛḷṇḍñṅḥśṣṭḍṃ])-([A-Z|ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])+--?([A-Z|ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])*?-?([a-z|āīūṛḷṇḍñṅḥśṣṭḍṃ])*?[ |\\||/|,|;|:|\\.]", TokenizedText, perl=TRUE))) FlagDoubleLowercase1 <- unlist(regmatches(TokenizedText, gregexpr("-((?![csn]a|[cvn]ā|an|[ts]u|h[iy]|[aāiīeouū][ḥrśs]|[aāiīuū][mṃnñ]|[āe][td]|y[aā])[a-z|āīūṛḷṇḍñṅḥśṣṭḍṃ])([a-z|āīūṛḷṇḍñṅḥśṣṭḍṃ])-([A-Z|ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])+--?([A-Z|ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])*?-?([a-z|āīūṛḷṇḍñṅḥśṣṭḍṃ])*?[ |\\||/|,|;|:|\\.]", TokenizedText, perl=TRUE))) FlagDoubleLowercase3 <-unlist(regmatches(TokenizedText, gregexpr("-((?![csn]a|[cvn]ā|an|[ts]u|h[iy]|[aāiīeouū][ḥrśs]|[aāiīuū][mṃnñ]|[āe][td]|y[aā])[a-z|āīūṛḷṇḍñṅḥśṣṭḍṃ])([a-z|āīūṛḷṇḍñṅḥśṣṭḍṃ])-([A-Z|ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])+--?([a-z|āīūṛḷṇḍñṅḥśṣṭḍṃ])-?([A-Z|ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])*?-?([a-z|āīūṛḷṇḍñṅḥśṣṭḍṃ])*?[ |\\||/|,|;|:|\\.]", TokenizedText, perl=TRUE))) FlagLowercase1 <- unlist(regmatches(TokenizedText, gregexpr("[\\n|\\s]([a-z|āīūṛḷṇḍñṅḥśṣṭḍṃ]){3,7}-([A-Z|ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])+--?([A-Z|ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])*?-?([a-z|āīūṛḷṇḍñṅḥśṣṭḍṃ])*?[ |\\||/|,|;|:|\\.]", TokenizedText, perl=TRUE))) FlagLowercase2 <- unlist(regmatches(TokenizedText, gregexpr("-([A-Z|ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])+-((?!ai[rḥśs]|anta[mṃḥsś]|nān|[ūā][nṇ][iy]|[eiāū][sṣ][uv]|[īeāi]t[iye]|syā[dt]|.?asm[iy].?.?|[ty]ān[iy]|[yt]athā?|ni[ṣḥrś]|eva?|(? 1) if (is.character(x) && length(x) == 1) { out <- chunk_individual_text(x = x, chunk_size = chunk_size, doc_id = doc_id, ...) } else { out <- lapply(seq_along(x), function(i) { chunk_individual_text(x = x[[i]], chunk_size = chunk_size, doc_id = doc_id[[i]], ...) 
}) out <- unlist(out, recursive = FALSE, use.names = TRUE) } out } chunkedTextList <- ChunkText(Text_Tokenized_Cleaned, chunk_size = 200, lowercase = FALSE, strip_punct = FALSE) chunkedText <- unlist(chunkedTextList) # gather flagged tokens from each chunk AllFlagged <- list() Text_Tokenized_Cleaned <- gsub("(-āj-J|-āl-L|-āc-C)", "\\U\\1", Text_Tokenized_Cleaned, perl=TRUE) # this avoids flagging sandhi abl for (chunk in chunkedText){ # extract space-separated strings and apply flagger ot each of them SpaceTokenised <- tokenize_regex(chunk, pattern="\\s+") #SentenceFlagged <- c() #print(paste0("this is SpaceTokenised: ", SpaceTokenised)) for (i in unlist(SpaceTokenised)){ # print(paste0("this i in Spacetokenised: ", i)) i_Spaced <- paste0(" ", i, " ") Flagged <- SlimFlagger(i_Spaced) if (length(Flagged) > 0 ){ #if (length(Flagged) > 1) { # FlaggedByNchar <- Flagged[order(nchar(Flagged), decreasing = TRUE)] #Flagged <- FlaggedByNchar[1] #} AllFlagged[[i]] <- Flagged } } } UniqueFlags <- unique(AllFlagged) return(UniqueFlags) } FlagReducer <- function(FlagList){ ReducedFlags <- c() for (z in FlagList){ # print(z) # print(paste0("This is length(z): ", length(z))) if(length(z)>1){ zVec <- unlist(z) # print(zVec) zVec <- str_remove_all(zVec, " ") #zVec<- str_remove_all(zVec, "-") zVec <- unique(zVec) # print("this is unique zVec") # print(zVec) # transformation to avoid substrings & duplicated items that only differ by non-alphanumeric qVec<- str_remove_all(zVec, "-") qVec <- paste(qVec[1], qVec[seq_along(qVec)]) qVec <- qVec[-1] # print(paste0("This is zVec : " , zVec)) # print(paste0("This is qVec : " , qVec)) # # if (length(z) > 1){ for(i in zVec){ a <- str_remove_all(i, " ") a <- str_remove_all(a, "-") #print(paste0("this is a: ", a)) FlagGrepped <- unlist(regmatches(qVec, gregexpr(a, qVec))) # print(FlagGrepped) # print(length(FlagGrepped)) if(length(FlagGrepped) < 2){ ReducedFlags[[i]] <- i } } }else{ for(i in z){ ReducedFlags[[i]] <- i } } } return(ReducedFlags) } GetFlagFreq <- function(FlaggedList, TOKENS_DF, Text_Tokenized_Cleaned,TextName){ #TextName <- deparse(substitute(Text_Tokenized_Cleaned)) FlagTokensRedux <- c() for (z in FlaggedList){ # print(paste0("This is length(z): ", length(z))) zVec <- unlist(z) zVec <- paste(zVec) #print(paste0("This is length(zVec): ", length(zVec))) #if (length(z) > 1){ for(i in zVec){ #print(i) Tokens <- str_extract_all(i,"-([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])+-") FlagTok <- unique(Tokens) FlagTokensRedux[[i]] <- FlagTok FlagTokensRedux <- unique(paste(unlist(FlagTokensRedux))) } } #FlaggedVec <- unique(FlagTokensRedux) # FlaggedVec <- unlist(FlagTokensRedux) # FlaggedVec <-paste(FlaggedVec) FlagPerToken <- as.data.frame(table(unlist(regmatches(FlagTokensRedux, gregexpr("-([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])+-", FlagTokensRedux))))) # if (nrow(FlagPerToken)>0 & length(FlagPerToken)==2){ colnames(FlagPerToken) <- c("Var1", "FlaggedFreq") FlagPerToken$Var1 <- str_remove_all(FlagPerToken$Var1,"-") TOKENS_DF$Var1 <- str_remove_all(TOKENS_DF$Var1,"-") #print(FlagPerToken) #TOKENS_DF$Var1 <- gsub("-","",TOKENS_DF$Var1) # print("this is TOKENS_DF") # print(head(TOKENS_DF)) # print("this is tail TOKENS_DF") # print(tail(TOKENS_DF)) # print("this is FlagPerToken") # print(head(FlagPerToken)) # print("head TOKENS_DF:") # head(TOKENS_DF) # print("head FlagPerToken:") # head(FlagPerToken) TOKENS_DF_WithFLagFreq <- full_join(TOKENS_DF, FlagPerToken, by = "Var1") TOKENS_DF_WithFLagFreq[is.na(TOKENS_DF_WithFLagFreq)] <- 0 TOKENS_DF_WithFLagFreq <- mutate(TOKENS_DF_WithFLagFreq, 
CleanFreq = Freq-FlaggedFreq)
#TOKENS_DF_WithFLagFreq$Var1 <- tolower(TOKENS_DF_WithFLagFreq$Var1)
#print(tail(TOKENS_DF_WithFLagFreq))
write.table(as.data.frame(TOKENS_DF_WithFLagFreq),file= paste(TextName,"TokensDFWithCleanFreq.csv", sep=""), quote=F,sep=",",row.names=F)
# system("say Process completed")
return(TOKENS_DF_WithFLagFreq)
}
# else{
# CleanedText <- FinalCleaner(Text_Tokenized_Cleaned,TextName)
# print("Could NOT resegment. An earlier version of the segmentation has been cleaned and written to file")
# }
#}
# takes as first arg the output of SlimFlagger
UnsegmentFlags <- function(FlaggedVec){
FlaggedVec <- gsub("-", "",FlaggedVec)
FlaggedVec <- gsub("([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])", "\\L\\1",FlaggedVec,perl=TRUE)
return(FlaggedVec)
}
Resegmenter <- function(ToResegmentDF, Text_Tokenized_Cleaned, TextName){
#TextName = deparse(substitute(Text_Tokenized_Cleaned))
ResegmentedText <- Text_Tokenized_Cleaned
row=1
while (row <= nrow(ToResegmentDF)){
# print("ToResegmentDF[row,c(2)]")
# print(paste0("START",ToResegmentDF[row,c(2)], "END"))
ResegmentedText <- gsub(ToResegmentDF[row,4], ToResegmentDF[row,2], ResegmentedText)
row=row+1
}
# print(ResegmentedText)
# write(ResegmentedText, file= paste0(TextName,"_Resegmented.txt"))
return(ResegmentedText)
}
Resolver <- function(ResegmentedVec){
Resolved <-c()
StillFlagged <-c()
#print(paste0("this is ResegmentedVec", ResegmentedVec, "from Resolver"))
for (i in ResegmentedVec){
# print(paste0("this is ", i, "from Resolver"))
i <- gsub(i, paste0(" ", i, " "), i)
f <- SlimFlagger(i)
#print(paste0("this is i: ",i, " is it not flagged? ", length(f)<1))
#print(length(f)<1)
if(length(f)<1){
Resolved[i]<- i
}else{
StillFlagged[i]<- i
# print(paste0(i, "put in StillFlagged"))
}
}
if (length(Resolved)>0){
ResLow <- tolower(Resolved)
ResLow <- gsub("-","", ResLow)
ResDF <- data.frame(c(rep("Resolved", length(Resolved))), Resolved)
ResDF <- data.frame(ResDF, ResLow)
colnames(ResDF) <- c("Status", "Resegmented", "Unsegmented")
}
if (length(StillFlagged)>0){
#print(paste0("this is the length of StillFlagged after resolving: ",length(StillFlagged) ))
StilFlagLow <- tolower(StillFlagged)
print(StilFlagLow)
StilFlagLow <- gsub("-","", StilFlagLow)
FlagDF <- data.frame(c(rep("StillFlagged", length(StillFlagged))), StillFlagged)
FlagDF <- data.frame(FlagDF, StilFlagLow)
colnames(FlagDF) <- c("Status", "Resegmented", "Unsegmented")
}
if (length(Resolved)>0 & length(StillFlagged)>0){
ResegmentationDF <- rbind(ResDF, FlagDF)
return(ResegmentationDF)
}else{
if (length(Resolved)>0){
ResegmentationDF <- ResDF
return(ResegmentationDF)
}else{
ResegmentationDF <- FlagDF
return(ResegmentationDF)
}
}
}
UniversalResegmenter <- function(UnsegmentedFlags_DF, NgramsDF,TextName){
#TextName <- deparse(substitute(Text_Tokenized_Cleaned))
Flag_Resegmenter <- function(NgramsDF, UnsegFlagVec){
TopFreq <- NgramsDF[order(NgramsDF$WeightedFreq, decreasing=TRUE),]
TopFreq <- TopFreq$WeightedFreq[1] # identifies the max frequency in the corpus; this is used later on to order matches by both length and frequency: the character count is multiplied by the highest possible frequency in the corpus and the frequency is added to it, so that the longest words end up at the top and words of equal length are ordered by frequency (e.g. with TopFreq = 100, a 5-character ngram of frequency 7 scores 5*100 + 7 = 507, outranking every 4-character ngram)
#TokenDF_WithFlagFreq <- TokenDF_WithFlagFreq[nchar(as.character(TokenDF_WithFlagFreq$Var1))>2,]
#print(UnsegFlagVec)
Matches <- c()
for (i in NgramsDF$ngrams){ ## change to i in TokenDF_WithFlagFreq$Var1 when you have reliable tokenDF
# print(i)
if (length(grep(i,
UnsegFlagVec))>0){ Matches[[i]] <- i # print(i) } } #print(Matches) # MatchesOrdered <- Matches[order(nchar(Matches), decreasing=TRUE)] # reverse alphabetical order, as many mistakes arise from words starting with A MatchesDF <- NgramsDF[NgramsDF$ngrams %in% Matches,] MatchesDF <- mutate(MatchesDF, "LenghtCumFreq" = WeightedFreq+(nchar(as.character(ngrams))*TopFreq) ) # print(MatchesDF) MatchesOrderedDF <- MatchesDF[order(MatchesDF$LenghtCumFreq, decreasing=TRUE),] MatchesOrdered <- MatchesOrderedDF$ngrams # print("this is MatchesOrdered:") # print(MatchesOrdered) # ResegFlagVec <- UnsegFlagVec start <- 1 if(length(MatchesOrdered)>1){ while ( start <= length(MatchesOrdered)){ for(i in MatchesOrdered[1:length(MatchesOrdered)]){ #print(i) ResegFlagVec <- str_replace_all(ResegFlagVec, i, paste0("-" ,str_to_upper(i), "-")) #print(start) #print(ResegFlagVec) } #print("end loop") ResegFlagVec_Spaced <- paste0(" ", ResegFlagVec, " ") if(length(SlimFlagger(ResegFlagVec_Spaced))==0){ return(ResegFlagVec) }else{ ResegFlagVec <- UnsegFlagVec #print(ResegFlagVec) start <- start+1 index <- c(length(MatchesOrdered), seq(1:(length(MatchesOrdered)-1))) MatchesOrdered <- MatchesOrdered[order(index)] # print(MatchesOrdered) } } } else if (length(MatchesOrdered)==1){ while ( start <= length(MatchesOrdered)){ for(i in MatchesOrdered[1:length(MatchesOrdered)]){ # print(i) ResegFlagVec <- str_replace_all(ResegFlagVec, i, paste0("-" ,str_to_upper(i), "-")) # print(start) # print(ResegFlagVec) } # print("end loop") ResegFlagVec_Spaced <- paste0(" ", ResegFlagVec, " ") if(length(SlimFlagger(ResegFlagVec_Spaced))==0){ return(ResegFlagVec) }else{ ResegFlagVec <- UnsegFlagVec #print(ResegFlagVec) start <- start+1 } } }#else{ # print(paste0("no matches for: ", i)) # } } #Resegmented <- Text_Tokenized_Cleaned UnsegmentedFlags_DF <- unique(UnsegmentedFlags_DF) Reseg <- c() # Resolved <- list() # StillFlagged <-c() for (i in UnsegmentedFlags_DF$Unsegmented){ #print(paste0("This is i from UniReseg: ", i)) i_ResegFreq <- Flag_Resegmenter(NgramsDF, i) #print(paste0("this is i: ",i, " and this is i_ResegFreq: ",i_ResegFreq)) if(!is.null(i_ResegFreq)){ Reseg[[i]] <- i_ResegFreq }#else{print(paste0(i," not resegmented")) # } } return(Reseg) } UniversalResegmenter_Tokens <- function(UnsegmentedFlags_DF, TokenDF_WithFlagFreq, Wordlist){ #Textname <- deparse(substitute(Text_Tokenized_Cleaned)) Flag_Resegmenter <- function(TokenDF_WithFlagFreq, UnsegFlagVec){ #print(UnsegFlagVec) Matches <- c() for (i in TokenDF_WithFlagFreq$Var1){ ## change to i in TokenDF_WithFlagFreq$Var1 when you have reliable tokenDF # print(i) if (length(grep(i, UnsegFlagVec))>0){ Matches[[i]] <- i # print(i) } } # print(Matches) MatchesOrdered <- Matches[order(nchar(Matches), decreasing=TRUE)] # reverse alphabetical order, as many mistakes arise from words starting with A #print("this is MatchesOrdered:") # print(MatchesOrdered) ResegFlagVec <- UnsegFlagVec start <- 1 if(length(MatchesOrdered)>1){ while ( start <= length(MatchesOrdered)){ for(i in MatchesOrdered[1:length(MatchesOrdered)]){ # print(i) ResegFlagVec <- str_replace_all(ResegFlagVec, i, paste0("-" ,str_to_upper(i), "-")) # print(start) # print(ResegFlagVec) } #print("end loop") ResegFlagVec_Spaced <- paste0(" ", ResegFlagVec, " ") if(length(SlimFlagger(ResegFlagVec_Spaced))==0){ return(ResegFlagVec) }else{ ResegFlagVec <- UnsegFlagVec # print(ResegFlagVec) start <- start+1 index <- c(length(MatchesOrdered), seq(1:(length(MatchesOrdered)-1))) MatchesOrdered <- MatchesOrdered[order(index)] # 
print(MatchesOrdered) } } } else{ while ( start <= length(MatchesOrdered)){ for(i in MatchesOrdered[1:length(MatchesOrdered)]){ # print(i) ResegFlagVec <- str_replace_all(ResegFlagVec, i, paste0("-" ,str_to_upper(i), "-")) # print(start) # print(ResegFlagVec) } # print("end loop") ResegFlagVec_Spaced <- paste0(" ", ResegFlagVec, " ") if(length(SlimFlagger(ResegFlagVec_Spaced))==0){ return(ResegFlagVec) }else{ ResegFlagVec <- UnsegFlagVec #print(ResegFlagVec) start <- start+1 } } } } #Resegmented <- Text_Tokenized_Cleaned UnsegmentedFlags_DF <- unique(UnsegmentedFlags_DF) Reseg <- c() # Resolved <- list() # StillFlagged <-c() for (i in UnsegmentedFlags_DF$Unsegmented){ # print(paste0("This is i from UniReseg: ", i)) i_ResegFreq <- Flag_Resegmenter(TokenDF_WithFlagFreq, i) #print(paste0("this is i: ",i, " and this is i_ResegFreq: ",i_ResegFreq)) if(!is.null(i_ResegFreq)){ Reseg[[i]] <- i_ResegFreq }#else{print(paste0(i," not resegmented")) #} } return(Reseg) } Resegment_Universal_Ngrams <- function(UnsegmentedFlags_DF, NgramsDF, Text_Tokenized_Cleaned,TextName){ #TextName <- deparse(substitute(Text_Tokenized_Cleaned)) #Resolved <- data.frame(Status= character(0), Resegmented= character(0), Unsegmented = character(0) ) # print("this is initialised Resolved:") # print(Resolved) UnsegmentedFlags_DF <- unique(UnsegmentedFlags_DF) ResegmentedStrings <- UniversalResegmenter(UnsegmentedFlags_DF, NgramsDF) if(!is.null(ResegmentedStrings)){ Resegmented_DF <- Resolver(ResegmentedStrings) # print(Resegmented_DF) Resegmented_DF$Unsegmented <- str_remove_all(Resegmented_DF$Unsegmented," ") UnsegmentedFlags_DF$Unsegmented <- str_remove_all(UnsegmentedFlags_DF$Unsegmented," ") Resegmented_DF$Resegmented <- str_remove_all(Resegmented_DF$Resegmented," ") ResegAndOrig_DF <- left_join(Resegmented_DF, UnsegmentedFlags_DF, by = "Unsegmented") print(ResegAndOrig_DF) ResegAndOrig_DF <- ResegAndOrig_DF[order(nchar(as.character(ResegAndOrig_DF$Unsegmented)), decreasing=TRUE),] # this insures complete string gets replaced in the text, rather than truncated bits ToResegment_DF <- ResegAndOrig_DF[ResegAndOrig_DF$Status=="Resolved",] ResegmentedText <- Resegmenter(ToResegment_DF, Text_Tokenized_Cleaned,TextName) # write(ResegmentedText, file= paste0(TextName, "_Resegmented1.txt")) TokensToCheck <- UnsegmentedFlags_DF[!UnsegmentedFlags_DF$Unsegmented %in% ToResegment_DF$Unsegmented,] print("the following items could not be resegmented, check Wordlist or Ngrams for missing lemmata:") print(TokensToCheck) write.table(ToResegment_DF, file=paste0(TextName, "_TokensResolvedReseg1.csv")) write.table(TokensToCheck, file=paste0(TextName, "_TokensToCheck1.csv")) return(ResegmentedText) }else{ print("Sorry, no improvement") return(Text_Tokenized_Cleaned) } } Resegment_Universal_Tokens <- function(UnsegmentedFlags_DF, TokenDF_WithFlagFreq, Wordlist, Text_Tokenized_Cleaned, TextName){ #TextName <- deparse(substitute(Text_Tokenized_Cleaned)) #Resolved <- data.frame(Status= character(0), Resegmented= character(0), Unsegmented = character(0) ) # print("this is initialised Resolved:") # print(Resolved) TokenDF_WithFlagFreq$Var1 <- tolower(TokenDF_WithFlagFreq$Var1) TokenDF_WithFlagFreq <- TokenDF_WithFlagFreq[TokenDF_WithFlagFreq$Var1 %in% Wordlist$LEMMATA,] TokenDF_WithFlagFreq <- TokenDF_WithFlagFreq[nchar(as.character(TokenDF_WithFlagFreq$Var1))>2,] TokenDF_WithFlagFreq <- TokenDF_WithFlagFreq[order(TokenDF_WithFlagFreq$CleanFreq, decreasing = TRUE),] UnsegmentedFlags_DF <- unique(UnsegmentedFlags_DF) ResegmentedStrings <- 
Resegment_Universal_Ngrams <- function(UnsegmentedFlags_DF, NgramsDF, Text_Tokenized_Cleaned, TextName){
# TextName <- deparse(substitute(Text_Tokenized_Cleaned))
# Resolved <- data.frame(Status= character(0), Resegmented= character(0), Unsegmented = character(0) )
# print("this is initialised Resolved:")
# print(Resolved)
UnsegmentedFlags_DF <- unique(UnsegmentedFlags_DF)
ResegmentedStrings <- UniversalResegmenter(UnsegmentedFlags_DF, NgramsDF)
if(!is.null(ResegmentedStrings)){
Resegmented_DF <- Resolver(ResegmentedStrings)
# print(Resegmented_DF)
Resegmented_DF$Unsegmented <- str_remove_all(Resegmented_DF$Unsegmented," ")
UnsegmentedFlags_DF$Unsegmented <- str_remove_all(UnsegmentedFlags_DF$Unsegmented," ")
Resegmented_DF$Resegmented <- str_remove_all(Resegmented_DF$Resegmented," ")
ResegAndOrig_DF <- left_join(Resegmented_DF, UnsegmentedFlags_DF, by = "Unsegmented")
print(ResegAndOrig_DF)
ResegAndOrig_DF <- ResegAndOrig_DF[order(nchar(as.character(ResegAndOrig_DF$Unsegmented)), decreasing=TRUE),] # this ensures that the complete string gets replaced in the text, rather than truncated bits
ToResegment_DF <- ResegAndOrig_DF[ResegAndOrig_DF$Status=="Resolved",]
ResegmentedText <- Resegmenter(ToResegment_DF, Text_Tokenized_Cleaned, TextName)
# write(ResegmentedText, file= paste0(TextName, "_Resegmented1.txt"))
TokensToCheck <- UnsegmentedFlags_DF[!UnsegmentedFlags_DF$Unsegmented %in% ToResegment_DF$Unsegmented,]
print("the following items could not be resegmented, check Wordlist or Ngrams for missing lemmata:")
print(TokensToCheck)
write.table(ToResegment_DF, file=paste0(TextName, "_TokensResolvedReseg1.csv"))
write.table(TokensToCheck, file=paste0(TextName, "_TokensToCheck1.csv"))
return(ResegmentedText)
}else{
print("Sorry, no improvement")
return(Text_Tokenized_Cleaned)
}
}
Resegment_Universal_Tokens <- function(UnsegmentedFlags_DF, TokenDF_WithFlagFreq, Wordlist, Text_Tokenized_Cleaned, TextName){
# TextName <- deparse(substitute(Text_Tokenized_Cleaned))
# Resolved <- data.frame(Status= character(0), Resegmented= character(0), Unsegmented = character(0) )
# print("this is initialised Resolved:")
# print(Resolved)
TokenDF_WithFlagFreq$Var1 <- tolower(TokenDF_WithFlagFreq$Var1)
TokenDF_WithFlagFreq <- TokenDF_WithFlagFreq[TokenDF_WithFlagFreq$Var1 %in% Wordlist$LEMMATA,]
TokenDF_WithFlagFreq <- TokenDF_WithFlagFreq[nchar(as.character(TokenDF_WithFlagFreq$Var1))>2,]
TokenDF_WithFlagFreq <- TokenDF_WithFlagFreq[order(TokenDF_WithFlagFreq$CleanFreq, decreasing = TRUE),]
UnsegmentedFlags_DF <- unique(UnsegmentedFlags_DF)
ResegmentedStrings <- UniversalResegmenter_Tokens(UnsegmentedFlags_DF, TokenDF_WithFlagFreq, Wordlist)
if(!is.null(ResegmentedStrings)){
Resegmented_DF <- Resolver(ResegmentedStrings)
# print(Resegmented_DF)
Resegmented_DF$Unsegmented <- str_replace_all(Resegmented_DF$Unsegmented," ","")
UnsegmentedFlags_DF$Unsegmented <- str_replace_all(UnsegmentedFlags_DF$Unsegmented," ","")
Resegmented_DF$Resegmented <- str_remove_all(Resegmented_DF$Resegmented," ")
ResegAndOrig_DF <- left_join(Resegmented_DF, UnsegmentedFlags_DF, by = "Unsegmented")
print(ResegAndOrig_DF)
ResegAndOrig_DF <- ResegAndOrig_DF[order(nchar(as.character(ResegAndOrig_DF$Unsegmented)), decreasing=TRUE),] # this ensures that the complete string gets replaced in the text, rather than truncated bits
ToResegment_DF <- ResegAndOrig_DF[ResegAndOrig_DF$Status=="Resolved",]
ResegmentedText <- Resegmenter(ToResegment_DF, Text_Tokenized_Cleaned, TextName)
# write(ResegmentedText, file= paste0(TextName, "_Resegmented2.txt"))
TokensToCheck <- UnsegmentedFlags_DF[!UnsegmentedFlags_DF$Unsegmented %in% ToResegment_DF$Unsegmented,]
print("** the following items could not be resegmented **, check Wordlist or Ngrams for missing lemmata:")
print(TokensToCheck)
write.table(ToResegment_DF, file=paste0(TextName, "_TokensResolvedReseg2.csv"))
write.table(TokensToCheck, file=paste0(TextName, "_TokensToCheck2.csv"))
return(ResegmentedText)
}else{
print("Sorry, no improvement")
return(Text_Tokenized_Cleaned)
}
}
for (f in TextFiles) {
FilePath <- paste0(directory,"/",f)
print(FilePath)
Text <- readtext(FilePath)
Text <- Text$text
# print(Text)
TextName <- str_remove_all(f, ".txt") # change this to: str_remove_all(Text$doc_id, ".txt")
print(TextName)
# clean Text to extract ngrams (you will use the ngrams on the Text with xml later)
TextForNgrams <- CorpusCleaner_ngramPrep(Text, TextName)
# print(TextForNgrams)
# you need to re-load the file you just created with the cleaner function to get it in the right format for the Ngram extraction function
FilePath2 <- paste0("./",TextName,"CLEAN_for_Ngrams.txt")
print(FilePath2)
TextForNgrams <- readtext(FilePath2)
### Extract character Ngrams:
TextNgrams <- ExtractNgramsTok(TextForNgrams, Wordlist, TextName)
# TextNgramsStream <- NgramStreamliner(TextNgrams, Wordlist)
# TextNgramsStream <- TextNgramsStream[TextNgramsStream$freq != 0,] # remove zero-freq ngrams to make the DF lighter
TextNgrams_WithCorpusFreq <- left_join(TextNgrams, CorpusNgrams, by= "ngrams")
colnames(TextNgrams_WithCorpusFreq) <- c("ngrams", "TextFreq", "conTextFreq")
head(TextNgrams_WithCorpusFreq)
TextNgrams_WithCorpusFreq[is.na(TextNgrams_WithCorpusFreq)] <- 0
## add weights (needed in the new version of PseudoTokenizer())
TextNgrams_WithCorpusFreq$WeightedFreq <- TextNgrams_WithCorpusFreq$TextFreq + (TextNgrams_WithCorpusFreq$conTextFreq*0.25)
TextNgrams_WithCorpusFreq$WeightedFreq2 <- TextNgrams_WithCorpusFreq$TextFreq + (TextNgrams_WithCorpusFreq$conTextFreq*0.51)
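## recap of the segmentation phase below: PseudoTokenizer() segments against the weighted ngram
## frequencies computed above, TokenizedCleaner1() tidies the output, and two flag -> unsegment ->
## resegment rounds follow (first against the corpus ngrams, then against the reference token
## frequencies), before FinalCleaner() produces the final text.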
### SEGMENTATION
# clean Text and replace avagraha with a to facilitate segmentation
Text_clean <- TextCleanerForTokenization(Text, TextName)
# segment:
TextTokenized1 <- PseudoTokenizer(TextNgrams_WithCorpusFreq, Text_clean, TextName)
TextTokenized1Clean <- TokenizedCleaner1(TextTokenized1, TextName)
TextTokenized1Clean <- gsub("\\n", "\n ", TextTokenized1Clean, perl=TRUE) # add a space to facilitate detection of errors with the 'flagger' function
# Resegmentation to correct obvious mistakes
TextTokenized2 <- Resegment_Initial_Flagged(TextNgrams_WithCorpusFreq, TextTokenized1Clean, TextName)
TextTokenized2Clean <- TokenizedCleaner1(TextTokenized2, TextName)
TextFlags <- QuickFlagger(TextTokenized2Clean, TextName)
TextFlags <- FlagReducer(TextFlags)
TextFlags <- str_replace_all(TextFlags, "^\\s", "") # to ensure same spacing as original Text (GetFlagFreq() adds a space for flagging accuracy)
TextFlags <- str_replace_all(TextFlags, "\\s$", "") # to ensure same spacing as original Text
TextFlags <- str_replace_all(TextFlags, "[,.:;!?\\[\\]()/]", "") # to ensure same spacing as original Text
TextUnsegmented <- UnsegmentFlags(TextFlags)
Text_Flagged_Unsegmented_DF <- data.frame(TextFlags, TextUnsegmented, stringsAsFactors = FALSE)
colnames(Text_Flagged_Unsegmented_DF) <- c("OriginalSeg","Unsegmented")
head(Text_Flagged_Unsegmented_DF) ## double check that col names match content
Text_Reseg <- Resegment_Universal_Ngrams(Text_Flagged_Unsegmented_DF, TextNgrams_WithCorpusFreq, TextTokenized2Clean, TextName)
# Text_ResegClean <- FinalCleaner(Text_Reseg, TextName)
TextRESEGFlags <- QuickFlagger(Text_Reseg, TextName)
TextRESEGFlags <- FlagReducer(TextRESEGFlags)
TextRESEGFlags <- str_replace_all(TextRESEGFlags, "^\\s", "") # to ensure same spacing as original Text (GetFlagFreq() adds a space for flagging accuracy)
TextRESEGFlags <- str_replace_all(TextRESEGFlags, "\\s$", "") # to ensure same spacing as original Text
TextRESEGFlags <- gsub("[,.:;!?\\[\\]()/]", "", TextRESEGFlags)
TextRESEGUnsegmented <- UnsegmentFlags(TextRESEGFlags)
TextRESEG_Flagged_Unsegmented_DF <- data.frame(TextRESEGFlags, TextRESEGUnsegmented, stringsAsFactors = FALSE)
colnames(TextRESEG_Flagged_Unsegmented_DF) <- c("OriginalSeg","Unsegmented")
## create a token DF of the Text
TextRESEG_TokenDF <- DataFrameTokens_CAPS(Text_Reseg, TextName)
TextRESEG_TokenDFWithFlagFreq <- GetFlagFreq(TextRESEGFlags, TextRESEG_TokenDF, Text_Reseg, TextName) # check this line on caramavastu!
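## illustrative round trip (hypothetical strings): a segmented stretch such as "-BODHISATTV-aḥ"
## (the hyphen/uppercase convention matched by DataFrameTokens_CAPS) is paired by UnsegmentFlags()
## with its de-segmented form "bodhisattvaḥ" in the OriginalSeg/Unsegmented data frame above,
## so that the resegmenters can attempt a fresh split of the raw string.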
## add the token and flag x token frequencies to the reference tokens DF (the FiveTexts DF)
TotalTokensDF <- full_join(ReferenceTokensDF, TextRESEG_TokenDFWithFlagFreq, by = "Var1")
TotalTokensDF[is.na(TotalTokensDF)] <- 0
FiveTextsPlusText <- transmute(TotalTokensDF, Var1=Var1, Freq = Freq.x+Freq.y, FlaggedFreq = FlaggedFreq.x+FlaggedFreq.y, CleanFreq=CleanFreq.x+CleanFreq.y)
Text_ResegTok <- Resegment_Universal_Tokens(TextRESEG_Flagged_Unsegmented_DF, ReferenceTokensDF, Wordlist, Text_Reseg, TextName)
TextResegTokFlags <- Flagger(Text_ResegTok)
### NB the following lines are still being edited
TextResegTokFlags <- str_replace_all(TextResegTokFlags, "^\\s", "") # to ensure same spacing as original Text (GetFlagFreq() adds a space for flagging accuracy)
TextResegTokFlags <- str_replace_all(TextResegTokFlags, "\\s$", "") # to ensure same spacing as original Text
TextResegTokFlags <- str_replace_all(TextResegTokFlags, "[,.:;!?\\[\\]()/]", "") # to ensure same spacing as original Text_ResegTok
TextResegTokUnsegmented <- UnsegmentFlags(TextResegTokFlags)
TextResegTok_Flagged_Unsegmented_DF <- data.frame(TextResegTokFlags, TextResegTokUnsegmented, stringsAsFactors = FALSE)
colnames(TextResegTok_Flagged_Unsegmented_DF) <- c("OriginalSeg","Unsegmented")
head(TextResegTok_Flagged_Unsegmented_DF) ## double check that col names match content
TextResegTok_Reseg <- Resegment_Universal_Ngrams(TextResegTok_Flagged_Unsegmented_DF, TextNgrams_WithCorpusFreq, Text_ResegTok, TextName)
TextResegTokRESEGFlags <- Flagger(TextResegTok_Reseg)
# print(TextResegTokRESEGFlags)
TextResegTokRESEGFlags <- str_replace_all(TextResegTokRESEGFlags, "^\\s", "") # to ensure same spacing as original Text (GetFlagFreq() adds a space for flagging accuracy)
TextResegTokRESEGFlags <- str_replace_all(TextResegTokRESEGFlags, "\\s$", "") # to ensure same spacing as original Text
TextResegTokRESEGFlags <- gsub("[,.:;!?\\[\\]()/]", "", TextResegTokRESEGFlags)
TextResegTokRESEGUnsegmented <- UnsegmentFlags(TextResegTokRESEGFlags)
TextResegTokRESEG_Flagged_Unsegmented_DF <- data.frame(TextResegTokRESEGFlags, TextResegTokRESEGUnsegmented, stringsAsFactors = FALSE)
colnames(TextResegTokRESEG_Flagged_Unsegmented_DF) <- c("OriginalSeg","Unsegmented")
## create a token DF of the resegmented text
TextResegTokRESEG_TokenDF <- DataFrameTokens_CAPS(TextResegTok_Reseg, TextName)
TextResegTokRESEG_TokenDFWithFlagFreq <- GetFlagFreq(TextResegTokRESEGFlags, TextResegTokRESEG_TokenDF, TextResegTok_Reseg, TextName)
## add the token and flag x token frequencies to the reference tokens DF
TotalTokensDF <- full_join(ReferenceTokensDF, TextResegTokRESEG_TokenDFWithFlagFreq, by = "Var1")
TotalTokensDF[is.na(TotalTokensDF)] <- 0
FiveTextResegToksPlusTextResegTok <- transmute(TotalTokensDF, Var1=Var1, Freq = Freq.x+Freq.y, FlaggedFreq = FlaggedFreq.x+FlaggedFreq.y, CleanFreq=CleanFreq.x+CleanFreq.y)
TextResegTok_ResegTok <- Resegment_Universal_Tokens(TextResegTokRESEG_Flagged_Unsegmented_DF, ReferenceTokensDF, Wordlist, TextResegTok_Reseg, TextName)
Text_ResegTokClean <- FinalCleaner(TextResegTok_ResegTok, TextName)
# uncomment the following lines to write files with the output of the intermediate segmentation phases (e.g. for evaluation purposes)
# write(TextTokenized1, file=paste0(TextName, "_FirstSegmentation.txt"))
# write(TextTokenized1Clean, file=paste0(TextName, "_FirstCleaning.txt"))
# write(TextTokenized2, file=paste0(TextName, "_Reseg_Initial.txt"))
# write(TextTokenized2Clean, file=paste0(TextName, "_SecondCleaning.txt"))
# write(Text_Reseg, file=paste0(TextName, "_FirstNgramResegmentation.txt"))
# write(Text_ResegTok, file=paste0(TextName, "_FirstTokenResegmentation.txt"))
# write(TextResegTok_Reseg, file=paste0(TextName, "_SecondNgramResegmentation.txt"))
# write(TextResegTok_ResegTok, file=paste0(TextName, "_SecondTokenResegmentation.txt"))
Text_GramrelsAndHumans <- TransformForGramrelsAndHumans(Text_ResegTokClean, TextName)
}
system("say Process completed") # NB 'say' is macOS-only; comment this line out on other systems
}
Segmenter(directory="FILL_WITH_PathToYourDirectory", Wordlist, BuddhFoundationalCorpusNgrams, FiveTextsTokensWithFlagFreq_DF)
## 4. LEMMATISER
# the lemmatiser is still under development; the version provided here will undergo substantial improvements.
# the lemmatiser requires the 'gramrel' output of the segmenter **split into <s> elements** with an element id, e.g. <s id="1">
# to split the text, replace the relevant sentence boundary markers with <s> and </s>
# to insert the count ID you could use this function (it is very slow on long texts); other implementations may be more efficient
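## e.g. (illustrative input, not taken from the repository data) a file ready for the counter looks like:
## "<s>evaṃ mayā śrutam</s> <s>ekasmin samaye bhagavān ...</s>"
## and XMLelementCounter("s", ...) then numbers each opening <s> tag in turn.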
"s" or "page" or "doc", without angle brakets; XMLtext= Segmenter's output 'forGramrels" with XML element ; example: AbhidharmaKosaWithCountID <- XMLelementCounter("s", Abhidharmakosa_ForGramrels) String <- XMLtext TextName <- deparse(substitute(XMLtext)) n <- str_count(XMLtext, paste0('<', elementName, '>')) count_id <- 1 while (count_id <= n) { String <- str_replace(String, paste0('<', elementName, '>'), paste0('<', elementName,' = "', count_id, '">')) count_id <- count_id+1 } write(String, file = paste0(TextName,"WithCountId.txt")) return(String) } # put all the Gramrel files with element ID into a directory, and pass the path to directory as first argument of the Lemmatiser Function. # Create an Output Directory for the lemmatised corpus and use its path as last argument of the function. Lemmatiser_beta <- function(DirectoryToLemmatise, NonStemmedWordlist, OutputDirectory){ library(rlist) TextFiles <- dir(DirectoryToLemmatise) SentenceAnnotator <- function(TextName,TextForGramrels, NonStemmedWordlist, OutputDirectory){ # TextName <- deparse(substitute(TextForGramrels)) print(TextName) dir.create(paste0(TextName,"_LemmatisedByS")) library(rlist) TextForGramrels <- gsub("\\^ ", " ^", TextForGramrels) TextForGramrels <- gsub("-īt([iīy]) ", "-i ^it\\1 ", TextForGramrels) TextForGramrels <- gsub("-et([iīy]) ", "-a ^et\\1 ", TextForGramrels) TextForGramrels <- gsub(" ([nc])ā ", " \\1a a @ ", TextForGramrels) TextForGramrels <- gsub(" CET -([iīy]) ([^@].)", " c ^et\\1 \\2", TextForGramrels) TextForGramrels <- gsub("([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ]) \\^", "\\1 -a \\^", TextForGramrels, perl=TRUE) TextForGramrels <- gsub("([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ]|[a-z]|[āīūṛḷṇḍñṅḥśṣṭḍṃ])\\^", "\\1 \\^", TextForGramrels, perl=TRUE) Annotator <- function(SentenceDF,NonStemmedWordlist){ # print("this is sentenceDF") # print(SentenceDF) SentenceDF <- SentenceDF[,1:3] SentenceDF <-rownames_to_column(SentenceDF, var = "id") SentenceDF$word <- tolower(SentenceDF$word) SentenceDF$word <- gsub(" ","",SentenceDF$word) SentenceDF$suffix <- gsub("\\^","",SentenceDF$suffix) SentenceDFAnno <- SentenceDF %>% mutate("id"= as.numeric(id)-1) %>% mutate("dep.rel"= ifelse(str_detect(word,"@$"),"comp","NONE")) %>% mutate("dep.head"= ifelse(dep.rel=="comp",as.numeric(id)+1, 0)) %>% mutate("norm"= ifelse(str_detect(suffix,"^[aiī]k|[iī]t"), paste0(tolower(stem), gsub("^([aiī]k|[iī]t).*?$", "\\1", suffix) ), ifelse(nchar(as.character(stem)) > 0, as.character(tolower(stem)), as.character(word)))) #mutate("case" = gsub("^a[sḥś]?$|^o$|[iu][ḥr]?$","nomSing", suffix))%>% #mutate("lemma"= gsub("^(.*?)@$","\\1",word)) # mutate("lemma"= ifelse(str_detect(suffix,"^a[śḥ]?$|^o$"),paste0(tolower(stem), "a"),ifelse(dep.rel=="comp", gsub("@","",word), "YYYY"))) ## NORM AFTER THIS! 
SentenceDFAnno$norm <- gsub("[mṃ]?[mṃ](c)", "ñc", SentenceDFAnno$norm )
SentenceDFAnno$norm <- gsub("[mṃ]?[mṃ]([pb])", "m\\1", SentenceDFAnno$norm )
SentenceDFAnno$norm <- gsub("[mṃ]?[mṃ]([vsh])", "ṃ\\1", SentenceDFAnno$norm )
## still need to normalise the various spellings of sa[ṃṅ]gha
SentenceDFAnno$norm <- gsub("satv","sattv", SentenceDFAnno$norm )
SentenceDFAnno$norm <- gsub("^\\^ā","[aā]", SentenceDFAnno$norm )
SentenceDFAnno$norm <- gsub("^\\^o","u", SentenceDFAnno$norm )
SentenceDFAnno$norm <- gsub("^op([ae])","up\\1", SentenceDFAnno$norm )
SentenceDFAnno$norm <- gsub("^\\^?ai","e", SentenceDFAnno$norm )
SentenceDFAnno$norm <- gsub("^\\^e","[iī]", SentenceDFAnno$norm )
SentenceDFAnno$norm <- gsub("^h$","hi", SentenceDFAnno$norm )
SentenceDFAnno$norm <- gsub("^v$","vā", SentenceDFAnno$norm )
SentenceDFAnno$norm <- gsub("^([nc])$","\\1a", SentenceDFAnno$norm )
SentenceDFAnno$norm <- gsub("^puna[ḥrsś]$","punas", SentenceDFAnno$norm )
SentenceDFAnno$norm <- gsub("^([ie])v$","\\1va", SentenceDFAnno$norm )
SentenceDFAnno$norm <- gsub("^eva[mṃ]$","evam", SentenceDFAnno$norm )
SentenceDFAnno$norm <- gsub("^\\^?aiva[mṃ]","evam", SentenceDFAnno$norm )
SentenceDFAnno$norm <- gsub("^\\^?aitad","etad", SentenceDFAnno$norm )
SentenceDFAnno$norm <- gsub("^[eiī]t[iīy]$","iti", SentenceDFAnno$norm )
SentenceDFAnno$norm <- gsub("^[aā]?p[iīy]$","api", SentenceDFAnno$norm )
SentenceDFAnno$norm <- gsub("^tv$","tu", SentenceDFAnno$norm )
SentenceDFAnno$norm <- gsub("^(t?)ato$","\\1atas", SentenceDFAnno$norm )
# print("this is SentenceDFAnno")
# print(SentenceDFAnno)
SentenceDFAnno2 <- SentenceDFAnno %>% mutate("lemma"= ifelse(str_detect(suffix,"..?@"), paste0(gsub("\\^|@","",norm), gsub("\\^|@","",suffix)),
ifelse(str_detect(word, "^mayā$|^\\^?[aā]ha[mṃ]$|^me$|^mā[mṃ]$|^ma[td]$"), "aham",
ifelse(str_detect(word, "^tvayā$|^tava$|^tv[āa][mṃ]$|^tva[td]$|^tvayi$|tubhya[mṃ]$"), "tva",
ifelse(str_detect(word, "^vaya[mṃ]$|^na[ḥś]$|^\\^?[aā]smāka[mṃ]$|^\\^?[aā]smāsu$|^\\^?[aā]smābhi[ḥrśs]$"), "vayam",
ifelse(str_detect(word, "^yuṣmā[nñtd]n?$|^yūya[mṃ]$|^va[ḥś]$|^yuṣmāka[mṃ]$|^yuṣmās[uv]$|^yuṣmābhi[ḥr]$"), "yūyam",
ifelse(str_detect(word,"^eka[dt]$|^ekena$|^ekasya$|^ekeṣā[ṃm]$|^ekeṣ[uv]$|^ekāni?$|^ek[aā][mṃñ]$|^ek[eā]bhya[ḥś]?$|^ekasmā[td]$|^ekasmai$|^ekasmi[nṃmñ]?[nṃmñ]?$|^eke$|^ekāsu$|^\\^?aika[dt]$|^\\^?aikena$|^\\^?aikasya$|^\\^?aikeṣā[ṃm]$|^\\^?aikeṣ[uv]$|^\\^?aikāni?$|^\\^?aik[aā][mṃñ]$|^\\^?aik[eā]bhya[ḥś]?$|^\\^?aikasmā[td]$|^\\^?aikasmai$|^\\^?aikasmi[nṃmñ]?[nṃmñ]?$|^\\^?aike$|^\\^?aikāsu$"), "eka",
ifelse(str_detect(word, "^s[oaā]ḥ?$|^e?te$|^e?ta[dtc]$|^e?tena$|^e?tasya$|^e?teṣā[ṃm]$|^e?teṣ[uv]$|^e?tāni?$|^e?t[aā][mṃñ]$|^e?t[eā]bhya[ḥś]?$|^e?tasmā[td]$|^e?tasmai$|^e?tasmi[nṃmñ][nṃmñ]?$|^e?tāsu$|^e?tāsā[mṃ]$|^\\^?aite$|^\\^?aita[dtc]$|^\\^?aitena$|^\\^?aitasya$|^\\^?aiteṣā[ṃm]$|^\\^?aiteṣ[uv]$|^\\^?aitāni?$|^\\^?ait[aā][mṃñ]$|^\\^?ait[eā]bhya[ḥś]?$|^\\^?aitasmā[td]$|^\\^?aitasmai$|^\\^?aitasmi[nṃmñ][nṃmñ]?$|^\\^?aitāsu$|^tāsā[mṃ]$|^tayo[ḥrs]$"), "tad",
ifelse(str_detect(word, "^\\^?[iīe]da[mṃ]$|^\\^?[aiāīe]ya[mṃ]$|^\\^?[aā]nena$|^\\^?[aā]sy[āa]ḥ?$|^eṣā[mṃ]$|^\\^?aiṣā[mṃ]$|^\\^?[aā]smā[td]$|^\\^?[aā]smai$|^\\^?[aā]smi[nṃmñ][nṃmñ]?$|^\\^?[aā]nayo[ḥr]$|^\\^?[iī]m[aā][mṃ]$|^\\^?[eā]bhya[ḥś]?$|^\\^?[iīe]me$|^\\^?[iī]mau$|^\\^?[iīe]māḥ?$|^\\^?[aā]smā[nñtd]n?$"), "idam",
ifelse(str_detect(word, "^k[oāe]$|^ka[dtḥś]$|^kena$|^kasya$|^keṣā[ṃm]$|^keṣ[uv]$|^kāni?$|^k[aā][mṃñ]$|^k[eā]bhya[ḥś]?$|^kasmā[td]$|^kasmai$|^kasmi[nṃmñ][nṃmñ]?$"), "kim",
ifelse(str_detect(word,
"^y[āoe]$|^ya[dtḥś]$|^yena$|^yasya$|^yeṣā[ṃm]$|^yeṣ[uv]$|^yāni?$|^y[aā][mṃñ]$|^y[eā]bhya[ḥś]?$|^yasmā[td]$|^yasmai$|^yasmi[nṃmñ][nṃmñ]?$"), "yad", ifelse(str_detect(word, "^\\^?[āa]sa[uv]$|^\\^?[aā]da[ḥśs]$|^\\^?[aā]mu[yn]ā$|^\\^?[aā]muṣ[my]ai$|^\\^?[aā]muṣmāt$|^\\^?[aā]muṣyā[ḥśrsmṃ]$|^\\^?[aā]muṣya$|^\\^?[aā]mūm[iy]|^\\^?[aā]m[ūī]ṣ[uv]|^\\^?[aā]m[ūī]bh|^\\^?[aā]mū[ḥśnñ]?$|^\\^?[aā]muyo[ḥr]$|^\\^?[aā]m[ūī]ṣā[ṃm]$|^\\^?[aā]mu[sṣ]mi[nṃmñ][nṃmñ]?$"), "adas", ifelse(str_detect(norm, "^([ykt])asmā[tdn]$"), gsub("^([ykt])asmā[tdn]$","\\1asmāt", norm ), ifelse(str_detect(norm, "^(samya|pṛtha|prā|tirya)[gkñṅ]$"), gsub("^(samya|pṛtha|prā|tirya)[gkñṅ]$","\\1k", norm ), # ifelse(str_detect(norm, "^([sṣ])a[tdcḍ]$"), gsub("^([sṣ])a[tdcḍ]$","\\1at", norm ), # ifelse(str_detect(norm, "(^aiva[mṃ]$|^eva[mṃ]$)"), gsub("(^aiva[mṃ]$|^eva[mṃ]$)","evam", norm ), # ifelse(str_detect(norm, "^puna[sḥr]$"), gsub("^puna[sḥr]$","punas", norm ), # ifelse(str_detect(norm, "^āyu([ḥsṣr])$"), gsub("^āyu([ḥsṣr])$","ayu", norm ), # '-us' words are lemmatised to u to simplify PoS tag # ifelse(str_detect(norm, "^k.*?ci[ntd]$"), gsub("^k.*?ci[ntd]$","kimcid", norm ), # ifelse(str_detect(norm, "^[āa]voca[td]$"), gsub("^[āa]voca[td]$","vac", norm ), #ifelse(str_detect(norm, "^katha[mṃ]$"), gsub("^katha[mṃ]$","katham", norm ), # ifelse(str_detect(norm, "^yathāva[tdcn]$"), gsub("^yathāva[tdcn]$","yathāvat", norm ), ifelse(str_detect(norm, "^(catu|āyu)[rḥśsṣ]$"), gsub("^(catu|āyu)[rḥśsṣ]$","\\1u", norm ), # ifelse(str_detect(norm, "^([yt][āa]v[aā])[dtcn]$"), gsub("^([yt][āa]v[aā])[dtcn]$","\\1t", norm ), #ifelse(str_detect(norm, "^pṛtha[kgñṅ]$"), gsub("^pṛtha[kgñṅ]$","pṛthag", norm ), #ifelse(str_detect(norm, "^prā([gkñṅ])$"), gsub("^prā([gkñṇ])$","prāg", norm ), # ifelse(str_detect(norm, "^[aā]p[iīy]$"), gsub("^[aā]p[iīy]$","api", norm ), # ifelse(str_detect(norm, "^(d[ṛi])[gkś]$"), gsub("^(d[ṛi])[gkś]$","\\1ś", norm ), # ifelse(str_detect(norm, "^(sākṣā|paścā)[tdn]$"), "^(sākṣā|paścā)[tdn]$","\\1t", norm ), ifelse(str_detect(norm, "^arhan?[tn]$"), gsub("^arhan?[tn]","arhat", norm ), ifelse(str_detect(norm, "^parṣad[td]$"), gsub("^parṣad[td]$","parṣat", norm ), ifelse(str_detect(norm, "^sa[mṃ]pa[td]$"), gsub("^sa[mṃ]pa[td]$","sampat", norm ), # ifelse(str_detect(norm, "^(bh[uū])[td]$"), gsub("^(bh[uū])[td]$","\\1t", norm ), ifelse(nchar(as.character(stem)) == 0, norm, ifelse(stem=="MAHAT", "mahat", ifelse(stem=="MAH" & str_detect(suffix,"^[āa][nñ]$|^ant|^at[iīieāo]?$|^[aā]n?tā[mṃ]$|^āṃs|^ā$|^ata[ḥś]?$|^ats[uv]$"), "mahat", ifelse(stem=="BHAGAV" & str_detect(suffix,"^[āa][nñ]$|^ant|^at[iīieāo]?$|^an?tā[mṃ]$|^āṃs|^ā$|^ata[ḥś]?$|^ats[uv]$"), "bhagavat", ifelse(str_detect(word,"^sa[dt]$|^sann?$"), "sat", ifelse(stem=="SAMAY" & suffix=="e", "samaya", ifelse(str_detect(stem,"(NĀM|ĀTM|KARM)") & str_detect(suffix,"a@|^a[nñ]?$|^[āa]?n[āaeiyī]|^ān[iy]$|^no[ḥr]]$|^nā[mṃ]$|^abhi[ḥr]?$|^as[uv]$|^abhya[ḥś]$"), paste0(norm,"an"), # ifelse(stem=="ĀTM" & str_detect(suffix,"^a[nñ]?$|^[āa]?n[āeiyī]$|^ān[iy]$|^no[ḥr]]$|^nā[mṃ]$|^abhi[ḥr]?$|^as[uv]$|^abhya[ḥś]$"), "ātman", # ifelse(stem=="KARM" & str_detect(suffix,"^a[nñ]?|^[aā]?[nṇ][āeiyī]$|^ān[iy]$|^no[ḥr]]$|^nā[mṃ]$|^abhi[ḥr]?$|^as[uv]$|^abhya[ḥś]$"), "karman", #"YYY" ifelse(stem=="RĀJ" & str_detect(suffix,"a@|^ā$|^an?$|^ña[śḥ]$|^[ñn]o$|^ñ[āeiyī]$|^āna[ḥśmṃs]$|^ño[ḥr]]$|^ñā[mṃ]$|^abhi[ḥr]?$|^as[uv]$|^abhya[ḥś]$"), "rājan", ifelse(stem=="HET" & str_detect(suffix,"^o[ḥr]?$|^v"), "hetu", ifelse(stem=="CAKṢ" & str_detect(suffix,"^[uūv]"), "cakṣu", ifelse(stem=="VĀC" & 
str_detect(suffix,"^[āei]$|^a[mṃḥśsu]$|^o[ḥr]$|^ā[mṃ]$"), "vāk", ifelse(str_detect(word,"^vā[kṅg]$|^vāgbhi[rḥ]$|^vāgbhy[aā].?$|^vāk[ṣs]u$"), "vāk", ifelse(stem=="TIṢṬHAT" , "sthā", ifelse(str_detect(stem,"(^MANAS$|^MANO$)") , "manas", # ifelse(stem=="MANO" , "manas", ifelse(stem=="MAN" & str_detect(suffix,"^[ao]@?$|^āṃs|^obh"), "manas", ifelse(stem=="ŚIRAS" , "śiras", ifelse(str_detect(suffix,"^a[śḥ]?$|^o$|^e[nṇ]a?$|^a[mṃ]$|^asya?$|^āt$|^āya$|^eṣ[uv]$|^[eā]bhya[ḥś]?$|^ai[rḥś]$|^ā[nṇ][iy]?$"),paste0(norm, "a"), ifelse(str_detect(suffix,"^[ie][ḥrśs]$|^i[mṃ]$|^ibhya[ḥśs]?$|^aye$|^ibhi[ḥrsś]?$|^i[nṇ]e$"), paste0(norm, "i"), ifelse(str_detect(suffix,"^ī[mṃ]$|^ībhya[ḥś]?$|^īṣ[uv]$|^ībhi[ḥr]?$"), paste0(norm, "ī"), ifelse(str_detect(suffix,"^u[ḥrśmṃ]$|^u[nṇ][āie]?$|^u[nṇ]ā[mṃ]$|^ū[nṇñ][iy]?$|^ubhya[ḥśs]?$|^ubhi[ḥr]?$|^uṣ[uv]$|^uno[ḥr]?$"), paste0(norm, "u"), ifelse(str_detect(suffix,"^ū[ḥrśmṃ]$|^ś[nṇ]ā[mṃ]$|^ūbhya[ḥśs]?$|^ūbhi[ḥrs]?$|^ūṣ[uv]$"), paste0(norm, "ū"), ifelse(str_detect(suffix,"y?[aoe]n?t[iyeu]$|y?[ao]s[iyeu]$|y?ām[iyeau]$|y?[iī]tvā$|y?[iī]tu[mṃ]?|^y?amān|^e[td]$|^erann?$|^yu[rḥsś]$|^ā[vm]a[ḥh]|^a?tha$|^a?dhve$|^a?dhva[mṃ]$|^eme$|^.?tavy[aāoe].?.?$"), norm, ifelse(str_detect(suffix,"^[aiī][vm][aā][tdn]"), paste0(tolower(stem), gsub("^([aiī][vm])[aā][tdn].*?$", "\\1at", suffix) ), ifelse(str_detect(suffix,"^āyā[ḥm]?$|^ā[mṃ]$|^āyai$|^ās[uv]$|^ābhi"), paste0(norm, "ā"), "YYYY") ) ) ) ) ) ))))))))))))))))))))))))))))))))))))#))#)#)#))#))))#)#)))#))) # print("this is SentenceDFAnno2") # print(SentenceDFAnno2) SentenceDFAnno2$lemma <- gsub("\\^","", SentenceDFAnno2$lemma ) SentenceDFAnno2$lemma <- gsub( "^ni[ḥrśsṣ]$","ni_neg", SentenceDFAnno2$lemma) SentenceDFAnno2$lemma <- gsub( "^du[ḥrśsṣ]$","dus", SentenceDFAnno2$lemma) SentenceDFAnno2$lemma <- gsub("^[aā]n@$","a", SentenceDFAnno2$lemma ) SentenceDFAnno2$lemma <- gsub("y$","[iī]", SentenceDFAnno2$lemma ) SentenceDFAnno2$lemma <- gsub("@","", SentenceDFAnno2$lemma ) ToLemmatise <- which(SentenceDFAnno2$lemma=="YYYY"|str_detect(SentenceDFAnno2$lemma,"\\[")) for (tl in ToLemmatise){ Stem <- SentenceDFAnno2[tl, 7] #SentenceDFAnno2[tl, 8] <- paste(NonStemmedWordlist$LEMMATA[str_which(NonStemmedWordlist$LEMMATA, paste0("^", Stem, "..?$"))], collapse=",") SentenceDFAnno2[tl, 8] <- paste(NonStemmedWordlist$LEMMATA[str_which(NonStemmedWordlist$LEMMATA, paste0("^", Stem, "[aāiīuūkdṛ][stn]?$"))], collapse=",") if ( str_detect(SentenceDFAnno2[tl, 8],",")){ SentenceDFAnno2[tl,8] <- ifelse(str_detect(SentenceDFAnno2[tl,2], "[tṭ]vā$"), gsub("^(.*?)[iī]?[tṭ]","\\1", SentenceDFAnno2[tl,7], perl=TRUE), ifelse(str_detect(SentenceDFAnno2[tl,4], "^[uū]|^a?v"), paste(unlist(regmatches(SentenceDFAnno2[tl,8], gregexpr(paste0(SentenceDFAnno2[tl,7],"([uū],|[uū]$)"),SentenceDFAnno2[tl,8], perl=TRUE))), collapse=","), ifelse(str_detect(SentenceDFAnno2[tl,4], "^e$") ,paste(unlist(regmatches(SentenceDFAnno2[tl,8], gregexpr(paste0(SentenceDFAnno2[tl,7],"([ia],|[ia]$)"),SentenceDFAnno2[tl,8], perl=TRUE))), collapse=","), ifelse(str_detect(SentenceDFAnno2[tl,3], "^\\^?ĀH") & str_detect(SentenceDFAnno2[tl,4], "^[aā]$"), "āha", ifelse(str_detect(SentenceDFAnno2[tl,4],"^@$") & str_detect(SentenceDFAnno2[tl+1,2],"^\\^(ā|o|e|au|ai)"), paste(unlist(regmatches(SentenceDFAnno2[tl,8], gregexpr(paste0(gsub("\\^|@","",SentenceDFAnno2[tl,7]), gsub("\\^|@","",SentenceDFAnno2[tl,4]),"([aā],|[aā]$)"),SentenceDFAnno2[tl,8], perl=TRUE))), collapse=","), ifelse(str_detect(SentenceDFAnno2[tl,4],"^@$") & str_detect(SentenceDFAnno2[tl+1,2],"^\\^ī$"), 
paste(unlist(regmatches(SentenceDFAnno2[tl,8], gregexpr(paste0(gsub("\\^|@","",SentenceDFAnno2[tl,7]), gsub("\\^|@","",SentenceDFAnno2[tl,4]),"([iī],|[iī]$)"), SentenceDFAnno2[tl,8], perl=TRUE))), collapse=","),
ifelse(str_detect(SentenceDFAnno2[tl,4],"^@$") & str_detect(SentenceDFAnno2[tl+1,2],"^\\^ū"), paste(unlist(regmatches(SentenceDFAnno2[tl,8], gregexpr(paste0(gsub("\\^|@","",SentenceDFAnno2[tl,7]), gsub("\\^|@","",SentenceDFAnno2[tl,4]),"([uū],|[uū]$)"), SentenceDFAnno2[tl,8], perl=TRUE))), collapse=","),
ifelse(str_detect(SentenceDFAnno2[tl,4], "^ā[ḥśs]$|^ā[nṇ]ā[mṃ]?$"), paste(unlist(regmatches(SentenceDFAnno2[tl,8], gregexpr(paste0(SentenceDFAnno2[tl,7],"([āa],|[āa]$)"), SentenceDFAnno2[tl,8], perl=TRUE))), collapse=","),
ifelse(str_detect(SentenceDFAnno2[tl,4], "^ā$"), paste(unlist(regmatches(SentenceDFAnno2[tl,8], gregexpr(paste0(SentenceDFAnno2[tl,7],"([āaṛ],|[āaṛ]$)"), SentenceDFAnno2[tl,8], perl=TRUE))), collapse=","),
ifelse(str_detect(SentenceDFAnno2[tl,4], "^y"), paste(unlist(regmatches(SentenceDFAnno2[tl,8], gregexpr(paste0(SentenceDFAnno2[tl,7],"([iī],|[iī]$)"), SentenceDFAnno2[tl,8], perl=TRUE))), collapse=","),
ifelse(str_detect(SentenceDFAnno2[tl,4], "^[iī][nṇ]?"), paste(unlist(regmatches(SentenceDFAnno2[tl,8], gregexpr(paste0(SentenceDFAnno2[tl,7],"([īi]n?,|[īi]n?$)"), SentenceDFAnno2[tl,8], perl=TRUE))), collapse=","),
ifelse(str_detect(SentenceDFAnno2[tl,4], "^a"), paste(unlist(regmatches(SentenceDFAnno2[tl,8], gregexpr(paste0(SentenceDFAnno2[tl,7],"(a,|a$)"), SentenceDFAnno2[tl,8], perl=TRUE))), collapse=","),
SentenceDFAnno2[tl,8] ))))))))))))
SentenceDFAnno2[tl,8] <- gsub(",$","", SentenceDFAnno2[tl,8])
SentenceDFAnno2[tl,8] <- gsub(",,",",", SentenceDFAnno2[tl,8])
SentenceDFAnno2[tl,8] <- gsub(",","_", SentenceDFAnno2[tl,8])
}
}
## add PoS tags
SentenceDFAnno2$lemma <- gsub("(^[aā]st?[iīyu]$|^syā[td]$|^sy[uū][ḥr]$)","as", SentenceDFAnno2$lemma )
SentenceDFAnno2 <- SentenceDFAnno2 %>% mutate( "PoS"= ifelse(str_detect(lemma, "(^[yt]ad$|^kim$|^idam$|^aham$|^tva$|^adas$|^vayam$|^yūyam$|^[ykt]asmāt|cid$|^sva$)" ), "PRON",
ifelse(nchar(as.character(stem))==0 & !str_detect(lemma, "(avocat|^āha$|^as$)"), "IND",
ifelse(str_detect(lemma, "(avocat|^āha$|^as$)"), "VERB",
ifelse(str_detect(suffix,"y?[aoe]n?t[iyeu]$|y?[ao]s[iyeu]$|y?ām[iyeau]$|y?[iī]tvā$|y?[iī]tu[mṃ]?|^e[td]$|^erann?$|^yu[rḥ]$|^ā[vm]a[ḥh]|^a?tha$|^a?dhve$|^a?dhva[mṃ]$|^eme$"), "VERB", "Noun/Adj") ### the rule above does not catch cases of IT-vā, nor gerunds in -ya of compounded verbs; present and past participles are excluded by design, as they behave like adjectives!
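## illustrative application of the PoS rule above (hypothetical tokens): a verbal suffix like "anti"
## (3rd person plural present) matches y?[aoe]n?t[iyeu]$ and is tagged VERB, while a nominal suffix
## like "asya" matches none of the verbal patterns and falls through to "Noun/Adj"; pronouns and
## indeclinables are caught by the lemma-based branches first.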
) ) ) )
SentenceDFAnno2 <- SentenceDFAnno2 %>% mutate( "ToCheck"= ifelse(str_detect(lemma, "_"), lemma, ifelse(str_detect(suffix,"^ā@$"), paste0(norm,"[aā] + alpha privativum?"), "") ) )
LemmaToChoose <- which(str_detect(SentenceDFAnno2$lemma,"_"))
ClearLem <- unlist(regmatches(SentenceDFAnno2$lemma, gregexpr("^([a-z]|[āīūṛḷṇḍñṅḥśṣṭḍṃ])+$", SentenceDFAnno2$lemma, perl=TRUE)))
ClearLemTable <- as.data.frame(table(ClearLem))
ClearLemTable <- ClearLemTable[order(ClearLemTable$Freq, decreasing = TRUE),]
for ( tc in LemmaToChoose){
Lemma <- SentenceDFAnno2[tc, 8]
LemmaChoices <- gsub("_", "$\\|^", Lemma)
LemmaChoices <- gsub("^", "\\^", LemmaChoices)
LemmaChoices <- gsub("$", "\\$", LemmaChoices)
# print(paste0("LemmaChoices: ", LemmaChoices))
LemmaDF <- ClearLemTable[str_detect(as.character(ClearLemTable$ClearLem), LemmaChoices),]
if(nrow(LemmaDF)>0){
LemmaDF <- droplevels(LemmaDF)
# print(paste0("LemmaDF: ", LemmaDF))
# print(paste0("as.character(LemmaDF[1,1]): ", as.character(LemmaDF[1,1])))
SentenceDFAnno2[tc, 8] <- as.character(LemmaDF[1,1]) # pick the most frequent unambiguous lemma in the sentence; this assignment must be live for the loop to have any effect
}
}
SentenceDFAnno2$lemma <- gsub("(^karma$|^rāja$|^ātma$)","\\1n", SentenceDFAnno2$lemma )
SentenceDFAnno2$lemma <- gsub("^arha$","arhat", SentenceDFAnno2$lemma )
SentenceDFAnno2$lemma <- gsub("(^[aā]st?[iīyu]$|^syā[td]$|^sy[uū][ḥr]$)","as", SentenceDFAnno2$lemma )
# print(SentenceDFAnno2)
return(SentenceDFAnno2)
}
# TextName <- gsub("_Vert.txt", "", TextName)
Verticalizer <- function(TextForGramrels){
# TextName <- deparse(substitute(TextForGramrels))
# TextName <- gsub("_ForGramrels.txt", "", TextName)
Vert <- gsub(">", ">\n", TextForGramrels)
Vert <- gsub("- @ (Ā|E|O|Ū|AI|AU)", "- @ ^\\1", Vert)
Vert <- gsub(" @", "@", Vert)
Vert <- gsub("\\s+", ", ", Vert, perl=TRUE)
Vert <- gsub(", (?!-)(.)", ",,\n\\1", Vert, perl=TRUE)
Vert <- gsub("(-.*?),,", "\\1,", Vert, perl=TRUE)
Vert <- gsub("(.*?),,","\\1,,,", Vert)
Vert <- gsub("(.*?), -?(.*?),","\\L\\1\\2,\\U\\1,\\L\\2,", Vert, perl=TRUE)
Vert <- gsub(">,+",">", Vert, perl=TRUE)
## order of columns: word (as it appears in the text; compounded words are followed by '@'), STEM, suffix
# write(Vert, file=paste0(TextName," "))
return(Vert)
}
Verticalized <- Verticalizer(TextForGramrels)
# print(Verticalized)
SplitVert <- gsub("</s>", "</s>SPLITHERE", Verticalized, perl=TRUE) # this is to enable the split into sentences (NB the "</s>" pattern is a reconstruction; the original pattern was lost)
SplitByS <- str_split(SplitVert, "SPLITHERE")
SplitByS <- unlist(SplitByS)
sID <- 1
AllAnnoSent <- list()
for (s in SplitByS){
Sentence <- gsub("<.?.*?>", "", s)
SentenceSplit <- str_split(Sentence, "\n")
# print(paste0("SentenceSplit: ", SentenceSplit))
for( i in SentenceSplit){
LinesDF <- data.frame(matrix(ncol = 4))
colnames(LinesDF) <- c("word", "stem", "suffix","X")
# print(LinesDF)
Lines <- unlist(i)
for (l in Lines){
Line <- as.vector(l)
LineSplit <- str_split(Line, ",")
LineSplit <- unlist(LineSplit)
# print(nchar(LineSplit[1]))
if(length(LineSplit)==4 & nchar(LineSplit[1]) >0){
# print(LineSplit)
LinesDF <- rbind(LinesDF, LineSplit)
# print(paste0("LinesDF at line 236:", LinesDF))
}
}
LinesDF <- na.omit(LinesDF)
if(nrow(LinesDF)>0){
AnnotatedSent <- Annotator(LinesDF, NonStemmedWordlist)
# print(sID)
# print(AnnotatedSent)
LinesList <- list("S_ID"=sID, "text"=AnnotatedSent)
# print(LinesList)
AllAnnoSent <- list.append(AllAnnoSent, LinesList)
# write.table(as.data.frame(AnnotatedSent), file= paste0("./",TextName,"_LemmatisedByS",TextName,"_S",sID,"_Annotated.txt"), quote=F, sep=",", row.names=F)
write.table(as.data.frame(AnnotatedSent), file= paste0("./",TextName,"_LemmatisedByS/",sID), quote=F, sep=",", row.names=F)
sID <- sID + 1
}
}
# write(unlist(AllAnnoSent), file="SentencesAnnotated.txt")
}
AnnoFiles <- dir(paste0("./",TextName,"_LemmatisedByS/"))
AnnoFiles <- sort(as.numeric(AnnoFiles), decreasing=FALSE) # NB decreasing is an argument of sort(), not of as.numeric()
for (f in AnnoFiles){
t <- readtext(paste0("./",TextName,"_LemmatisedByS/", f))
t <- t$text
ID <- str_extract(f, "\\d+")
xmlID <- paste0('<s id="', ID, '">') # rebuild the sentence element header (the original string was lost; the id format follows XMLelementCounter() above)
t <- paste(xmlID, t, sep = "\n\n")
t <- paste(t, '</s>\n', sep = "\n\n") # close the sentence element (reconstructed)
cat(t, file= paste0("./",TextName,"_LemmatisedByS/",TextName,"_Lemmatised.txt"), sep = "\n", append = TRUE)
}
LemmatizedText <- readtext(paste0("./",TextName,"_LemmatisedByS/",TextName,"_Lemmatised.txt"))
LemmatizedText <- LemmatizedText$text
LemmatizedText <- gsub("id,word,stem,suffix,dep.rel,dep.head,norm,lemma,PoS,ToCheck", "", LemmatizedText)
Header <- str_extract(TextForGramrels, "<doc.*?>") # assumes the gramrel file opens with a <doc ...> header (the original pattern was lost)
# LemmatizedText <- paste(Header, LemmatizedText, sep="\n")
LemmatizedText <- paste(Header, LemmatizedText, sep="\n\n")
LemmatizedText <- paste(LemmatizedText, "</doc>\n", sep="\n") # close the doc element (reconstructed)
write(LemmatizedText, file= paste0(OutputDirectory,"/",TextName,"_Lemmatised.txt"), append=FALSE) # this write must be live: the working directory is deleted below, so the lemmatised text survives only in OutputDirectory
return(AllAnnoSent)
}
for (f in TextFiles) {
FilePath <- paste0(DirectoryToLemmatise,"/",f)
print(FilePath)
Text <- readtext(FilePath)
Text <- Text$text
# print(Text)
TextName <- str_remove_all(f, ".txt") # change this to: str_remove_all(Text$doc_id, ".txt")
SentenceAnnotator(TextName, Text, NonStemmedWordlist, OutputDirectory)
unlink(paste0(TextName,"_LemmatisedByS"), recursive=TRUE)
}
system("say Your corpus is lemmatized") # macOS-only; comment this line out on other systems
}
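## example call (hypothetical paths, following the convention of the Segmenter call above):
# Lemmatiser_beta(DirectoryToLemmatise="FILL_WITH_PathToGramrelFilesWithCountID", NonStemmedWordlist, OutputDirectory="FILL_WITH_PathToYourOutputDirectory")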
file="Final_SegClean_July19.csv") ## download and read into Gold set of manually segmented and stemmed senteces: Gold <- read.csv("./Lugli_Segmenter_Eva_AllGoldSent.csv", stringsAsFactors = FALSE, header=FALSE ) All_GoldSeg_DF <- data.frame(Gold, Final_SegClean) colnames(All_GoldSeg_DF) <- c("Gold", "Seg") # run functions required for evaluation DataFrameTokens_CAPS <- function(Cleaned_Segmented_Text){ Textname <- deparse(substitute(Cleaned_Segmented_Text)) Text_TokensDF <- as.data.frame(table(unlist(str_extract_all(Cleaned_Segmented_Text,"-([A-Z]|[ĀĪŪṚḶṆḌÑṄḤŚṢṬḌṂ])+-")))) write.table(Text_TokensDF,file= paste(Textname,"TokensDF.csv", sep=""), quote=F,sep=",",row.names=F) return(Text_TokensDF) } Gold_Seg_Evaluator <- function(Gold_SegDF){ Precision <- c(length=nrow(Gold_SegDF)) Recall <- c(length=nrow(Gold_SegDF)) Accuracy <- c(length=nrow(Gold_SegDF)) F1 <- c(length=nrow(Gold_SegDF)) PrecisionLowercase <- c(length=nrow(Gold_SegDF)) RecallLowercase <- c(length=nrow(Gold_SegDF)) AccuracyLowercase <- c(length=nrow(Gold_SegDF)) F1lowercase <- c(length=nrow(Gold_SegDF)) # Correct <-c(length=nrow(Gold_SegDF)) # Wrong <-c(length=nrow(Gold_SegDF)) for (i in 1:nrow(Gold_SegDF)){ # evaluate stem tokens by comparing uppercase strings: GoldDF <- DataFrameTokens_CAPS(Gold_SegDF[i,1]) SegDF <- DataFrameTokens_CAPS(Gold_SegDF[i,2]) # print( head(SegDF)) # print( head(GoldDF)) Gold_Seg_TokenDF <- full_join(SegDF, GoldDF, by="Var1") colnames(Gold_Seg_TokenDF) <- c("token", "seg", "gold") Gold_Seg_TokenDF[is.na(Gold_Seg_TokenDF)] <- 0 Gold_Seg_TokenDF <- Gold_Seg_TokenDF %>% mutate(Error= seg - gold) %>% mutate(Correct= ifelse(seg==gold, gold, ifelse(seg>gold,seg-Error, gold+Error))) %>% mutate(FalsePos= ifelse(seg>gold, seg-gold, 0)) %>% mutate(FalseNeg= ifelse(seg% mutate(Error= seg - gold) %>% mutate(Correct= ifelse(seg==gold, gold, ifelse(seg>gold,seg-Error, gold+Error))) %>% mutate(FalsePos= ifelse(seg>gold, seg-gold, 0)) %>% mutate(FalseNeg= ifelse(seg1){ # this means that the ngram in a subpart of longer ngrams GrepDF_Ordered <- GrepDF[order(nchar(as.character(GrepDF$ngrams)), decreasing = TRUE),] #print(paste0("grepped : ",i , " before reducing:")) #print(GrepDF_Ordered) GrepDF_Redux <- ngrams_Reducer(GrepDF_Ordered) #the mini DF with subtracted freq for shorter strings contained in longer strings #print("after reducing") #print(GrepDF_Redux) anyPseudoLemmatisedNgrams_DF[which(anyPseudoLemmatisedNgrams_DF$ngrams==StartWord), 2] <- GrepDF_Redux[which(GrepDF_Redux$ngrams==StartWord), 2] # the updated Master DF } } #StreamLinedToReturn <- anyPseudoLemmatisedNgrams_DF[!anyPseudoLemmatisedNgrams_DF[,2]==0 ,] #StreamLinedToReturn <- StreamLinedToReturn[StreamLinedToReturn$ngrams %in% StemmedWordlist$LEMMATA, ] #write.table(as.data.frame(StreamLinedToReturn),file= paste(DFname,"Redux.csv", sep=""), quote=F,sep=",",row.names=F) write.table(as.data.frame(anyPseudoLemmatisedNgrams_DF),file= paste(DFname,"Redux.csv", sep=""), quote=F,sep=",",row.names=F) system("say Done") return(anyPseudoLemmatisedNgrams_DF) #return(StreamLinedToReturn) }