This function automatically identifies sequences of tokens. The algorithm is based on Blaheta and Johnson's "Unsupervised Learning of Multi-Word Verbs".
findSequences(x, tokens, count_min, smooth = 0.001, nested = TRUE)
Blaheta, D., & Johnson, M. (2001). Unsupervised learning of multi-word verbs. Presented at the ACL/EACL Workshop on the Computational Extraction, Analysis and Exploitation of Collocations.
sents <- as.character(tokens(data_corpus_inaugural[1:10], what = "sentence"))
tokens <- tokens(sents, removePunct = TRUE)
tokens <- tokens_select(tokens, stopwords("english"), "remove", padding = TRUE)
types <- unique(as.character(tokens))

# extracting multi-part nouns
types_upper <- types[stringi::stri_detect_regex(types, "^([A-Z][a-z\\-]{2,})")]
seqs <- findSequences(as.tokenizedTexts(tokens), types_upper, count_min = 2)
head(seqs, 10)
#>               sequence len         z         p       mue
#> 5  National Government   2 -1.088709 0.8618589 -5.225623
#> 6           New States   2 -1.088709 0.8618589 -5.225623
#> 3   General Government   2 -1.335725 0.9091805 -5.078044
#> 7        United States   2 -1.568850 0.9416585 -4.926392
#> 1     Chief Magistrate   2 -1.779789 0.9624448 -7.306344
#> 2 Federal Constitution   2 -1.779789 0.9624448 -7.306344
#> 4        Great Britain   2 -2.055167 0.9800685 -7.658025

# types can be any words
types_lower <- types[stringi::stri_detect_regex(types, "^([a-z]+)$") & !types %in% stopwords()]
seqs2 <- findSequences(as.tokenizedTexts(tokens), types_lower, count_min = 3)
head(seqs2, 10)
#>             sequence len         z         p       mue
#> 9       good opinion   2 -2.010768 0.9778250 -5.707257
#> 22    public opinion   2 -2.010768 0.9778250 -5.707257
#> 7     foreign powers   2 -2.464523 0.9931402 -6.473839
#> 10        good sense   2 -2.464523 0.9931402 -6.473839
#> 13        human mind   2 -2.464523 0.9931402 -6.473839
#> 14      human nature   2 -2.464523 0.9931402 -6.473839
#> 21       public debt   2 -2.573772 0.9949702 -6.454851
#> 6    foreign nations   2 -2.774800 0.9972382 -6.409861
#> 1  ardent patriotism   2 -2.962134 0.9984724 -8.874784
#> 2       equal rights   2 -2.962134 0.9984724 -8.874784