This function automatically identifies contiguous collocations consisting of variable-length term sequences whose frequency is unlikely to have occurred by chance. The algorithm is based on Blaheta and Johnson's (2001) "Unsupervised Learning of Multi-Word Verbs".
#' Convert a sequences object back to a tokens object
#'
#' Retrieves the tokens stored in the `"tokens"` attribute of `x`, copies the
#' `"types"` attribute of `x` onto them, and sets the class of the result to
#' `c("tokens", "tokenizedTexts")`.
#'
#' @param x an object carrying `"tokens"` and `"types"` attributes
#'   (presumably the result of `sequences()`; verify against callers).
#' @return The stored tokens, with the `"types"` attribute taken from `x` and
#'   class `c("tokens", "tokenizedTexts")`.
#' @export
#' @method as.tokens sequences
#' @noRd
as.tokens.sequences <- function(x) {
  toks <- attr(x, "tokens")
  # The types travel with `x`, so re-attach them to the extracted tokens.
  attr(toks, "types") <- attr(x, "types")
  class(toks) <- c("tokens", "tokenizedTexts")
  toks
}
sequences(x, min_count = 2, size = 2, method = c("lambda", "lambda1"), smoothing = 0.5)
is.sequences(x)
Argument | Description |
---|---|
x | a tokens object |
min_count | minimum frequency of sequences for which parameters are estimated |
size | length of the collocations; the default is 2, and the maximum is 5. Use c(2,n) or 2:n to return collocations ranging from bigrams to n-grams. |
method | estimation method: "lambda" (the default) or "lambda1" |
smoothing | numeric smoothing parameter; the default is 0.5 |
`is.sequences` returns `TRUE` if the object is of class `sequences`, `FALSE` otherwise.
Blaheta, D., & Johnson, M. (2001). Unsupervised learning of multi-word verbs. Presented at the ACL/EACL Workshop on the Computational Extraction, Analysis and Exploitation of Collocations.
# Tokenize the inaugural corpus sentence by sentence, dropping punctuation.
toks <- tokens(
  corpus_segment(data_corpus_inaugural, what = "sentence"),
  remove_punct = TRUE
)
# Remove stopwords but keep padding so that only originally adjacent tokens
# can form a sequence.
toks <- tokens_select(toks, stopwords("english"), "remove", padding = TRUE)

# extracting multi-part proper nouns (capitalized terms)
toks <- tokens_select(
  toks, "^([A-Z][a-z\\-]{2,})",
  valuetype = "regex", case_insensitive = FALSE, padding = TRUE
)
seqs <- sequences(toks, size = 2:3)
head(seqs, 10)
#>             collocation count length   lambda     sigma      dice      pmi
#> 39        United States   157      2 6.336118 0.4286172 0.9319527 1.278173
#> 191  Federal Government    32      2 3.646094 0.3721631 0.5603448 1.890407
#> 238        Almighty God    15      2 6.763808 0.8296562 0.8157895 3.333246
#> 106       Chief Justice    13      2 5.515845 0.7414561 0.6279070 3.012774
#> 220       South America     6      2 5.908988 0.8818373 0.6500000 3.755192
#> 108    Chief Magistrate    10      2 5.576293 0.9078090 0.5384615 3.049141
#> 136       Supreme Court     4      2 6.647688 1.1359777 0.6923077 4.244917
#> 66    Divine Providence     3      2 5.885548 1.0427227 0.5833333 4.147754
#> 202 National Government    11      2 2.836326 0.5092672 0.2584270 1.767805
#> 32            Old World    10      2 8.247744 1.5435425 0.8400000 3.792932
#>            G2      chi2         z            p
#> 39  553.07890 536.87252 14.782696 0.000000e+00
#> 191 108.70100 187.04369  9.797032 0.000000e+00
#> 238 110.69891 428.66274  8.152543 2.220446e-16
#> 106  80.17375 265.57535  7.439207 5.062617e-14
#> 220  47.33845 273.31063  6.700769 1.036626e-11
#> 108  62.45004 213.65478  6.142584 4.059483e-10
#> 136  37.87774 311.12156  5.851953 2.429165e-09
#> 66   27.28012 218.58681  5.644404 8.287725e-09
#> 202  30.46242  53.48005  5.569427 1.277892e-08
#> 32   87.69169 462.90173  5.343386 4.561303e-08

# to return only trigrams
seqs <- sequences(toks, size = 3)
head(seqs, 10)
#>                  collocation count length      lambda    sigma      dice
#> 15 President Carter President     2      3  0.08917943 2.603866 0.1785714
#> 25    Chief Justice President     2      3 -2.07378039 2.949803 0.2142857
#> 17     United States Congress     2      3 -2.44524139 3.328397 0.5769231
#> 12   President Bush President     2      3 -1.75664726 2.282146 0.1704545
#> 14   President Vice President     2      3 -2.65458886 2.253641 0.1630435
#> 71        Vice President Bush     2      3 -5.03120656 2.652830 0.1785714
#>          pmi        G2      chi2           z         p
#> 15 1.4381662  5.887414  7.896617  0.03424885 0.4863394
#> 25 0.5908683  1.056418  1.213391 -0.70302338 0.7589794
#> 17 3.2299257 17.502472 62.335640 -0.73466035 0.7687268
#> 12 0.9273406  2.444195  3.044970 -0.76973481 0.7792714
#> 14 1.4381662  5.887414  7.896617 -1.17791115 0.8805840
#> 71 1.3328057  4.266731  6.263170 -1.89654291 0.9710559