txt <- c(d1 = "a b c d e g h", d2 = "a b e g h i j")
toks <- tokens(txt)
coll_bi <- textstat_collocations(toks, method = "lr", max_size = 2)
Currently, users can choose the concatenator of ngrams, but this makes consistent handling of tokens objects difficult.
attr(tokens_ngrams(toks, n = 2), 'types')
## [1] "a b" "b c" "c d" "d e" "e g" "g h" "b e" "h i" "i j"
attr(tokens_ngrams(toks, n = 2, concatenator = " "), 'types')
## Warning in tokens_concatenate(x, concatenator): You are making your life
## difficult by creating unigrams with whitespaces
## [1] "a b" "b c" "c d" "d e" "e g" "g h" "b e" "h i" "i j"
So, the concatenator of ngrams should always be the same character internally, making concatenator = NULL
by default. The internal concatenator is a whitespace (or a Unicode control character).
Concatenators should be chosen when tokens are printed:
print(tokens_ngrams(toks, n = 2), concatenator = ' ')
# tokens from 2 documents.
# d1 :
# [1] "a b" "b c" "c d" "d e" "e g" "g h"
#
# d2 :
# [1] "a b" "b e" "e g" "g h" "h i" "i j"
print(tokens_ngrams(toks, n = 2), concatenator = '_')
# tokens from 2 documents.
# d1 :
# [1] "a_b" "b_c" "c_d" "d_e" "e_g" "g_h"
#
# d2 :
# [1] "a_b" "b_e" "e_g" "g_h" "h_i" "i_j"
print(tokens_ngrams(toks, n = 2), concatenator = '+')
# tokens from 2 documents.
# d1 :
# [1] "a+b" "b+c" "c+d" "d+e" "e+g" "g+h"
#
# d2 :
# [1] "a+b" "b+e" "e+g" "g+h" "h+i" "i+j"
Similarly,
as.list(tokens_ngrams(toks, n = 2), concatenator = ' ')
# $d1
# [1] "a b" "b c" "c d" "d e" "e g" "g h"
#
# $d2
# [1] "a b" "b e" "e g" "g h" "h i" "i j"
as.list(tokens_ngrams(toks, n = 2), concatenator = '_')
# $d1
# [1] "a_b" "b_c" "c_d" "d_e" "e_g" "g_h"
#
# $d2
# [1] "a_b" "b_e" "e_g" "g_h" "h_i" "i_j"
as.list(tokens_ngrams(toks, n = 2), concatenator = '+')
# $d1
# [1] "a+b" "b+c" "c+d" "d+e" "e+g" "g+h"
#
# $d2
# [1] "a+b" "b+e" "e+g" "g+h" "h+i" "i+j"
We will then create a new function tokens_concatenate()
(or tokens_unigrams()
), which converts ngram tokens into unigrams by changing the internal concatenator to the user’s concatenator and resetting the ngrams value.
attr(tokens_ngrams(toks, n = 2), 'ngrams') # 2
## [1] 2
attr(tokens_ngrams(toks, n = 2), 'types')
## [1] "a b" "b c" "c d" "d e" "e g" "g h" "b e" "h i" "i j"
# [1] "a b" "b c" "c d" "d e" "e g" "g h" "b e" "h i" "i j"
attr(quanteda:::tokens_concatenate(tokens_ngrams(toks, n = 2)), 'ngrams') # 1
## Warning in quanteda:::tokens_concatenate(tokens_ngrams(toks, n = 2)): You
## are making your life difficult by creating unigrams with whitespaces
## [1] 1
attr(quanteda:::tokens_concatenate(tokens_ngrams(toks, n = 2), concatenator = '+'), 'types')
## [1] "a+b" "b+c" "c+d" "d+e" "e+g" "g+h" "b+e" "h+i" "i+j"
# [1] "a+b" "b+c" "c+d" "d+e" "e+g" "g+h" "b+e" "h+i" "i+j"
We would allow tokens concatenation to happen within tokens_ngrams()
if concatenator is not NULL.
attr(tokens_ngrams(toks, n = 2, concatenator = '+'), 'ngrams') # 1
## [1] 1
attr(tokens_ngrams(toks, n = 2, concatenator = '+'), 'types')
## [1] "a+b" "b+c" "c+d" "d+e" "e+g" "g+h" "b+e" "h+i" "i+j"
# [1] "a+b" "b+c" "c+d" "d+e" "e+g" "g+h" "b+e" "h+i" "i+j"
The above changes will make the behaviours of tokens_select()
and dfm_select()
more consistent, treating ngrams as sequences in dfm_select()
and splitting the features input to tokens_select()
by whitespaces only when the tokens are unigrams.
toks_bi <- tokens_ngrams(toks, n = 2)
toks_uni <- quanteda:::tokens_concatenate(tokens_ngrams(toks, n = 2), concatenator = '+')
# with bigrams
featnames(dfm_select(dfm(toks_bi), features = 'c d')) # "c d"
## [1] "c d"
tokens_select( toks_bi, features = 'c d')[[1]] # [1] "c d"
## [1] "c d"
# with bigrams as unigrams
featnames(dfm_select(dfm(toks_uni), features = 'c d')) # character(0) (and warning)
## character(0)
tokens_select( toks_uni, features = 'c d')[[1]] # character(0)
## character(0)
featnames(dfm_select(dfm(toks_uni), features = 'c+d')) # "c+d"
## [1] "c+d"
tokens_select( toks_uni, features = 'c+d')[[1]] # "c+d"
## [1] "c+d"
Users are free to concatenate ngrams with a whitespace using tokens_concatenate()
, but dfm_select()
and tokens_select()
will behave differently from each other (as they currently do).
toks_uni2 <- quanteda:::tokens_concatenate(tokens_ngrams(toks, n = 2), concatenator = ' ')
## Warning in quanteda:::tokens_concatenate(tokens_ngrams(toks, n = 2),
## concatenator = " "): You are making your life difficult by creating
## unigrams with whitespaces
featnames(dfm_select(dfm(toks_uni2), features = 'c d')) # "c d"
## [1] "c d"
tokens_select( toks_uni2, features = 'c d')[[1]] # character(0)
## character(0)
In this design, users need to use list()
(or sequences()
) only in special cases.
# select unigrams with a whitespace
tokens_select( toks_uni2, features = list('c d'))[[1]] # "c d"
## [1] "c d"
Based on the same principle as ngrams, we will set concatenator = NULL by default in tokens_compound()
. Collocation and dictionary objects work consistently, as they are whitespace-separated ngrams.
attr(tokens_compound(toks, coll_bi), 'types')
#[1] "c" "d" "i" "j" "a b" "e g" "g h"
attr(quanteda:::tokens_concatenate(tokens_compound(toks, coll_bi), concatenator = '_'), 'types')
#[1] "c" "d" "i" "j" "a_b" "e_g" "g_h"
attr(quanteda:::tokens_concatenate(tokens_compound(toks, coll_bi), concatenator = ''), 'types')
#[1] "c" "d" "i" "j" "ab" "eg" "gh"
Compounded tokens are treated as ngram tokens unless tokens_concatenate()
is applied.
toks_comp <- tokens_compound(toks, coll_bi)
attr(toks_comp, 'ngrams') # 1:2
attr(quanteda:::tokens_concatenate(toks_comp, concatenator = '_'), 'ngrams') # 1
dfm_select()
and tokens_select()
select the same feature.
featnames(dfm_select(dfm(toks_comp), c('a b', 'c d'))) # "a b"
tokens_select( toks_comp, c('a b', 'c d'))[[1]] # "a b"
Again, if users concatenate compounds with a whitespace, dfm_select()
and tokens_select()
will behave differently, but this is an intentional act.
toks_comp2 <- quanteda:::tokens_concatenate(tokens_compound(toks, coll_bi), concatenator = ' ')
featnames(dfm_select(dfm(toks_comp2), c('a b', 'c d'))) # "a b"
tokens_select( toks_comp2, c('a b', 'c d'))[[1]] # "c" "d"
tokens_select( toks_comp2, list('a b', 'c d'))[[1]] # "a b"
tokens_select( toks_comp2, list('a b', c('c', 'd')))[[1]] # "a b" "c" "d"