Test objects

txt <- c(d1 = "a b c d e g h",  d2 = "a b e g h i j")
toks <- tokens(txt)
coll_bi <- textstat_collocations(toks, method = "lr", max_size = 2)

Issues with concatenators

Currently, users can choose the concatenator for ngrams, but this makes consistent handling of tokens objects difficult.

attr(tokens_ngrams(toks, n = 2), 'types')
## [1] "a b" "b c" "c d" "d e" "e g" "g h" "b e" "h i" "i j"
attr(tokens_ngrams(toks, n = 2, concatenator = " "), 'types')
## Warning in tokens_concatenate(x, concatenator): You are making your life
## difficult by creating unigrams with whitespaces
## [1] "a b" "b c" "c d" "d e" "e g" "g h" "b e" "h i" "i j"

So, the concatenator of ngrams should always be the same character internally, making concatenator = NULL the default. The internal concatenator is a whitespace (or a Unicode control character).

Concatenators should be chosen when tokens are printed:

print(tokens_ngrams(toks, n = 2), concatenator = ' ')
# tokens from 2 documents.
# d1 :
# [1] "a b" "b c" "c d" "d e" "e g" "g h"
# 
# d2 :
# [1] "a b" "b e" "e g" "g h" "h i" "i j"

print(tokens_ngrams(toks, n = 2), concatenator = '_')
# tokens from 2 documents.
# d1 :
# [1] "a_b" "b_c" "c_d" "d_e" "e_g" "g_h"
# 
# d2 :
# [1] "a_b" "b_e" "e_g" "g_h" "h_i" "i_j"

print(tokens_ngrams(toks, n = 2), concatenator = '+')
# tokens from 2 documents.
# d1 :
# [1] "a+b" "b+c" "c+d" "d+e" "e+g" "g+h"
# 
# d2 :
# [1] "a+b" "b+e" "e+g" "g+h" "h+i" "i+j"

Similarly,

as.list(tokens_ngrams(toks, n = 2), concatenator = ' ')
# $d1
# [1] "a b" "b c" "c d" "d e" "e g" "g h"
# 
# $d2
# [1] "a b" "b e" "e g" "g h" "h i" "i j"

as.list(tokens_ngrams(toks, n = 2), concatenator = '_')
# $d1
# [1] "a_b" "b_c" "c_d" "d_e" "e_g" "g_h"
# 
# $d2
# [1] "a_b" "b_e" "e_g" "g_h" "h_i" "i_j"

as.list(tokens_ngrams(toks, n = 2), concatenator = '+')
# $d1
# [1] "a+b" "b+c" "c+d" "d+e" "e+g" "g+h"
# 
# $d2
# [1] "a+b" "b+e" "e+g" "g+h" "h+i" "i+j"

We will then create a new function tokens_concatenate() (or tokens_unigrams()), which converts ngram tokens into unigrams by changing the internal concatenator to the user’s concatenator, and resets the ngrams value.

attr(tokens_ngrams(toks, n = 2), 'ngrams') # 2
## [1] 2
attr(tokens_ngrams(toks, n = 2), 'types')
## [1] "a b" "b c" "c d" "d e" "e g" "g h" "b e" "h i" "i j"
# [1] "a b" "b c" "c d" "d e" "e g" "g h" "b e" "h i" "i j"

attr(quanteda:::tokens_concatenate(tokens_ngrams(toks, n = 2)), 'ngrams') # 1
## Warning in quanteda:::tokens_concatenate(tokens_ngrams(toks, n = 2)): You
## are making your life difficult by creating unigrams with whitespaces
## [1] 1
attr(quanteda:::tokens_concatenate(tokens_ngrams(toks, n = 2), concatenator = '+'), 'types')
## [1] "a+b" "b+c" "c+d" "d+e" "e+g" "g+h" "b+e" "h+i" "i+j"
# [1] "a+b" "b+c" "c+d" "d+e" "e+g" "g+h" "b+e" "h+i" "i+j"

We would allow tokens concatenation to happen within tokens_ngrams() if concatenator is not NULL.

attr(tokens_ngrams(toks, n = 2, concatenator = '+'), 'ngrams') # 1
## [1] 1
attr(tokens_ngrams(toks, n = 2, concatenator = '+'), 'types')
## [1] "a+b" "b+c" "c+d" "d+e" "e+g" "g+h" "b+e" "h+i" "i+j"
# [1] "a+b" "b+c" "c+d" "d+e" "e+g" "g+h" "b+e" "h+i" "i+j"

Selection on ngram tokens

The above changes will make the behaviours of tokens_select() and dfm_select() more consistent, treating ngrams as sequences in dfm_select() and splitting the features input to tokens_select() by whitespace only when tokens are unigrams.

toks_bi  <- tokens_ngrams(toks, n = 2) 
toks_uni <- quanteda:::tokens_concatenate(tokens_ngrams(toks, n = 2), concatenator = '+')

# with bigrams
featnames(dfm_select(dfm(toks_bi), features = 'c d'))     #     "c d"
## [1] "c d"
tokens_select(           toks_bi,  features = 'c d')[[1]] # [1] "c d"
## [1] "c d"
# with bigrams as unigrams
featnames(dfm_select(dfm(toks_uni), features = 'c d'))     # character(0) (and warning)
## character(0)
tokens_select(           toks_uni,  features = 'c d')[[1]] # character(0)
## character(0)
featnames(dfm_select(dfm(toks_uni), features = 'c+d'))     # "c+d"
## [1] "c+d"
tokens_select(           toks_uni,  features = 'c+d')[[1]] # "c+d"
## [1] "c+d"

Users are free to concatenate ngrams with a whitespace using tokens_concatenate(), but dfm_select() and tokens_select() will then behave differently from each other (as they currently do).

toks_uni2 <- quanteda:::tokens_concatenate(tokens_ngrams(toks, n = 2), concatenator = ' ')
## Warning in quanteda:::tokens_concatenate(tokens_ngrams(toks, n = 2),
## concatenator = " "): You are making your life difficult by creating
## unigrams with whitespaces
featnames(dfm_select(dfm(toks_uni2), features = 'c d'))           # "c d"
## [1] "c d"
tokens_select(           toks_uni2,  features = 'c d')[[1]]       # character(0)
## character(0)

In this design, users need to use list() (or sequences()) only in special cases.

# select unigrams with a whitespace
tokens_select(          toks_uni2,  features = list('c d'))[[1]] # "c d"
## [1] "c d"

Selection on compounded tokens

Based on the same principle as ngrams, we will set concatenator = NULL by default in tokens_compound(). Collocation and dictionary objects work consistently as they are whitespace-separated ngrams.

attr(tokens_compound(toks, coll_bi), 'types') 
#[1] "c"   "d"   "i"   "j"   "a b" "e g" "g h"

attr(quanteda:::tokens_concatenate(tokens_compound(toks, coll_bi), concatenator = '_'), 'types') 
#[1] "c"   "d"   "i"   "j"   "a_b" "e_g" "g_h"

attr(quanteda:::tokens_concatenate(tokens_compound(toks, coll_bi), concatenator = ''), 'types')
#[1] "c"   "d"   "i"   "j"   "ab" "eg" "gh"

Compounded tokens are treated as ngram tokens unless tokens_concatenate() is applied.

toks_comp <- tokens_compound(toks, coll_bi)
attr(toks_comp, 'ngrams') # 1:2
attr(quanteda:::tokens_concatenate(toks_comp, concatenator = '_'), 'ngrams') # 1

dfm_select() and tokens_select() select the same feature.

featnames(dfm_select(dfm(toks_comp)), c('a b', 'c d'))      # "a b"
tokens_select(           toks_comp,   c('a b', 'c d'))[[1]] # "a b"

Again, if users concatenate compounds with a whitespace, dfm_select() and tokens_select() will behave differently, but this is an intentional act.

toks_comp2 <- quanteda:::tokens_concatenate(tokens_compound(toks, coll_bi), concatenator = ' ')
featnames(dfm_select(dfm(toks_comp2)), c('a b', 'c d'))               # "a b"
tokens_select(           toks_comp2,   c('a b', 'c d'))[[1]]          # "c" "d"
tokens_select(           toks_comp2,   list('a b', 'c d'))[[1]]       # "a b"
tokens_select(           toks_comp2,   list('a b', c('c', 'd')))[[1]] # "a b" "c" "d"