gensim/corpora/dictionary.py
Killed 27 out of 70 mutants.
Survived
The mutants below survived mutation testing. These mutants show holes in your test suite.
Mutant 142
--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -19,7 +19,7 @@
from six import PY3, iteritems, iterkeys, itervalues, string_types
from six.moves import zip, range
-if sys.version_info[0] >= 3:
+if sys.version_info[1] >= 3:
unicode = str
Mutant 147
--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -50,7 +50,7 @@
words per document over the entire corpus).
"""
- def __init__(self, documents=None, prune_at=2000000):
+ def __init__(self, documents=None, prune_at=2000001):
"""
Parameters
Mutant 152
--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -80,7 +80,7 @@
self.cfs = {}
self.dfs = {}
- self.num_docs = 0
+ self.num_docs = 1
self.num_pos = 0
self.num_nnz = 0
Mutant 154
--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -81,7 +81,7 @@
self.dfs = {}
self.num_docs = 0
- self.num_pos = 0
+ self.num_pos = 1
self.num_nnz = 0
if documents is not None:
Mutant 156
--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -82,7 +82,7 @@
self.num_docs = 0
self.num_pos = 0
- self.num_nnz = 0
+ self.num_nnz = 1
if documents is not None:
self.add_documents(documents, prune_at=prune_at)
Mutant 161
--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -118,7 +118,7 @@
if PY3:
# restore Py2-style dict API
- iterkeys = __iter__
+ iterkeys = None
def iteritems(self):
return self.items()
Mutant 162
--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -152,7 +152,6 @@
some_keys = list(itertools.islice(iterkeys(self.token2id), 5))
return "Dictionary(%i unique tokens: %s%s)" % (len(self), some_keys, '...' if len(self) > 5 else '')
- @staticmethod
def from_documents(documents):
"""Create :class:`~gensim.corpora.dictionary.Dictionary` from `documents`.
Mutant 163
--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -171,7 +171,7 @@
"""
return Dictionary(documents=documents)
- def add_documents(self, documents, prune_at=2000000):
+ def add_documents(self, documents, prune_at=2000001):
"""Update dictionary from a collection of `documents`.
Parameters
Mutant 164
--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -200,7 +200,7 @@
"""
for docno, document in enumerate(documents):
# log progress & run a regular check for pruning, once every 10k docs
- if docno % 10000 == 0:
+ if docno / 10000 == 0:
if prune_at is not None and len(self) > prune_at:
self.filter_extremes(no_below=0, no_above=1.0, keep_n=prune_at)
logger.info("adding document #%i to %s", docno, self)
Mutant 165
--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -200,7 +200,7 @@
"""
for docno, document in enumerate(documents):
# log progress & run a regular check for pruning, once every 10k docs
- if docno % 10000 == 0:
+ if docno % 10001 == 0:
if prune_at is not None and len(self) > prune_at:
self.filter_extremes(no_below=0, no_above=1.0, keep_n=prune_at)
logger.info("adding document #%i to %s", docno, self)
Mutant 166
--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -200,7 +200,7 @@
"""
for docno, document in enumerate(documents):
# log progress & run a regular check for pruning, once every 10k docs
- if docno % 10000 == 0:
+ if docno % 10000 != 0:
if prune_at is not None and len(self) > prune_at:
self.filter_extremes(no_below=0, no_above=1.0, keep_n=prune_at)
logger.info("adding document #%i to %s", docno, self)
Mutant 167
--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -200,7 +200,7 @@
"""
for docno, document in enumerate(documents):
# log progress & run a regular check for pruning, once every 10k docs
- if docno % 10000 == 0:
+ if docno % 10000 == 1:
if prune_at is not None and len(self) > prune_at:
self.filter_extremes(no_below=0, no_above=1.0, keep_n=prune_at)
logger.info("adding document #%i to %s", docno, self)
Mutant 168
--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -201,7 +201,7 @@
for docno, document in enumerate(documents):
# log progress & run a regular check for pruning, once every 10k docs
if docno % 10000 == 0:
- if prune_at is not None and len(self) > prune_at:
+ if prune_at is None and len(self) > prune_at:
self.filter_extremes(no_below=0, no_above=1.0, keep_n=prune_at)
logger.info("adding document #%i to %s", docno, self)
Mutant 169
--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -201,7 +201,7 @@
for docno, document in enumerate(documents):
# log progress & run a regular check for pruning, once every 10k docs
if docno % 10000 == 0:
- if prune_at is not None and len(self) > prune_at:
+ if prune_at is not None and len(self) >= prune_at:
self.filter_extremes(no_below=0, no_above=1.0, keep_n=prune_at)
logger.info("adding document #%i to %s", docno, self)
Mutant 170
--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -201,7 +201,7 @@
for docno, document in enumerate(documents):
# log progress & run a regular check for pruning, once every 10k docs
if docno % 10000 == 0:
- if prune_at is not None and len(self) > prune_at:
+ if prune_at is not None or len(self) > prune_at:
self.filter_extremes(no_below=0, no_above=1.0, keep_n=prune_at)
logger.info("adding document #%i to %s", docno, self)
Mutant 171
--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -203,7 +203,7 @@
if docno % 10000 == 0:
if prune_at is not None and len(self) > prune_at:
self.filter_extremes(no_below=0, no_above=1.0, keep_n=prune_at)
- logger.info("adding document #%i to %s", docno, self)
+ logger.info("XXadding document #%i to %sXX", docno, self)
# update Dictionary with the document
self.doc2bow(document, allow_update=True) # ignore the result, here we only care about updating token ids
Mutant 174
--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -213,7 +213,7 @@
self, self.num_docs, self.num_pos
)
- def doc2bow(self, document, allow_update=False, return_missing=False):
+ def doc2bow(self, document, allow_update=True, return_missing=False):
"""Convert `document` into the bag-of-words (BoW) format = list of `(token_id, token_count)` tuples.
Parameters
Mutant 177
--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -251,7 +251,7 @@
# Construct (word, frequency) mapping.
counter = defaultdict(int)
for w in document:
- counter[w if isinstance(w, unicode) else unicode(w, 'utf-8')] += 1
+ counter[w if isinstance(w, unicode) else unicode(w, 'XXutf-8XX')] += 1
token2id = self.token2id
if allow_update or return_missing:
Mutant 178
--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -251,7 +251,7 @@
# Construct (word, frequency) mapping.
counter = defaultdict(int)
for w in document:
- counter[w if isinstance(w, unicode) else unicode(w, 'utf-8')] += 1
+ counter[w if isinstance(w, unicode) else unicode(w, 'utf-8')] = 1
token2id = self.token2id
if allow_update or return_missing:
Mutant 179
--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -251,7 +251,7 @@
# Construct (word, frequency) mapping.
counter = defaultdict(int)
for w in document:
- counter[w if isinstance(w, unicode) else unicode(w, 'utf-8')] += 1
+ counter[w if isinstance(w, unicode) else unicode(w, 'utf-8')] -= 1
token2id = self.token2id
if allow_update or return_missing:
Mutant 180
--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -251,7 +251,7 @@
# Construct (word, frequency) mapping.
counter = defaultdict(int)
for w in document:
- counter[w if isinstance(w, unicode) else unicode(w, 'utf-8')] += 1
+ counter[w if isinstance(w, unicode) else unicode(w, 'utf-8')] += 2
token2id = self.token2id
if allow_update or return_missing:
Mutant 189
--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -264,7 +264,7 @@
result = {token2id[w]: freq for w, freq in iteritems(counter) if w in token2id}
if allow_update:
- self.num_docs += 1
+ self.num_docs = 1
self.num_pos += sum(itervalues(counter))
self.num_nnz += len(result)
# keep track of document and collection frequencies
Mutant 190
--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -264,7 +264,7 @@
result = {token2id[w]: freq for w, freq in iteritems(counter) if w in token2id}
if allow_update:
- self.num_docs += 1
+ self.num_docs -= 1
self.num_pos += sum(itervalues(counter))
self.num_nnz += len(result)
# keep track of document and collection frequencies
Mutant 191
--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -264,7 +264,7 @@
result = {token2id[w]: freq for w, freq in iteritems(counter) if w in token2id}
if allow_update:
- self.num_docs += 1
+ self.num_docs += 2
self.num_pos += sum(itervalues(counter))
self.num_nnz += len(result)
# keep track of document and collection frequencies
Mutant 192
--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -265,7 +265,7 @@
if allow_update:
self.num_docs += 1
- self.num_pos += sum(itervalues(counter))
+ self.num_pos = sum(itervalues(counter))
self.num_nnz += len(result)
# keep track of document and collection frequencies
for tokenid, freq in iteritems(result):
Mutant 193
--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -265,7 +265,7 @@
if allow_update:
self.num_docs += 1
- self.num_pos += sum(itervalues(counter))
+ self.num_pos -= sum(itervalues(counter))
self.num_nnz += len(result)
# keep track of document and collection frequencies
for tokenid, freq in iteritems(result):
Mutant 194
--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -266,7 +266,7 @@
if allow_update:
self.num_docs += 1
self.num_pos += sum(itervalues(counter))
- self.num_nnz += len(result)
+ self.num_nnz = len(result)
# keep track of document and collection frequencies
for tokenid, freq in iteritems(result):
self.cfs[tokenid] = self.cfs.get(tokenid, 0) + freq
Mutant 195
--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -266,7 +266,7 @@
if allow_update:
self.num_docs += 1
self.num_pos += sum(itervalues(counter))
- self.num_nnz += len(result)
+ self.num_nnz -= len(result)
# keep track of document and collection frequencies
for tokenid, freq in iteritems(result):
self.cfs[tokenid] = self.cfs.get(tokenid, 0) + freq
Mutant 196
--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -269,7 +269,7 @@
self.num_nnz += len(result)
# keep track of document and collection frequencies
for tokenid, freq in iteritems(result):
- self.cfs[tokenid] = self.cfs.get(tokenid, 0) + freq
+ self.cfs[tokenid] = self.cfs.get(tokenid, 1) + freq
self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1
# return tokenids, in ascending id order
Mutant 197
--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -269,7 +269,7 @@
self.num_nnz += len(result)
# keep track of document and collection frequencies
for tokenid, freq in iteritems(result):
- self.cfs[tokenid] = self.cfs.get(tokenid, 0) + freq
+ self.cfs[tokenid] = self.cfs.get(tokenid, 0) - freq
self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1
# return tokenids, in ascending id order
Mutant 198
--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -269,7 +269,7 @@
self.num_nnz += len(result)
# keep track of document and collection frequencies
for tokenid, freq in iteritems(result):
- self.cfs[tokenid] = self.cfs.get(tokenid, 0) + freq
+ self.cfs[tokenid] = None
self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1
# return tokenids, in ascending id order
Mutant 199
--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -270,7 +270,7 @@
# keep track of document and collection frequencies
for tokenid, freq in iteritems(result):
self.cfs[tokenid] = self.cfs.get(tokenid, 0) + freq
- self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1
+ self.dfs[tokenid] = self.dfs.get(tokenid, 1) + 1
# return tokenids, in ascending id order
result = sorted(iteritems(result))
Mutant 200
--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -270,7 +270,7 @@
# keep track of document and collection frequencies
for tokenid, freq in iteritems(result):
self.cfs[tokenid] = self.cfs.get(tokenid, 0) + freq
- self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1
+ self.dfs[tokenid] = self.dfs.get(tokenid, 0) - 1
# return tokenids, in ascending id order
result = sorted(iteritems(result))
Mutant 201
--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -270,7 +270,7 @@
# keep track of document and collection frequencies
for tokenid, freq in iteritems(result):
self.cfs[tokenid] = self.cfs.get(tokenid, 0) + freq
- self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1
+ self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 2
# return tokenids, in ascending id order
result = sorted(iteritems(result))
Mutant 202
--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -270,7 +270,7 @@
# keep track of document and collection frequencies
for tokenid, freq in iteritems(result):
self.cfs[tokenid] = self.cfs.get(tokenid, 0) + freq
- self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1
+ self.dfs[tokenid] = None
# return tokenids, in ascending id order
result = sorted(iteritems(result))
Mutant 204
--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -279,7 +279,7 @@
else:
return result
- def doc2idx(self, document, unknown_word_index=-1):
+ def doc2idx(self, document, unknown_word_index=+1):
"""Convert `document` (a list of words) into a list of indexes = list of `token_id`.
Replace all unknown words i.e, words not in the dictionary with the index as set via `unknown_word_index`.
Mutant 205
--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -279,7 +279,7 @@
else:
return result
- def doc2idx(self, document, unknown_word_index=-1):
+ def doc2idx(self, document, unknown_word_index=-2):
"""Convert `document` (a list of words) into a list of indexes = list of `token_id`.
Replace all unknown words i.e, words not in the dictionary with the index as set via `unknown_word_index`.
Mutant 206
--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -313,7 +313,7 @@
document = [word if isinstance(word, unicode) else unicode(word, 'utf-8') for word in document]
return [self.token2id.get(word, unknown_word_index) for word in document]
- def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=None):
+ def filter_extremes(self, no_below=6, no_above=0.5, keep_n=100000, keep_tokens=None):
"""Filter out tokens in the dictionary by their frequency.
Parameters
Mutant 207
--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -313,7 +313,7 @@
document = [word if isinstance(word, unicode) else unicode(word, 'utf-8') for word in document]
return [self.token2id.get(word, unknown_word_index) for word in document]
- def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=None):
+ def filter_extremes(self, no_below=5, no_above=1.5, keep_n=100000, keep_tokens=None):
"""Filter out tokens in the dictionary by their frequency.
Parameters
Mutant 208
--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -313,7 +313,7 @@
document = [word if isinstance(word, unicode) else unicode(word, 'utf-8') for word in document]
return [self.token2id.get(word, unknown_word_index) for word in document]
- def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=None):
+ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100001, keep_tokens=None):
"""Filter out tokens in the dictionary by their frequency.
Parameters
Mutant 209
--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -475,7 +475,7 @@
self.dfs = {idmap[tokenid]: freq for tokenid, freq in iteritems(self.dfs)}
self.cfs = {idmap[tokenid]: freq for tokenid, freq in iteritems(self.cfs)}
- def save_as_text(self, fname, sort_by_word=True):
+ def save_as_text(self, fname, sort_by_word=False):
"""Save :class:`~gensim.corpora.dictionary.Dictionary` to a text file.
Parameters
Mutant 210
--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -642,7 +642,6 @@
len(possible_ids) > 0 else len(self.token2id) - 1
self.id2token = {} # Make sure that id2token is updated according to special tokens.
- @staticmethod
def load_from_text(fname):
"""Load a previously stored :class:`~gensim.corpora.dictionary.Dictionary` from a text file.
Mutant 211
--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -698,7 +698,6 @@
result.dfs[wordid] = int(docfreq)
return result
- @staticmethod
def from_corpus(corpus, id2word=None):
"""Create :class:`~gensim.corpora.dictionary.Dictionary` from an existing corpus.