gensim/corpora/dictionary.py

Killed 27 out of 70 mutants

Survived

The mutants listed below survived mutation testing: the test suite still passed with each change applied. These mutants show holes in your test suite.

Mutant 142

--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -19,7 +19,7 @@
 from six import PY3, iteritems, iterkeys, itervalues, string_types
 from six.moves import zip, range
 
-if sys.version_info[0] >= 3:
+if sys.version_info[1] >= 3:
     unicode = str
 
 

Mutant 147

--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -50,7 +50,7 @@
         words per document over the entire corpus).
 
     """
-    def __init__(self, documents=None, prune_at=2000000):
+    def __init__(self, documents=None, prune_at=2000001):
         """
 
         Parameters

Mutant 152

--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -80,7 +80,7 @@
         self.cfs = {}
         self.dfs = {}
 
-        self.num_docs = 0
+        self.num_docs = 1
         self.num_pos = 0
         self.num_nnz = 0
 

Mutant 154

--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -81,7 +81,7 @@
         self.dfs = {}
 
         self.num_docs = 0
-        self.num_pos = 0
+        self.num_pos = 1
         self.num_nnz = 0
 
         if documents is not None:

Mutant 156

--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -82,7 +82,7 @@
 
         self.num_docs = 0
         self.num_pos = 0
-        self.num_nnz = 0
+        self.num_nnz = 1
 
         if documents is not None:
             self.add_documents(documents, prune_at=prune_at)

Mutant 161

--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -118,7 +118,7 @@
 
     if PY3:
         # restore Py2-style dict API
-        iterkeys = __iter__
+        iterkeys = None
 
         def iteritems(self):
             return self.items()

Mutant 162

--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -152,7 +152,6 @@
         some_keys = list(itertools.islice(iterkeys(self.token2id), 5))
         return "Dictionary(%i unique tokens: %s%s)" % (len(self), some_keys, '...' if len(self) > 5 else '')
 
-    @staticmethod
     def from_documents(documents):
         """Create :class:`~gensim.corpora.dictionary.Dictionary` from `documents`.
 

Mutant 163

--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -171,7 +171,7 @@
         """
         return Dictionary(documents=documents)
 
-    def add_documents(self, documents, prune_at=2000000):
+    def add_documents(self, documents, prune_at=2000001):
         """Update dictionary from a collection of `documents`.
 
         Parameters

Mutant 164

--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -200,7 +200,7 @@
         """
         for docno, document in enumerate(documents):
             # log progress & run a regular check for pruning, once every 10k docs
-            if docno % 10000 == 0:
+            if docno / 10000 == 0:
                 if prune_at is not None and len(self) > prune_at:
                     self.filter_extremes(no_below=0, no_above=1.0, keep_n=prune_at)
                 logger.info("adding document #%i to %s", docno, self)

Mutant 165

--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -200,7 +200,7 @@
         """
         for docno, document in enumerate(documents):
             # log progress & run a regular check for pruning, once every 10k docs
-            if docno % 10000 == 0:
+            if docno % 10001 == 0:
                 if prune_at is not None and len(self) > prune_at:
                     self.filter_extremes(no_below=0, no_above=1.0, keep_n=prune_at)
                 logger.info("adding document #%i to %s", docno, self)

Mutant 166

--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -200,7 +200,7 @@
         """
         for docno, document in enumerate(documents):
             # log progress & run a regular check for pruning, once every 10k docs
-            if docno % 10000 == 0:
+            if docno % 10000 != 0:
                 if prune_at is not None and len(self) > prune_at:
                     self.filter_extremes(no_below=0, no_above=1.0, keep_n=prune_at)
                 logger.info("adding document #%i to %s", docno, self)

Mutant 167

--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -200,7 +200,7 @@
         """
         for docno, document in enumerate(documents):
             # log progress & run a regular check for pruning, once every 10k docs
-            if docno % 10000 == 0:
+            if docno % 10000 == 1:
                 if prune_at is not None and len(self) > prune_at:
                     self.filter_extremes(no_below=0, no_above=1.0, keep_n=prune_at)
                 logger.info("adding document #%i to %s", docno, self)

Mutant 168

--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -201,7 +201,7 @@
         for docno, document in enumerate(documents):
             # log progress & run a regular check for pruning, once every 10k docs
             if docno % 10000 == 0:
-                if prune_at is not None and len(self) > prune_at:
+                if prune_at is  None and len(self) > prune_at:
                     self.filter_extremes(no_below=0, no_above=1.0, keep_n=prune_at)
                 logger.info("adding document #%i to %s", docno, self)
 

Mutant 169

--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -201,7 +201,7 @@
         for docno, document in enumerate(documents):
             # log progress & run a regular check for pruning, once every 10k docs
             if docno % 10000 == 0:
-                if prune_at is not None and len(self) > prune_at:
+                if prune_at is not None and len(self) >= prune_at:
                     self.filter_extremes(no_below=0, no_above=1.0, keep_n=prune_at)
                 logger.info("adding document #%i to %s", docno, self)
 

Mutant 170

--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -201,7 +201,7 @@
         for docno, document in enumerate(documents):
             # log progress & run a regular check for pruning, once every 10k docs
             if docno % 10000 == 0:
-                if prune_at is not None and len(self) > prune_at:
+                if prune_at is not None or len(self) > prune_at:
                     self.filter_extremes(no_below=0, no_above=1.0, keep_n=prune_at)
                 logger.info("adding document #%i to %s", docno, self)
 

Mutant 171

--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -203,7 +203,7 @@
             if docno % 10000 == 0:
                 if prune_at is not None and len(self) > prune_at:
                     self.filter_extremes(no_below=0, no_above=1.0, keep_n=prune_at)
-                logger.info("adding document #%i to %s", docno, self)
+                logger.info("XXadding document #%i to %sXX", docno, self)
 
             # update Dictionary with the document
             self.doc2bow(document, allow_update=True)  # ignore the result, here we only care about updating token ids

Mutant 173

--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -209,7 +209,7 @@
             self.doc2bow(document, allow_update=True)  # ignore the result, here we only care about updating token ids
 
         logger.info(
-            "built %s from %i documents (total %i corpus positions)",
+            "XXbuilt %s from %i documents (total %i corpus positions)XX",
             self, self.num_docs, self.num_pos
         )
 

Mutant 174

--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -213,7 +213,7 @@
             self, self.num_docs, self.num_pos
         )
 
-    def doc2bow(self, document, allow_update=False, return_missing=False):
+    def doc2bow(self, document, allow_update=True, return_missing=False):
         """Convert `document` into the bag-of-words (BoW) format = list of `(token_id, token_count)` tuples.
 
         Parameters

Mutant 177

--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -251,7 +251,7 @@
         # Construct (word, frequency) mapping.
         counter = defaultdict(int)
         for w in document:
-            counter[w if isinstance(w, unicode) else unicode(w, 'utf-8')] += 1
+            counter[w if isinstance(w, unicode) else unicode(w, 'XXutf-8XX')] += 1
 
         token2id = self.token2id
         if allow_update or return_missing:

Mutant 178

--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -251,7 +251,7 @@
         # Construct (word, frequency) mapping.
         counter = defaultdict(int)
         for w in document:
-            counter[w if isinstance(w, unicode) else unicode(w, 'utf-8')] += 1
+            counter[w if isinstance(w, unicode) else unicode(w, 'utf-8')] = 1
 
         token2id = self.token2id
         if allow_update or return_missing:

Mutant 179

--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -251,7 +251,7 @@
         # Construct (word, frequency) mapping.
         counter = defaultdict(int)
         for w in document:
-            counter[w if isinstance(w, unicode) else unicode(w, 'utf-8')] += 1
+            counter[w if isinstance(w, unicode) else unicode(w, 'utf-8')] -= 1
 
         token2id = self.token2id
         if allow_update or return_missing:

Mutant 180

--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -251,7 +251,7 @@
         # Construct (word, frequency) mapping.
         counter = defaultdict(int)
         for w in document:
-            counter[w if isinstance(w, unicode) else unicode(w, 'utf-8')] += 1
+            counter[w if isinstance(w, unicode) else unicode(w, 'utf-8')] += 2
 
         token2id = self.token2id
         if allow_update or return_missing:

Mutant 189

--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -264,7 +264,7 @@
         result = {token2id[w]: freq for w, freq in iteritems(counter) if w in token2id}
 
         if allow_update:
-            self.num_docs += 1
+            self.num_docs = 1
             self.num_pos += sum(itervalues(counter))
             self.num_nnz += len(result)
             # keep track of document and collection frequencies

Mutant 190

--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -264,7 +264,7 @@
         result = {token2id[w]: freq for w, freq in iteritems(counter) if w in token2id}
 
         if allow_update:
-            self.num_docs += 1
+            self.num_docs -= 1
             self.num_pos += sum(itervalues(counter))
             self.num_nnz += len(result)
             # keep track of document and collection frequencies

Mutant 191

--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -264,7 +264,7 @@
         result = {token2id[w]: freq for w, freq in iteritems(counter) if w in token2id}
 
         if allow_update:
-            self.num_docs += 1
+            self.num_docs += 2
             self.num_pos += sum(itervalues(counter))
             self.num_nnz += len(result)
             # keep track of document and collection frequencies

Mutant 192

--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -265,7 +265,7 @@
 
         if allow_update:
             self.num_docs += 1
-            self.num_pos += sum(itervalues(counter))
+            self.num_pos = sum(itervalues(counter))
             self.num_nnz += len(result)
             # keep track of document and collection frequencies
             for tokenid, freq in iteritems(result):

Mutant 193

--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -265,7 +265,7 @@
 
         if allow_update:
             self.num_docs += 1
-            self.num_pos += sum(itervalues(counter))
+            self.num_pos -= sum(itervalues(counter))
             self.num_nnz += len(result)
             # keep track of document and collection frequencies
             for tokenid, freq in iteritems(result):

Mutant 194

--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -266,7 +266,7 @@
         if allow_update:
             self.num_docs += 1
             self.num_pos += sum(itervalues(counter))
-            self.num_nnz += len(result)
+            self.num_nnz = len(result)
             # keep track of document and collection frequencies
             for tokenid, freq in iteritems(result):
                 self.cfs[tokenid] = self.cfs.get(tokenid, 0) + freq

Mutant 195

--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -266,7 +266,7 @@
         if allow_update:
             self.num_docs += 1
             self.num_pos += sum(itervalues(counter))
-            self.num_nnz += len(result)
+            self.num_nnz -= len(result)
             # keep track of document and collection frequencies
             for tokenid, freq in iteritems(result):
                 self.cfs[tokenid] = self.cfs.get(tokenid, 0) + freq

Mutant 196

--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -269,7 +269,7 @@
             self.num_nnz += len(result)
             # keep track of document and collection frequencies
             for tokenid, freq in iteritems(result):
-                self.cfs[tokenid] = self.cfs.get(tokenid, 0) + freq
+                self.cfs[tokenid] = self.cfs.get(tokenid, 1) + freq
                 self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1
 
         # return tokenids, in ascending id order

Mutant 197

--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -269,7 +269,7 @@
             self.num_nnz += len(result)
             # keep track of document and collection frequencies
             for tokenid, freq in iteritems(result):
-                self.cfs[tokenid] = self.cfs.get(tokenid, 0) + freq
+                self.cfs[tokenid] = self.cfs.get(tokenid, 0) - freq
                 self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1
 
         # return tokenids, in ascending id order

Mutant 198

--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -269,7 +269,7 @@
             self.num_nnz += len(result)
             # keep track of document and collection frequencies
             for tokenid, freq in iteritems(result):
-                self.cfs[tokenid] = self.cfs.get(tokenid, 0) + freq
+                self.cfs[tokenid] = None
                 self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1
 
         # return tokenids, in ascending id order

Mutant 199

--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -270,7 +270,7 @@
             # keep track of document and collection frequencies
             for tokenid, freq in iteritems(result):
                 self.cfs[tokenid] = self.cfs.get(tokenid, 0) + freq
-                self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1
+                self.dfs[tokenid] = self.dfs.get(tokenid, 1) + 1
 
         # return tokenids, in ascending id order
         result = sorted(iteritems(result))

Mutant 200

--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -270,7 +270,7 @@
             # keep track of document and collection frequencies
             for tokenid, freq in iteritems(result):
                 self.cfs[tokenid] = self.cfs.get(tokenid, 0) + freq
-                self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1
+                self.dfs[tokenid] = self.dfs.get(tokenid, 0) - 1
 
         # return tokenids, in ascending id order
         result = sorted(iteritems(result))

Mutant 201

--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -270,7 +270,7 @@
             # keep track of document and collection frequencies
             for tokenid, freq in iteritems(result):
                 self.cfs[tokenid] = self.cfs.get(tokenid, 0) + freq
-                self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1
+                self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 2
 
         # return tokenids, in ascending id order
         result = sorted(iteritems(result))

Mutant 202

--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -270,7 +270,7 @@
             # keep track of document and collection frequencies
             for tokenid, freq in iteritems(result):
                 self.cfs[tokenid] = self.cfs.get(tokenid, 0) + freq
-                self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1
+                self.dfs[tokenid] = None
 
         # return tokenids, in ascending id order
         result = sorted(iteritems(result))

Mutant 204

--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -279,7 +279,7 @@
         else:
             return result
 
-    def doc2idx(self, document, unknown_word_index=-1):
+    def doc2idx(self, document, unknown_word_index=+1):
         """Convert `document` (a list of words) into a list of indexes = list of `token_id`.
         Replace all unknown words i.e, words not in the dictionary with the index as set via `unknown_word_index`.
 

Mutant 206

--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -313,7 +313,7 @@
         document = [word if isinstance(word, unicode) else unicode(word, 'utf-8') for word in document]
         return [self.token2id.get(word, unknown_word_index) for word in document]
 
-    def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=None):
+    def filter_extremes(self, no_below=6, no_above=0.5, keep_n=100000, keep_tokens=None):
         """Filter out tokens in the dictionary by their frequency.
 
         Parameters

Mutant 207

--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -313,7 +313,7 @@
         document = [word if isinstance(word, unicode) else unicode(word, 'utf-8') for word in document]
         return [self.token2id.get(word, unknown_word_index) for word in document]
 
-    def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=None):
+    def filter_extremes(self, no_below=5, no_above=1.5, keep_n=100000, keep_tokens=None):
         """Filter out tokens in the dictionary by their frequency.
 
         Parameters

Mutant 208

--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -313,7 +313,7 @@
         document = [word if isinstance(word, unicode) else unicode(word, 'utf-8') for word in document]
         return [self.token2id.get(word, unknown_word_index) for word in document]
 
-    def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=None):
+    def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100001, keep_tokens=None):
         """Filter out tokens in the dictionary by their frequency.
 
         Parameters

Mutant 209

--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -475,7 +475,7 @@
         self.dfs = {idmap[tokenid]: freq for tokenid, freq in iteritems(self.dfs)}
         self.cfs = {idmap[tokenid]: freq for tokenid, freq in iteritems(self.cfs)}
 
-    def save_as_text(self, fname, sort_by_word=True):
+    def save_as_text(self, fname, sort_by_word=False):
         """Save :class:`~gensim.corpora.dictionary.Dictionary` to a text file.
 
         Parameters

Mutant 210

--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -642,7 +642,6 @@
                                        len(possible_ids) > 0 else len(self.token2id) - 1
         self.id2token = {}  # Make sure that id2token is updated according to special tokens.
 
-    @staticmethod
     def load_from_text(fname):
         """Load a previously stored :class:`~gensim.corpora.dictionary.Dictionary` from a text file.
 

Mutant 211

--- gensim/corpora/dictionary.py
+++ gensim/corpora/dictionary.py
@@ -698,7 +698,6 @@
                 result.dfs[wordid] = int(docfreq)
         return result
 
-    @staticmethod
     def from_corpus(corpus, id2word=None):
         """Create :class:`~gensim.corpora.dictionary.Dictionary` from an existing corpus.