gensim/corpora/wikicorpus.py
Killed 0 out of 74 mutants
Survived
Survived mutation testing. These mutants show holes in your test suite.
Mutant 23
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -37,7 +37,7 @@
from six import raise_from
-logger = logging.getLogger(__name__)
+logger = None
ARTICLE_MIN_WORDS = 50
"""Ignore shorter articles (after full preprocessing)."""
Mutant 24
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -39,7 +39,7 @@
logger = logging.getLogger(__name__)
-ARTICLE_MIN_WORDS = 50
+ARTICLE_MIN_WORDS = 51
"""Ignore shorter articles (after full preprocessing)."""
# default thresholds for lengths of individual tokens
Mutant 25
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -39,7 +39,7 @@
logger = logging.getLogger(__name__)
-ARTICLE_MIN_WORDS = 50
+ARTICLE_MIN_WORDS = None
"""Ignore shorter articles (after full preprocessing)."""
# default thresholds for lengths of individual tokens
Mutant 26
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -43,7 +43,7 @@
"""Ignore shorter articles (after full preprocessing)."""
# default thresholds for lengths of individual tokens
-TOKEN_MIN_LEN = 2
+TOKEN_MIN_LEN = 3
TOKEN_MAX_LEN = 15
RE_P0 = re.compile(r'', re.DOTALL | re.UNICODE)
Mutant 27
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -43,7 +43,7 @@
"""Ignore shorter articles (after full preprocessing)."""
# default thresholds for lengths of individual tokens
-TOKEN_MIN_LEN = 2
+TOKEN_MIN_LEN = None
TOKEN_MAX_LEN = 15
RE_P0 = re.compile(r'', re.DOTALL | re.UNICODE)
Mutant 28
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -44,7 +44,7 @@
# default thresholds for lengths of individual tokens
TOKEN_MIN_LEN = 2
-TOKEN_MAX_LEN = 15
+TOKEN_MAX_LEN = 16
RE_P0 = re.compile(r'', re.DOTALL | re.UNICODE)
"""Comments."""
Mutant 29
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -44,7 +44,7 @@
# default thresholds for lengths of individual tokens
TOKEN_MIN_LEN = 2
-TOKEN_MAX_LEN = 15
+TOKEN_MAX_LEN = None
RE_P0 = re.compile(r'', re.DOTALL | re.UNICODE)
"""Comments."""
Mutant 30
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -46,7 +46,7 @@
TOKEN_MIN_LEN = 2
TOKEN_MAX_LEN = 15
-RE_P0 = re.compile(r'', re.DOTALL | re.UNICODE)
+RE_P0 = re.compile(r'XXXX', re.DOTALL | re.UNICODE)
"""Comments."""
RE_P1 = re.compile(r'[ ].*?)(]|/>)', re.DOTALL | re.UNICODE)
"""Footnotes."""
Mutant 31
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -46,7 +46,7 @@
TOKEN_MIN_LEN = 2
TOKEN_MAX_LEN = 15
-RE_P0 = re.compile(r'', re.DOTALL | re.UNICODE)
+RE_P0 = re.compile(r'', re.DOTALL & re.UNICODE)
"""Comments."""
RE_P1 = re.compile(r'[ ].*?)(]|/>)', re.DOTALL | re.UNICODE)
"""Footnotes."""
Mutant 32
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -46,7 +46,7 @@
TOKEN_MIN_LEN = 2
TOKEN_MAX_LEN = 15
-RE_P0 = re.compile(r'', re.DOTALL | re.UNICODE)
+RE_P0 = None
"""Comments."""
RE_P1 = re.compile(r'[ ].*?)(]|/>)', re.DOTALL | re.UNICODE)
"""Footnotes."""
Mutant 33
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -48,7 +48,7 @@
RE_P0 = re.compile(r'', re.DOTALL | re.UNICODE)
"""Comments."""
-RE_P1 = re.compile(r'[ ].*?)(]|/>)', re.DOTALL | re.UNICODE)
+RE_P1 = re.compile(r'XX[ ].*?)(]|/>)XX', re.DOTALL | re.UNICODE)
"""Footnotes."""
RE_P2 = re.compile(r'(\n\[\[[a-z][a-z][\w-]*:[^:\]]+\]\])+$', re.UNICODE)
"""Links to languages."""
Mutant 34
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -48,7 +48,7 @@
RE_P0 = re.compile(r'', re.DOTALL | re.UNICODE)
"""Comments."""
-RE_P1 = re.compile(r'[ ].*?)(]|/>)', re.DOTALL | re.UNICODE)
+RE_P1 = re.compile(r'[ ].*?)(]|/>)', re.DOTALL & re.UNICODE)
"""Footnotes."""
RE_P2 = re.compile(r'(\n\[\[[a-z][a-z][\w-]*:[^:\]]+\]\])+$', re.UNICODE)
"""Links to languages."""
Mutant 35
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -48,7 +48,7 @@
RE_P0 = re.compile(r'', re.DOTALL | re.UNICODE)
"""Comments."""
-RE_P1 = re.compile(r'[ ].*?)(]|/>)', re.DOTALL | re.UNICODE)
+RE_P1 = None
"""Footnotes."""
RE_P2 = re.compile(r'(\n\[\[[a-z][a-z][\w-]*:[^:\]]+\]\])+$', re.UNICODE)
"""Links to languages."""
Mutant 36
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -50,7 +50,7 @@
"""Comments."""
RE_P1 = re.compile(r'[ ].*?)(]|/>)', re.DOTALL | re.UNICODE)
"""Footnotes."""
-RE_P2 = re.compile(r'(\n\[\[[a-z][a-z][\w-]*:[^:\]]+\]\])+$', re.UNICODE)
+RE_P2 = re.compile(r'XX(\n\[\[[a-z][a-z][\w-]*:[^:\]]+\]\])+$XX', re.UNICODE)
"""Links to languages."""
RE_P3 = re.compile(r'{{([^}{]*)}}', re.DOTALL | re.UNICODE)
"""Template."""
Mutant 37
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -50,7 +50,7 @@
"""Comments."""
RE_P1 = re.compile(r'[ ].*?)(]|/>)', re.DOTALL | re.UNICODE)
"""Footnotes."""
-RE_P2 = re.compile(r'(\n\[\[[a-z][a-z][\w-]*:[^:\]]+\]\])+$', re.UNICODE)
+RE_P2 = None
"""Links to languages."""
RE_P3 = re.compile(r'{{([^}{]*)}}', re.DOTALL | re.UNICODE)
"""Template."""
Mutant 38
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -52,7 +52,7 @@
"""Footnotes."""
RE_P2 = re.compile(r'(\n\[\[[a-z][a-z][\w-]*:[^:\]]+\]\])+$', re.UNICODE)
"""Links to languages."""
-RE_P3 = re.compile(r'{{([^}{]*)}}', re.DOTALL | re.UNICODE)
+RE_P3 = re.compile(r'XX{{([^}{]*)}}XX', re.DOTALL | re.UNICODE)
"""Template."""
RE_P4 = re.compile(r'{{([^}]*)}}', re.DOTALL | re.UNICODE)
"""Template."""
Mutant 39
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -52,7 +52,7 @@
"""Footnotes."""
RE_P2 = re.compile(r'(\n\[\[[a-z][a-z][\w-]*:[^:\]]+\]\])+$', re.UNICODE)
"""Links to languages."""
-RE_P3 = re.compile(r'{{([^}{]*)}}', re.DOTALL | re.UNICODE)
+RE_P3 = re.compile(r'{{([^}{]*)}}', re.DOTALL & re.UNICODE)
"""Template."""
RE_P4 = re.compile(r'{{([^}]*)}}', re.DOTALL | re.UNICODE)
"""Template."""
Mutant 40
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -52,7 +52,7 @@
"""Footnotes."""
RE_P2 = re.compile(r'(\n\[\[[a-z][a-z][\w-]*:[^:\]]+\]\])+$', re.UNICODE)
"""Links to languages."""
-RE_P3 = re.compile(r'{{([^}{]*)}}', re.DOTALL | re.UNICODE)
+RE_P3 = None
"""Template."""
RE_P4 = re.compile(r'{{([^}]*)}}', re.DOTALL | re.UNICODE)
"""Template."""
Mutant 41
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -54,7 +54,7 @@
"""Links to languages."""
RE_P3 = re.compile(r'{{([^}{]*)}}', re.DOTALL | re.UNICODE)
"""Template."""
-RE_P4 = re.compile(r'{{([^}]*)}}', re.DOTALL | re.UNICODE)
+RE_P4 = re.compile(r'XX{{([^}]*)}}XX', re.DOTALL | re.UNICODE)
"""Template."""
RE_P5 = re.compile(r'\[(\w+):\/\/(.*?)(( (.*?))|())\]', re.UNICODE)
"""Remove URL, keep description."""
Mutant 42
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -54,7 +54,7 @@
"""Links to languages."""
RE_P3 = re.compile(r'{{([^}{]*)}}', re.DOTALL | re.UNICODE)
"""Template."""
-RE_P4 = re.compile(r'{{([^}]*)}}', re.DOTALL | re.UNICODE)
+RE_P4 = re.compile(r'{{([^}]*)}}', re.DOTALL & re.UNICODE)
"""Template."""
RE_P5 = re.compile(r'\[(\w+):\/\/(.*?)(( (.*?))|())\]', re.UNICODE)
"""Remove URL, keep description."""
Mutant 43
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -54,7 +54,7 @@
"""Links to languages."""
RE_P3 = re.compile(r'{{([^}{]*)}}', re.DOTALL | re.UNICODE)
"""Template."""
-RE_P4 = re.compile(r'{{([^}]*)}}', re.DOTALL | re.UNICODE)
+RE_P4 = None
"""Template."""
RE_P5 = re.compile(r'\[(\w+):\/\/(.*?)(( (.*?))|())\]', re.UNICODE)
"""Remove URL, keep description."""
Mutant 44
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -56,7 +56,7 @@
"""Template."""
RE_P4 = re.compile(r'{{([^}]*)}}', re.DOTALL | re.UNICODE)
"""Template."""
-RE_P5 = re.compile(r'\[(\w+):\/\/(.*?)(( (.*?))|())\]', re.UNICODE)
+RE_P5 = re.compile(r'XX\[(\w+):\/\/(.*?)(( (.*?))|())\]XX', re.UNICODE)
"""Remove URL, keep description."""
RE_P6 = re.compile(r'\[([^][]*)\|([^][]*)\]', re.DOTALL | re.UNICODE)
"""Simplify links, keep description."""
Mutant 45
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -56,7 +56,7 @@
"""Template."""
RE_P4 = re.compile(r'{{([^}]*)}}', re.DOTALL | re.UNICODE)
"""Template."""
-RE_P5 = re.compile(r'\[(\w+):\/\/(.*?)(( (.*?))|())\]', re.UNICODE)
+RE_P5 = None
"""Remove URL, keep description."""
RE_P6 = re.compile(r'\[([^][]*)\|([^][]*)\]', re.DOTALL | re.UNICODE)
"""Simplify links, keep description."""
Mutant 46
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -58,7 +58,7 @@
"""Template."""
RE_P5 = re.compile(r'\[(\w+):\/\/(.*?)(( (.*?))|())\]', re.UNICODE)
"""Remove URL, keep description."""
-RE_P6 = re.compile(r'\[([^][]*)\|([^][]*)\]', re.DOTALL | re.UNICODE)
+RE_P6 = re.compile(r'XX\[([^][]*)\|([^][]*)\]XX', re.DOTALL | re.UNICODE)
"""Simplify links, keep description."""
RE_P7 = re.compile(r'\n\[\[[iI]mage(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE)
"""Keep description of images."""
Mutant 47
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -58,7 +58,7 @@
"""Template."""
RE_P5 = re.compile(r'\[(\w+):\/\/(.*?)(( (.*?))|())\]', re.UNICODE)
"""Remove URL, keep description."""
-RE_P6 = re.compile(r'\[([^][]*)\|([^][]*)\]', re.DOTALL | re.UNICODE)
+RE_P6 = re.compile(r'\[([^][]*)\|([^][]*)\]', re.DOTALL & re.UNICODE)
"""Simplify links, keep description."""
RE_P7 = re.compile(r'\n\[\[[iI]mage(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE)
"""Keep description of images."""
Mutant 48
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -58,7 +58,7 @@
"""Template."""
RE_P5 = re.compile(r'\[(\w+):\/\/(.*?)(( (.*?))|())\]', re.UNICODE)
"""Remove URL, keep description."""
-RE_P6 = re.compile(r'\[([^][]*)\|([^][]*)\]', re.DOTALL | re.UNICODE)
+RE_P6 = None
"""Simplify links, keep description."""
RE_P7 = re.compile(r'\n\[\[[iI]mage(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE)
"""Keep description of images."""
Mutant 49
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -60,7 +60,7 @@
"""Remove URL, keep description."""
RE_P6 = re.compile(r'\[([^][]*)\|([^][]*)\]', re.DOTALL | re.UNICODE)
"""Simplify links, keep description."""
-RE_P7 = re.compile(r'\n\[\[[iI]mage(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE)
+RE_P7 = re.compile(r'XX\n\[\[[iI]mage(.*?)(\|.*?)*\|(.*?)\]\]XX', re.UNICODE)
"""Keep description of images."""
RE_P8 = re.compile(r'\n\[\[[fF]ile(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE)
"""Keep description of files."""
Mutant 50
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -60,7 +60,7 @@
"""Remove URL, keep description."""
RE_P6 = re.compile(r'\[([^][]*)\|([^][]*)\]', re.DOTALL | re.UNICODE)
"""Simplify links, keep description."""
-RE_P7 = re.compile(r'\n\[\[[iI]mage(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE)
+RE_P7 = None
"""Keep description of images."""
RE_P8 = re.compile(r'\n\[\[[fF]ile(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE)
"""Keep description of files."""
Mutant 51
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -62,7 +62,7 @@
"""Simplify links, keep description."""
RE_P7 = re.compile(r'\n\[\[[iI]mage(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE)
"""Keep description of images."""
-RE_P8 = re.compile(r'\n\[\[[fF]ile(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE)
+RE_P8 = re.compile(r'XX\n\[\[[fF]ile(.*?)(\|.*?)*\|(.*?)\]\]XX', re.UNICODE)
"""Keep description of files."""
RE_P9 = re.compile(r' ].*?)(|/>)', re.DOTALL | re.UNICODE)
"""External links."""
Mutant 52
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -62,7 +62,7 @@
"""Simplify links, keep description."""
RE_P7 = re.compile(r'\n\[\[[iI]mage(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE)
"""Keep description of images."""
-RE_P8 = re.compile(r'\n\[\[[fF]ile(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE)
+RE_P8 = None
"""Keep description of files."""
RE_P9 = re.compile(r' ].*?)(|/>)', re.DOTALL | re.UNICODE)
"""External links."""
Mutant 53
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -64,7 +64,7 @@
"""Keep description of images."""
RE_P8 = re.compile(r'\n\[\[[fF]ile(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE)
"""Keep description of files."""
-RE_P9 = re.compile(r' ].*?)(|/>)', re.DOTALL | re.UNICODE)
+RE_P9 = re.compile(r'XX ].*?)(|/>)XX', re.DOTALL | re.UNICODE)
"""External links."""
RE_P10 = re.compile(r'|/>)', re.DOTALL | re.UNICODE)
"""Math content."""
Mutant 54
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -64,7 +64,7 @@
"""Keep description of images."""
RE_P8 = re.compile(r'\n\[\[[fF]ile(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE)
"""Keep description of files."""
-RE_P9 = re.compile(r' ].*?)(|/>)', re.DOTALL | re.UNICODE)
+RE_P9 = re.compile(r' ].*?)(|/>)', re.DOTALL & re.UNICODE)
"""External links."""
RE_P10 = re.compile(r'|/>)', re.DOTALL | re.UNICODE)
"""Math content."""
Mutant 55
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -64,7 +64,7 @@
"""Keep description of images."""
RE_P8 = re.compile(r'\n\[\[[fF]ile(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE)
"""Keep description of files."""
-RE_P9 = re.compile(r' ].*?)(|/>)', re.DOTALL | re.UNICODE)
+RE_P9 = None
"""External links."""
RE_P10 = re.compile(r'|/>)', re.DOTALL | re.UNICODE)
"""Math content."""
Mutant 56
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -66,7 +66,7 @@
"""Keep description of files."""
RE_P9 = re.compile(r' ].*?)(|/>)', re.DOTALL | re.UNICODE)
"""External links."""
-RE_P10 = re.compile(r'|/>)', re.DOTALL | re.UNICODE)
+RE_P10 = re.compile(r'XX|/>)XX', re.DOTALL | re.UNICODE)
"""Math content."""
RE_P11 = re.compile(r'<(.*?)>', re.DOTALL | re.UNICODE)
"""All other tags."""
Mutant 57
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -66,7 +66,7 @@
"""Keep description of files."""
RE_P9 = re.compile(r' ].*?)(|/>)', re.DOTALL | re.UNICODE)
"""External links."""
-RE_P10 = re.compile(r'|/>)', re.DOTALL | re.UNICODE)
+RE_P10 = re.compile(r'|/>)', re.DOTALL & re.UNICODE)
"""Math content."""
RE_P11 = re.compile(r'<(.*?)>', re.DOTALL | re.UNICODE)
"""All other tags."""
Mutant 58
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -66,7 +66,7 @@
"""Keep description of files."""
RE_P9 = re.compile(r' ].*?)(|/>)', re.DOTALL | re.UNICODE)
"""External links."""
-RE_P10 = re.compile(r'|/>)', re.DOTALL | re.UNICODE)
+RE_P10 = None
"""Math content."""
RE_P11 = re.compile(r'<(.*?)>', re.DOTALL | re.UNICODE)
"""All other tags."""
Mutant 59
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -68,7 +68,7 @@
"""External links."""
RE_P10 = re.compile(r'|/>)', re.DOTALL | re.UNICODE)
"""Math content."""
-RE_P11 = re.compile(r'<(.*?)>', re.DOTALL | re.UNICODE)
+RE_P11 = re.compile(r'XX<(.*?)>XX', re.DOTALL | re.UNICODE)
"""All other tags."""
RE_P12 = re.compile(r'(({\|)|(\|-(?!\d))|(\|}))(.*?)(?=\n)', re.UNICODE)
"""Table formatting."""
Mutant 60
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -68,7 +68,7 @@
"""External links."""
RE_P10 = re.compile(r'|/>)', re.DOTALL | re.UNICODE)
"""Math content."""
-RE_P11 = re.compile(r'<(.*?)>', re.DOTALL | re.UNICODE)
+RE_P11 = re.compile(r'<(.*?)>', re.DOTALL & re.UNICODE)
"""All other tags."""
RE_P12 = re.compile(r'(({\|)|(\|-(?!\d))|(\|}))(.*?)(?=\n)', re.UNICODE)
"""Table formatting."""
Mutant 61
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -68,7 +68,7 @@
"""External links."""
RE_P10 = re.compile(r'|/>)', re.DOTALL | re.UNICODE)
"""Math content."""
-RE_P11 = re.compile(r'<(.*?)>', re.DOTALL | re.UNICODE)
+RE_P11 = None
"""All other tags."""
RE_P12 = re.compile(r'(({\|)|(\|-(?!\d))|(\|}))(.*?)(?=\n)', re.UNICODE)
"""Table formatting."""
Mutant 62
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -70,7 +70,7 @@
"""Math content."""
RE_P11 = re.compile(r'<(.*?)>', re.DOTALL | re.UNICODE)
"""All other tags."""
-RE_P12 = re.compile(r'(({\|)|(\|-(?!\d))|(\|}))(.*?)(?=\n)', re.UNICODE)
+RE_P12 = re.compile(r'XX(({\|)|(\|-(?!\d))|(\|}))(.*?)(?=\n)XX', re.UNICODE)
"""Table formatting."""
RE_P13 = re.compile(r'(?<=(\n[ ])|(\n\n)|([ ]{2})|(.\n)|(.\t))(\||\!)([^[\]\n]*?\|)*', re.UNICODE)
"""Table cell formatting."""
Mutant 63
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -70,7 +70,7 @@
"""Math content."""
RE_P11 = re.compile(r'<(.*?)>', re.DOTALL | re.UNICODE)
"""All other tags."""
-RE_P12 = re.compile(r'(({\|)|(\|-(?!\d))|(\|}))(.*?)(?=\n)', re.UNICODE)
+RE_P12 = None
"""Table formatting."""
RE_P13 = re.compile(r'(?<=(\n[ ])|(\n\n)|([ ]{2})|(.\n)|(.\t))(\||\!)([^[\]\n]*?\|)*', re.UNICODE)
"""Table cell formatting."""
Mutant 64
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -72,7 +72,7 @@
"""All other tags."""
RE_P12 = re.compile(r'(({\|)|(\|-(?!\d))|(\|}))(.*?)(?=\n)', re.UNICODE)
"""Table formatting."""
-RE_P13 = re.compile(r'(?<=(\n[ ])|(\n\n)|([ ]{2})|(.\n)|(.\t))(\||\!)([^[\]\n]*?\|)*', re.UNICODE)
+RE_P13 = re.compile(r'XX(?<=(\n[ ])|(\n\n)|([ ]{2})|(.\n)|(.\t))(\||\!)([^[\]\n]*?\|)*XX', re.UNICODE)
"""Table cell formatting."""
RE_P14 = re.compile(r'\[\[Category:[^][]*\]\]', re.UNICODE)
"""Categories."""
Mutant 65
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -72,7 +72,7 @@
"""All other tags."""
RE_P12 = re.compile(r'(({\|)|(\|-(?!\d))|(\|}))(.*?)(?=\n)', re.UNICODE)
"""Table formatting."""
-RE_P13 = re.compile(r'(?<=(\n[ ])|(\n\n)|([ ]{2})|(.\n)|(.\t))(\||\!)([^[\]\n]*?\|)*', re.UNICODE)
+RE_P13 = None
"""Table cell formatting."""
RE_P14 = re.compile(r'\[\[Category:[^][]*\]\]', re.UNICODE)
"""Categories."""
Mutant 66
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -74,7 +74,7 @@
"""Table formatting."""
RE_P13 = re.compile(r'(?<=(\n[ ])|(\n\n)|([ ]{2})|(.\n)|(.\t))(\||\!)([^[\]\n]*?\|)*', re.UNICODE)
"""Table cell formatting."""
-RE_P14 = re.compile(r'\[\[Category:[^][]*\]\]', re.UNICODE)
+RE_P14 = re.compile(r'XX\[\[Category:[^][]*\]\]XX', re.UNICODE)
"""Categories."""
RE_P15 = re.compile(r'\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE)
"""Remove File and Image templates."""
Mutant 67
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -74,7 +74,7 @@
"""Table formatting."""
RE_P13 = re.compile(r'(?<=(\n[ ])|(\n\n)|([ ]{2})|(.\n)|(.\t))(\||\!)([^[\]\n]*?\|)*', re.UNICODE)
"""Table cell formatting."""
-RE_P14 = re.compile(r'\[\[Category:[^][]*\]\]', re.UNICODE)
+RE_P14 = None
"""Categories."""
RE_P15 = re.compile(r'\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE)
"""Remove File and Image templates."""
Mutant 68
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -76,7 +76,7 @@
"""Table cell formatting."""
RE_P14 = re.compile(r'\[\[Category:[^][]*\]\]', re.UNICODE)
"""Categories."""
-RE_P15 = re.compile(r'\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE)
+RE_P15 = re.compile(r'XX\[\[([fF]ile:|[iI]mage)[^]]*(\]\])XX', re.UNICODE)
"""Remove File and Image templates."""
RE_P16 = re.compile(r'\[{2}(.*?)\]{2}', re.UNICODE)
"""Capture interlinks text and article linked"""
Mutant 69
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -76,7 +76,7 @@
"""Table cell formatting."""
RE_P14 = re.compile(r'\[\[Category:[^][]*\]\]', re.UNICODE)
"""Categories."""
-RE_P15 = re.compile(r'\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE)
+RE_P15 = None
"""Remove File and Image templates."""
RE_P16 = re.compile(r'\[{2}(.*?)\]{2}', re.UNICODE)
"""Capture interlinks text and article linked"""
Mutant 70
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -78,7 +78,7 @@
"""Categories."""
RE_P15 = re.compile(r'\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE)
"""Remove File and Image templates."""
-RE_P16 = re.compile(r'\[{2}(.*?)\]{2}', re.UNICODE)
+RE_P16 = re.compile(r'XX\[{2}(.*?)\]{2}XX', re.UNICODE)
"""Capture interlinks text and article linked"""
RE_P17 = re.compile(
r'(\n.{0,4}((bgcolor)|(\d{0,1}[ ]?colspan)|(rowspan)|(style=)|(class=)|(align=)|(scope=))(.*))|'
Mutant 71
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -78,7 +78,7 @@
"""Categories."""
RE_P15 = re.compile(r'\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE)
"""Remove File and Image templates."""
-RE_P16 = re.compile(r'\[{2}(.*?)\]{2}', re.UNICODE)
+RE_P16 = None
"""Capture interlinks text and article linked"""
RE_P17 = re.compile(
r'(\n.{0,4}((bgcolor)|(\d{0,1}[ ]?colspan)|(rowspan)|(style=)|(class=)|(align=)|(scope=))(.*))|'
Mutant 72
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -81,7 +81,7 @@
RE_P16 = re.compile(r'\[{2}(.*?)\]{2}', re.UNICODE)
"""Capture interlinks text and article linked"""
RE_P17 = re.compile(
- r'(\n.{0,4}((bgcolor)|(\d{0,1}[ ]?colspan)|(rowspan)|(style=)|(class=)|(align=)|(scope=))(.*))|'
+ r'XX(\n.{0,4}((bgcolor)|(\d{0,1}[ ]?colspan)|(rowspan)|(style=)|(class=)|(align=)|(scope=))(.*))|XX'
r'(^.{0,2}((bgcolor)|(\d{0,1}[ ]?colspan)|(rowspan)|(style=)|(class=)|(align=))(.*))',
re.UNICODE
)
Mutant 73
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -87,7 +87,7 @@
)
"""Table markup"""
IGNORED_NAMESPACES = [
- 'Wikipedia', 'Category', 'File', 'Portal', 'Template',
+ 'XXWikipediaXX', 'Category', 'File', 'Portal', 'Template',
'MediaWiki', 'User', 'Help', 'Book', 'Draft', 'WikiProject',
'Special', 'Talk'
]
Mutant 74
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -87,7 +87,7 @@
)
"""Table markup"""
IGNORED_NAMESPACES = [
- 'Wikipedia', 'Category', 'File', 'Portal', 'Template',
+ 'Wikipedia', 'XXCategoryXX', 'File', 'Portal', 'Template',
'MediaWiki', 'User', 'Help', 'Book', 'Draft', 'WikiProject',
'Special', 'Talk'
]
Mutant 75
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -87,7 +87,7 @@
)
"""Table markup"""
IGNORED_NAMESPACES = [
- 'Wikipedia', 'Category', 'File', 'Portal', 'Template',
+ 'Wikipedia', 'Category', 'XXFileXX', 'Portal', 'Template',
'MediaWiki', 'User', 'Help', 'Book', 'Draft', 'WikiProject',
'Special', 'Talk'
]
Mutant 76
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -87,7 +87,7 @@
)
"""Table markup"""
IGNORED_NAMESPACES = [
- 'Wikipedia', 'Category', 'File', 'Portal', 'Template',
+ 'Wikipedia', 'Category', 'File', 'XXPortalXX', 'Template',
'MediaWiki', 'User', 'Help', 'Book', 'Draft', 'WikiProject',
'Special', 'Talk'
]
Mutant 77
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -87,7 +87,7 @@
)
"""Table markup"""
IGNORED_NAMESPACES = [
- 'Wikipedia', 'Category', 'File', 'Portal', 'Template',
+ 'Wikipedia', 'Category', 'File', 'Portal', 'XXTemplateXX',
'MediaWiki', 'User', 'Help', 'Book', 'Draft', 'WikiProject',
'Special', 'Talk'
]
Mutant 78
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -88,7 +88,7 @@
"""Table markup"""
IGNORED_NAMESPACES = [
'Wikipedia', 'Category', 'File', 'Portal', 'Template',
- 'MediaWiki', 'User', 'Help', 'Book', 'Draft', 'WikiProject',
+ 'XXMediaWikiXX', 'User', 'Help', 'Book', 'Draft', 'WikiProject',
'Special', 'Talk'
]
"""MediaWiki namespaces that ought to be ignored."""
Mutant 79
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -88,7 +88,7 @@
"""Table markup"""
IGNORED_NAMESPACES = [
'Wikipedia', 'Category', 'File', 'Portal', 'Template',
- 'MediaWiki', 'User', 'Help', 'Book', 'Draft', 'WikiProject',
+ 'MediaWiki', 'XXUserXX', 'Help', 'Book', 'Draft', 'WikiProject',
'Special', 'Talk'
]
"""MediaWiki namespaces that ought to be ignored."""
Mutant 80
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -88,7 +88,7 @@
"""Table markup"""
IGNORED_NAMESPACES = [
'Wikipedia', 'Category', 'File', 'Portal', 'Template',
- 'MediaWiki', 'User', 'Help', 'Book', 'Draft', 'WikiProject',
+ 'MediaWiki', 'User', 'XXHelpXX', 'Book', 'Draft', 'WikiProject',
'Special', 'Talk'
]
"""MediaWiki namespaces that ought to be ignored."""
Mutant 81
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -88,7 +88,7 @@
"""Table markup"""
IGNORED_NAMESPACES = [
'Wikipedia', 'Category', 'File', 'Portal', 'Template',
- 'MediaWiki', 'User', 'Help', 'Book', 'Draft', 'WikiProject',
+ 'MediaWiki', 'User', 'Help', 'XXBookXX', 'Draft', 'WikiProject',
'Special', 'Talk'
]
"""MediaWiki namespaces that ought to be ignored."""
Mutant 82
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -88,7 +88,7 @@
"""Table markup"""
IGNORED_NAMESPACES = [
'Wikipedia', 'Category', 'File', 'Portal', 'Template',
- 'MediaWiki', 'User', 'Help', 'Book', 'Draft', 'WikiProject',
+ 'MediaWiki', 'User', 'Help', 'Book', 'XXDraftXX', 'WikiProject',
'Special', 'Talk'
]
"""MediaWiki namespaces that ought to be ignored."""
Mutant 83
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -88,7 +88,7 @@
"""Table markup"""
IGNORED_NAMESPACES = [
'Wikipedia', 'Category', 'File', 'Portal', 'Template',
- 'MediaWiki', 'User', 'Help', 'Book', 'Draft', 'WikiProject',
+ 'MediaWiki', 'User', 'Help', 'Book', 'Draft', 'XXWikiProjectXX',
'Special', 'Talk'
]
"""MediaWiki namespaces that ought to be ignored."""
Mutant 84
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -89,7 +89,7 @@
IGNORED_NAMESPACES = [
'Wikipedia', 'Category', 'File', 'Portal', 'Template',
'MediaWiki', 'User', 'Help', 'Book', 'Draft', 'WikiProject',
- 'Special', 'Talk'
+ 'XXSpecialXX', 'Talk'
]
"""MediaWiki namespaces that ought to be ignored."""
Mutant 85
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -89,7 +89,7 @@
IGNORED_NAMESPACES = [
'Wikipedia', 'Category', 'File', 'Portal', 'Template',
'MediaWiki', 'User', 'Help', 'Book', 'Draft', 'WikiProject',
- 'Special', 'Talk'
+ 'Special', 'XXTalkXX'
]
"""MediaWiki namespaces that ought to be ignored."""
Mutant 86
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -186,7 +186,7 @@
return legit_interlinks
-def filter_wiki(raw, promote_remaining=True, simplify_links=True):
+def filter_wiki(raw, promote_remaining=False, simplify_links=True):
"""Filter out wiki markup from `raw`, leaving only text.
Parameters
Mutant 87
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -186,7 +186,7 @@
return legit_interlinks
-def filter_wiki(raw, promote_remaining=True, simplify_links=True):
+def filter_wiki(raw, promote_remaining=True, simplify_links=False):
"""Filter out wiki markup from `raw`, leaving only text.
Parameters
Mutant 88
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -211,7 +211,7 @@
return remove_markup(text, promote_remaining, simplify_links)
-def remove_markup(text, promote_remaining=True, simplify_links=True):
+def remove_markup(text, promote_remaining=False, simplify_links=True):
"""Filter out wiki markup from `text`, leaving only text.
Parameters
Mutant 89
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -211,7 +211,7 @@
return remove_markup(text, promote_remaining, simplify_links)
-def remove_markup(text, promote_remaining=True, simplify_links=True):
+def remove_markup(text, promote_remaining=True, simplify_links=False):
"""Filter out wiki markup from `text`, leaving only text.
Parameters
Mutant 90
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -339,7 +339,7 @@
return s
-def tokenize(content, token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True):
+def tokenize(content, token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=False):
"""Tokenize a piece of text from Wikipedia.
Set `token_min_len`, `token_max_len` as character length (not bytes!) thresholds for individual tokens.
Mutant 91
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -389,7 +389,7 @@
return namespace
-_get_namespace = get_namespace
+_get_namespace = None
def extract_pages(f, filter_namespaces=False, filter_articles=None):
Mutant 92
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -392,7 +392,7 @@
_get_namespace = get_namespace
-def extract_pages(f, filter_namespaces=False, filter_articles=None):
+def extract_pages(f, filter_namespaces=True, filter_articles=None):
"""Extract pages from a MediaWiki database dump.
Parameters
Mutant 93
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -454,7 +454,7 @@
elem.clear()
-_extract_pages = extract_pages # for backward compatibility
+_extract_pages = None # for backward compatibility
def process_article(args, tokenizer_func=tokenize, token_min_len=TOKEN_MIN_LEN,
Mutant 94
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -458,7 +458,7 @@
def process_article(args, tokenizer_func=tokenize, token_min_len=TOKEN_MIN_LEN,
- token_max_len=TOKEN_MAX_LEN, lower=True):
+ token_max_len=TOKEN_MAX_LEN, lower=False):
"""Parse a Wikipedia article, extract all tokens.
Notes
Mutant 95
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -577,7 +577,7 @@
"""
def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None,
- filter_namespaces=('0',), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS,
+ filter_namespaces=('XX0XX',), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS,
token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True, filter_articles=None):
"""Initialize the corpus.
Mutant 96
--- gensim/corpora/wikicorpus.py
+++ gensim/corpora/wikicorpus.py
@@ -578,7 +578,7 @@
"""
def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None,
filter_namespaces=('0',), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS,
- token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True, filter_articles=None):
+ token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=False, filter_articles=None):
"""Initialize the corpus.
Unless a dictionary is provided, this scans the corpus once,