Source code for TELF.pre_processing.Vulture.detect_nonenglish

import re

COMMON_STOPWORDS = {
    'the', 'to', 'a', 'of', 'and', 'in', 'that', 'for', 'is', 'on', 'said', 'with', 'he', 'it',
    'was', 'as', 'at', 'but', 'by', 'his', 'be', 'have', 'are', 'from', 'i', 'has', 'not', 'an',
    'they', 'who', 'this', 'or', 'had', 'you', 'their', 'about', 'will', 'one', 'more', 'when',
    'we', 'would', 'new', 'were', 'been', 'if', 'she', 'can', 'out', 'all', 'up', 'which', 'her',
    'its', 'there', 'than', 'what', 'some', 'like', 'no', 'so', 'after', 'two', 'other', 'into',
    'also', 'just', 'them', 'him', 'most', 'over', 'do', 'could', 'only', 'now', 'because',
    'many', 'even', 'my', 'get', 'before', 'how', 'then', 'where', 'those', 'much', 'dont',
    'any', 'may', 'here', 'still', 'made', 'good', 'through', 'did', 'our', 'while', 'going',
    'off', 'me', 'these', 'think', 'against', 'such', 'since', 'down', 'very', 'another',
    'being', 'go', 'should', 'too', 'under', 'both', 'during', 'between'
}


def ascii_detect(text):
    """
    Find the total number of ASCII characters in the given text.

    Parameters
    ----------
    text : str
        String for which to find the number of ASCII characters

    Returns
    -------
    ascii_count : int
        Number of ASCII characters in text
    total_count : int
        Total number of characters in text (excluding whitespace)
    """
    text_no_whitespace = ''.join(text.split())  # remove whitespace
    ascii_count = sum(1 for c in text_no_whitespace if ord(c) < 128)  # count ASCII characters
    total_count = len(text_no_whitespace)  # count total characters
    return ascii_count, total_count
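
# Illustrative usage sketch (the sample string below is hypothetical, not
# from the module): whitespace is stripped before counting, so 'ö' is the
# only non-ASCII character here.
#
#   >>> ascii_detect("hello wörld")
#   (9, 10)
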
def simple_tokenize(text):
    """
    Tokenize text without doing any serious pre-processing.

    Parameters
    ----------
    text : str
        String that needs to be tokenized

    Returns
    -------
    tokens : list(str)
        The list of tokens from text
    """
    text = text.lower()  # convert the string to lowercase
    text = re.sub(r'[^a-z0-9\s-]+', '', text)  # remove all non-alphanumeric characters, except hyphens
    text = re.sub(r'\s+', ' ', text).strip()  # collapse extra white space
    return text.split()
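
# Illustrative usage sketch (the sample string below is hypothetical):
# punctuation is stripped, case is folded, and hyphenated words survive
# as single tokens.
#
#   >>> simple_tokenize("Hello, World! It's a test-case.")
#   ['hello', 'world', 'its', 'a', 'test-case']
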
def is_english(document: str, ascii_ratio=0.9, stopwords_used=0.3):
    """
    Performs heuristic language detection on the given document.

    Parameters
    ----------
    document : str
        The document text to check for English
    ascii_ratio : float, optional
        Minimum fraction of non-whitespace characters that must be ASCII.
        The default is 0.9.
    stopwords_used : float, optional
        Minimum fraction of tokens that must be common English stop words.
        The default is 0.3.

    Returns
    -------
    is_english : bool
        Is the document English (according to this heuristic)
    """
    ascii_count, total_count = ascii_detect(document)
    if total_count == 0:  # guard against empty documents
        return False
    if ascii_count / total_count < ascii_ratio:
        return False

    tokens = simple_tokenize(document)
    if not tokens:  # guard against documents with no usable tokens
        return False
    num_stopwords = sum(1 for t in tokens if t in COMMON_STOPWORDS)
    if num_stopwords / len(tokens) < stopwords_used:
        return False
    return True
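
# Illustrative usage sketch (both sample sentences are hypothetical): the
# first passes both the ASCII and stop-word thresholds; the German sentence
# passes the ASCII check but contains too few English stop words.
#
#   >>> is_english("The cat sat on the mat and it was happy.")
#   True
#   >>> is_english("Dies ist ein kurzer deutscher Satz über nichts.")
#   False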