Source code for TELF.pre_processing.Vulture.detect_language
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 22 10:13:30 2022
@author: maksim
"""
# from langdetect import detect
[docs]
def get_language(document: str, document_id: str, n_words_use: int) -> tuple:
"""
This method is currently disabled.
Performs language detection on the given documents
Parameters
----------
documents : dict
Dictionary of documents to clean. In this dictionary, keys are the unique document
identifiers, and values are the text to clean.
n_words_use : int
Number of tokens to use when detecting langauge.
Returns
-------
languages : dict
List of tuples with document ID and language pairs.
"""
return (document_id, "unknown")
"""
# empty text
if not isinstance(document, str) or len(document) == 0:
lang = "unknown"
# get tokens
tokens = document.split()
num_tokens = len(tokens)
if (num_tokens < n_words_use) or (n_words_use == -1):
target_text = document
else:
target_text = " ".join(tokens)[:n_words_use]
# detect language
try:
lang = detect(target_text)
except Exception:
lang = "unknown"
return (document_id, lang)
"""