In [1]:
import os
import glob
import pandas as pd
import re
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the tagged corpus: one stripped string per line of the file.
# The "utf-8-sig" codec transparently drops a leading BOM if the file has one.
with open("thainer-v2.0.txt", 'r', encoding='utf-8-sig') as f:
    raw_list = list(map(str.strip, f.read().splitlines()))

In [3]:
# Number of (stripped) lines read from the corpus file.
len(raw_list)

6564

In [4]:
# Peek at the last record to see the inline [TAG]...[/TAG] markup.
raw_list[-1]

'[ORGANIZATION]TikTok[/ORGANIZATION] ปรับปรุงฟีเจอร์ Live - ต้องอายุอย่างน้อย [AGO]18 ปี[/AGO] ถึงสามารถไลฟ์ได้'

In [5]:
def replace_tag(txt):
    """Apply a fixed series of literal substitutions to ``txt``.

    Every occurrence of the substring ``FACILITY`` becomes ``LOCATION``
    (wherever it appears in the string), and the literal markers ``[AGO]``,
    ``[/AGO]``, ``[T]`` and ``[/T]`` are removed while the text between them
    is kept.

    Args:
        txt: Input string.

    Returns:
        The string after all substitutions have been applied in order.
    """
    substitutions = (
        ("FACILITY", "LOCATION"),
        ("[AGO]", ""),
        ("[/AGO]", ""),
        ("[T]", ""),
        ("[/T]", ""),
    )
    for old, new in substitutions:
        txt = txt.replace(old, new)
    return txt

In [6]:
from easynertag import Engine
from pythainlp.tokenize import word_tokenize
from pythainlp.tag import pos_tag as simple_pos
from tqdm.auto import tqdm

In [7]:
def cut_word(txt):
    """Tokenize ``txt`` with PyThaiNLP's ``newmm`` engine after ``replace_tag``.

    Args:
        txt: Text to tokenize.

    Returns:
        Whatever ``word_tokenize`` returns for the substituted text.

    NOTE(review): the CoNLL output and the label list shown later in this
    notebook still contain AGO / FACILITY labels, so the substitutions made by
    ``replace_tag`` do not appear to reach the tag names when this function is
    used as the Engine tokenizer -- confirm how ``Engine`` calls it.
    """
    return word_tokenize(replace_tag(txt), engine="newmm")

In [8]:
# Tagged-text -> CoNLL-2002 converter, wired to the custom tokenizer above
# and PyThaiNLP's POS tagger.
builder = Engine(
    word_tokenize=cut_word,
    pos_tag=simple_pos,
)

In [9]:
# Sanity check on the last record; print() is used so the tab/newline
# separators render as columns rather than as an escaped string repr.
print(builder.text2conll2002(raw_list[-1]))

TikTok	B-ORGANIZATION
 	O
ปรับปรุง	O
ฟีเจอร์	O
 	O
Live	O
 	O
-	O
 	O
ต้อง	O
อายุ	O
อย่าง	O
น้อย	O
 	O
18	B-AGO
 	I-AGO
ปี	I-AGO
 	O
ถึง	O
สามารถ	O
ไลฟ์	O
ได้	O



In [10]:
# Convert every corpus line to its CoNLL-2002 string (with a progress bar).
list_text2conll2002 = [builder.text2conll2002(line) for line in tqdm(raw_list)]

100%|█████████████████████████████████████| 6564/6564 [00:03<00:00, 1714.12it/s]


In [11]:
# Inspect the first converted record as a list of "token<TAB>tag" rows.
list_text2conll2002[0].splitlines()

['http\tB-URL',
 '://\tI-URL',
 'www\tI-URL',
 '.\tI-URL',
 'bangkokhealth\tI-URL',
 '.\tI-URL',
 'com\tI-URL',
 '/\tI-URL',
 'healthnews\tI-URL',
 '_\tI-URL',
 'htdoc\tI-URL',
 '/\tI-URL',
 'healthnews\tI-URL',
 ' \tI-URL',
 '_\tI-URL',
 ' \tI-URL',
 'detail\tI-URL',
 '.\tI-URL',
 'asp\tI-URL',
 '?\tI-URL',
 'Number\tI-URL',
 '=\tI-URL',
 '10506\tI-URL']

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
# 60/20/20 train/validation/test split done in two passes with a fixed seed.
# Recipe from https://datascience.stackexchange.com/a/15136
train_val, test = train_test_split(list_text2conll2002, test_size=0.2, random_state=1)
# 0.25 of the remaining 80% is 20% of the full corpus.
train, val = train_test_split(train_val, test_size=0.25, random_state=1)

In [14]:
# Sizes of the three splits (train, validation, test).
len(train),len(val),len(test)

(3938, 1313, 1313)

In [15]:
# Ordered list of tag names; a tag's position in this list is its integer id.
tag2idx = []


def get_idx(t):
    """Return the integer id of tag ``t``, registering it on first sight.

    Ids are assigned in order of first appearance: an unseen tag is appended
    to the module-level ``tag2idx`` list and receives the next free index.

    Args:
        t: Tag name (e.g. ``"B-PERSON"``).

    Returns:
        The position of ``t`` in ``tag2idx``.
    """
    # A single list scan replaces the previous "build a set, test membership,
    # then scan again with .index()" sequence that ran on every call.
    try:
        return tag2idx.index(t)
    except ValueError:
        tag2idx.append(t)
        return len(tag2idx) - 1

In [17]:
def make_hf(list_conll2002):
    """Turn CoNLL-2002 strings into parallel token / tag-id lists.

    Each input string is split into lines; a line is kept only when it has
    exactly two tab-separated fields (token, tag). Tags are mapped to integer
    ids through ``get_idx``, which registers unseen tags as a side effect.
    Records that yield no usable line are dropped.

    Args:
        list_conll2002: Iterable of CoNLL-2002 formatted strings, one per record.

    Returns:
        A ``(words, tags)`` tuple where ``words[k]`` is the token list of the
        k-th kept record and ``tags[k]`` the matching list of tag ids.
    """
    all_words = []
    all_tags = []
    for record in list_conll2002:
        tokens = []
        tag_ids = []
        for row in record.splitlines():
            fields = row.split("\t")
            if len(fields) == 2:
                token, tag = fields
                tokens.append(token)
                # Diagnostic: dump the whole record when a malformed
                # closing-tag label slips through the converter.
                if tag == "B-/ORGANIZATION":
                    print(record)
                tag_ids.append(get_idx(tag))
        if tokens:
            all_words.append(tokens)
            all_tags.append(tag_ids)
    return all_words, all_tags

In [18]:
# Order matters: get_idx hands out tag ids on first sight, so the splits are
# processed train -> validation -> test.
train_hf, val_hf, test_hf = [make_hf(split) for split in (train, val, test)]

In [20]:
# Give the terse "T" labels a descriptive name. The rename is done in place,
# so the integer ids already stored in the *_hf lists stay valid.
# The membership guard makes the cell safe to re-run: an unguarded
# list.index() raises ValueError once the old name has been replaced
# (or if the tag never occurred in the data).
for _old_tag, _new_tag in (("B-T", "B-TEMPERATURE"), ("I-T", "I-TEMPERATURE")):
    if _old_tag in tag2idx:
        tag2idx[tag2idx.index(_old_tag)] = _new_tag

In [21]:
# Final label list; a tag's position is the integer id stored in the "ner" column.
tag2idx

['B-PERSON',
 'I-PERSON',
 'O',
 'B-ORGANIZATION',
 'B-LOCATION',
 'I-ORGANIZATION',
 'I-LOCATION',
 'B-DATE',
 'I-DATE',
 'B-TIME',
 'I-TIME',
 'B-MONEY',
 'I-MONEY',
 'B-FACILITY',
 'I-FACILITY',
 'B-URL',
 'I-URL',
 'B-PERCENT',
 'I-PERCENT',
 'B-LEN',
 'I-LEN',
 'B-AGO',
 'I-AGO',
 'B-LAW',
 'I-LAW',
 'B-PHONE',
 'I-PHONE',
 'B-EMAIL',
 'I-EMAIL',
 'B-ZIP',
 'B-TEMPERATURE',
 'I-TEMPERATURE',
 'B-DTAE',
 'I-DTAE',
 'B-DATA',
 'I-DATA']

In [28]:
# Per-split column dictionaries: {"words": token lists, "ner": tag-id lists}.
d = {
    split_name: {"words": split_words, "ner": split_tags}
    for split_name, (split_words, split_tags) in (
        ("train", train_hf),
        ("validation", val_hf),
        ("test", test_hf),
    )
}


In [29]:
from datasets import Dataset
import datasets

In [30]:
# Dataset schema: a sequence of string tokens plus a parallel sequence of
# class labels whose names come from tag2idx.
_words_column = datasets.Sequence(datasets.Value("string"))
_ner_column = datasets.Sequence(datasets.features.ClassLabel(names=tag2idx))
features = datasets.Features({"words": _words_column, "ner": _ner_column})

In [31]:
# Build one Dataset per split from the column dictionaries in `d`.
dataset = datasets.DatasetDict(
    {
        split_name: Dataset.from_dict(columns, features=features)
        for split_name, columns in d.items()
    }
)

In [32]:
# Show the splits with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['words', 'ner'],
        num_rows: 3938
    })
    validation: Dataset({
        features: ['words', 'ner'],
        num_rows: 1313
    })
    test: Dataset({
        features: ['words', 'ner'],
        num_rows: 1313
    })
})

In [33]:
# Upload every split to the Hub dataset repo "pythainlp/thainer-corpus-v2".
dataset.push_to_hub("pythainlp/thainer-corpus-v2")

Pushing split train to the Hub.
Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format: 100%|████████| 4/4 [00:00<00:00, 209.85ba/s][A

Upload 1 LFS files:   0%|                                 | 0/1 [00:00<?, ?it/s][A
Upload 1 LFS files: 100%|█████████████████████████| 1/1 [00:05<00:00,  5.49s/it][A
Pushing dataset shards to the dataset hub: 100%|██| 1/1 [00:09<00:00,  9.82s/it]
Pushing split validation to the Hub.
Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format: 100%|████████| 2/2 [00:00<00:00, 256.22ba/s][A

Upload 1 LFS files:   0%|                                 | 0/1 [00:00<?, ?it/s][A
Upload 1 LFS files: 100%|█████████████████████████| 1/1 [00:04<00:00,  4.28s/it][A
Pushing dataset shards to the dataset hub: 100%|██| 1/1 [00:08<00:00,  8.56s/it]
Pushing split test to the Hub.
Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00

In [34]:
# Also dump each split as a plain-text CoNLL file, records joined by a newline.
for _conll_path, _records in (
    ("train.conll", train),
    ("validation.conll", val),
    ("test.conll", test),
):
    with open(_conll_path, "w", encoding="utf-8") as f:
        f.write('\n'.join(_records))