In [1]:
import os
# Process-wide settings; these must be in place before the libraries that read them are used.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # expose only the first GPU to this kernel
os.environ["WANDB_PROJECT"] = "thainer-corpus-v2-wangchanberta-base-att-spm-uncased"
# The recorded run repeatedly logs "The current process just got forked, after parallelism has
# already been used"; the tokenizers library asks for this variable to be set explicitly.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
from datasets import load_dataset
# Load the ThaiNER v2 corpus from the Hugging Face Hub (train / validation / test splits).
dataset = load_dataset(path="pythainlp/thainer-corpus-v2")

Found cached dataset parquet (/root/.cache/huggingface/datasets/pythainlp___parquet/pythainlp--thainer-corpus-v2-1c60fca84b0610a8/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
# Hub id of the checkpoint that both the tokenizer and the model are loaded from.
pretrained = "airesearch/wangchanberta-base-att-spm-uncased"

In [4]:
from transformers import AutoTokenizer
# Tokenizer matching the `pretrained` checkpoint; tokenize_and_align_labels() reads this
# module-level name, so this cell must run before the dataset is mapped.
tokenizer = AutoTokenizer.from_pretrained(pretrained)

In [5]:
def tokenize_and_align_labels(examples, max_length=416):
    """Tokenize pre-split words and align word-level NER tags with sub-word tokens.

    Only the first sub-word token of each word keeps the word's tag; special
    tokens and continuation tokens are labelled -100 (compute_metrics skips
    positions labelled -100).

    Args:
        examples: A batch with a "words" column (one list of words per example)
            and a "ner" column (one list of tag ids per example, parallel to
            "words").
        max_length: Maximum number of tokens kept per example. The recorded run
            logged that this tokenizer has no predefined maximum length, so
            `truncation=True` on its own performed no truncation. Pass None to
            get that previous behaviour back.
            NOTE(review): 416 is assumed to be the sequence length WangchanBERTa
            was pre-trained with — confirm against the model card.

    Returns:
        The tokenizer output for the batch with an added "labels" entry holding
        one list of label ids per example, aligned with the produced tokens.
    """
    tokenized_inputs = tokenizer(
        examples["words"],
        truncation=True,
        max_length=max_length,
        is_split_into_words=True,
    )

    labels = []
    for i, label in enumerate(examples["ner"]):
        # word_ids maps every produced token back to the index of its source word
        # (None for special tokens).
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                # Special token, or a continuation piece of an already-labelled word.
                label_ids.append(-100)
            else:
                # First token of a new word carries that word's tag.
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [6]:
# Tokenize every split in batches; `dataset` itself is left untouched and the
# result (with the added "labels" column) is bound to a new name.
tokenized_ner = dataset.map(
    tokenize_and_align_labels,
    batched=True,
)

Loading cached processed dataset at /root/.cache/huggingface/datasets/pythainlp___parquet/pythainlp--thainer-corpus-v2-1c60fca84b0610a8/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-f8e644be8b4d9c36.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/pythainlp___parquet/pythainlp--thainer-corpus-v2-1c60fca84b0610a8/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8933e61b9584a714.arrow


Map:   0%|          | 0/1313 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [7]:
# Display the raw (untokenized) splits with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['words', 'ner'],
        num_rows: 3938
    })
    test: Dataset({
        features: ['words', 'ner'],
        num_rows: 1313
    })
    validation: Dataset({
        features: ['words', 'ner'],
        num_rows: 1313
    })
})

In [8]:
import numpy as np
# Tag names indexed by class id, used by compute_metrics() to turn ids back into strings.
# Read through the feature's public `names` attribute instead of the private `_int2str`.
label_list = dataset["train"].features["ner"].feature.names
def compute_metrics(p):
    """Score a batch of token-level predictions with seqeval.

    Args:
        p: A `(predictions, labels)` pair. `predictions` holds per-class scores
            whose third axis (axis=2) is the class axis; `labels` holds the
            reference label ids, with -100 marking positions to skip.

    Returns:
        A dict with the overall "precision", "recall", "f1" and "accuracy"
        reported by the module-level `seqeval` metric.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept_predictions = []
        kept_labels = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            # -100 marks positions that carry no label; leave them out of scoring.
            if gold_id == -100:
                continue
            kept_predictions.append(label_list[predicted_id])
            kept_labels.append(label_list[gold_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    metrics = {}
    for name in ("precision", "recall", "f1", "accuracy"):
        metrics[name] = results["overall_" + name]
    return metrics

In [9]:
from transformers import DataCollatorForTokenClassification

# Batch collator handed to the Trainer; built from the same tokenizer used for preprocessing.
# NOTE(review): presumably pads inputs and labels per batch — confirm against the transformers docs.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [10]:
import evaluate

# Load the seqeval metric. compute_metrics() reads this module-level `seqeval` name,
# so this cell has to run before any evaluation (including the one inside trainer.train()).
seqeval = evaluate.load("seqeval")

In [11]:
# Number of NER tag classes, taken from the feature's public `names` list
# rather than the private `_int2str` attribute.
num_labels = len(dataset["train"].features["ner"].feature.names)

In [12]:
# Id <-> tag-name mappings stored in the model config.
# Both are fresh dicts built from the feature's public `names` list, so the model
# config never holds a reference to the dataset feature's internal lookup table.
id2label = dict(enumerate(dataset["train"].features["ner"].feature.names))
label2id = {name: idx for idx, name in id2label.items()}

In [13]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# Token-classification model on top of the pretrained encoder. The load log reports the
# classifier weights as newly initialized, so they are learned during fine-tuning.
model = AutoModelForTokenClassification.from_pretrained(
    pretrained,
    label2id=label2id,
    id2label=id2label,
    num_labels=num_labels,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Some weights of the model checkpoint at airesearch/wangchanberta-base-att-spm-uncased were not used when initializing CamembertForTokenClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias']
- This IS expected if you are initializing CamembertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at airesearch/wangchanberta-base-att-spm-uncased and are newly initialized: ['classifier

In [14]:
# Fine-tuning configuration.
training_args = TrainingArguments(
    output_dir="thainer_corpus_v2_model",
    # Optimisation schedule.
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batch sizes (per device).
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint when done.
    # In the recorded run the post-training validation metrics match the epoch with the
    # lowest validation loss, not the final epoch.
    save_strategy="epoch",
    evaluation_strategy="epoch",
    load_best_model_at_end=True,
)

# Train on the tokenized train split and score each epoch on the validation split.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_ner["validation"],
    train_dataset=tokenized_ner["train"],
)

trainer.train()



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33mwannaphong[0m. Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You're using a CamembertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.166725,0.66928,0.773248,0.717518,0.954487
2,No log,0.127984,0.76306,0.82121,0.791068,0.964897
3,0.298600,0.118332,0.786086,0.84599,0.814938,0.967959
4,0.298600,0.111184,0.795116,0.867573,0.829766,0.970448
5,0.079300,0.112624,0.810998,0.872369,0.840565,0.972009
6,0.079300,0.107708,0.830337,0.873701,0.851467,0.973648
7,0.046200,0.109994,0.832746,0.880895,0.856144,0.974063
8,0.046200,0.114098,0.830905,0.881162,0.855295,0.974182
9,0.031800,0.115497,0.83894,0.877165,0.857627,0.974359
10,0.031800,0.115564,0.834763,0.881695,0.857587,0.974636


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=2470, training_loss=0.09711394445133595, metrics={'train_runtime': 834.7371, 'train_samples_per_second': 47.177, 'train_steps_per_second': 2.959, 'total_flos': 4396882130453232.0, 'train_loss': 0.09711394445133595, 'epoch': 10.0})

In [16]:
# Upload the fine-tuned weights and the tokenizer to the same Hub repository.
# NOTE(review): the recorded output shows a commit to
# "pythainlp/thainer-corpus-v2-wangchanberta-base-att-spm-uncased", not this repo id —
# confirm which repository is intended before re-running.
model.push_to_hub("pythainlp/thainer-corpus-v2-base-model")
tokenizer.push_to_hub("pythainlp/thainer-corpus-v2-base-model")

pytorch_model.bin:   0%|          | 0.00/419M [00:00<?, ?B/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

sentencepiece.bpe.model:   0%|          | 0.00/905k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/pythainlp/thainer-corpus-v2-wangchanberta-base-att-spm-uncased/commit/13ff66769b5637e38250272fb6704fe7efb510f3', commit_message='Upload tokenizer', commit_description='', oid='13ff66769b5637e38250272fb6704fe7efb510f3', pr_url=None, pr_revision=None, pr_num=None)

In [20]:
# Evaluate on the Trainer's configured eval_dataset, i.e. the validation split.
trainer.evaluate()

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.10770813375711441,
 'eval_precision': 0.830336794125095,
 'eval_recall': 0.873701039168665,
 'eval_f1': 0.8514671513892494,
 'eval_accuracy': 0.9736483416628805,
 'eval_runtime': 9.6752,
 'eval_samples_per_second': 135.707,
 'eval_steps_per_second': 8.579,
 'epoch': 10.0}

In [21]:
# Evaluate on the held-out test split, which was not used for training or per-epoch evaluation.
trainer.evaluate(tokenized_ner["test"])

{'eval_loss': 0.10092544555664062,
 'eval_precision': 0.8199168093956447,
 'eval_recall': 0.8781446540880503,
 'eval_f1': 0.8480323927622422,
 'eval_accuracy': 0.9724346779516247,
 'eval_runtime': 9.7825,
 'eval_samples_per_second': 134.22,
 'eval_steps_per_second': 8.485,
 'epoch': 10.0}