# IMPORTS
import time

import pandas as pd
from nltk import tokenize
from sklearn.model_selection import train_test_split
from tensorflow import convert_to_tensor
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.metrics import BinaryAccuracy, Precision, Recall
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import BertConfig, BertTokenizer, TFBertModel
# SET PARAMETERS
DATA_PATH=".../SDGs_merged_cleaned_onehot_no_zeros_no_duplicates_no_t13.h5"
SAVE_MODELS_TO=".../"
# READ DATA
tab=pd.read_hdf(DATA_PATH)
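# Assumed layout (inferred from the code below, not documented here):
# an "Abstract" text column among the first four columns, followed by
# one-hot target columns from tab.columns[4:] onwards.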
# SLICE DATA
def slice_data(dataframe, label):
    """Slices a dataframe of the structure:
    | text/abstract | label |
    Prepares data for binary classification
    training. For a given label, creates a new
    dataset in which the number of items belonging
    to the given label equals the number of items
    randomly sampled from all the other labels.
    """
    label_data=dataframe[dataframe[label]==1]
    label_data_len=len(label_data)
    temp_data=dataframe[dataframe[label]!=1].sample(n=label_data_len)
    label_data=label_data[["Abstract", label]]
    label_data=pd.concat([label_data, temp_data[["Abstract", label]]])
    label_data.columns=["Abstract", "Label"]
    return label_data
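# Illustrative usage (a sketch; "SDG.1" stands in for whatever one-hot
# target columns tab.columns[4:] actually holds):
#   balanced=slice_data(tab, "SDG.1")
#   balanced["Label"].value_counts()
#   # -> equal counts of 1 and 0, i.e. a balanced positive/negative set
#   #    for one binary classifier.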
# PREPARE DATA FOR BERT
def data_to_values(dataframe):
    """Extracts abstracts and labels
    from a dataframe as numpy arrays.
    """
    abstracts=dataframe.Abstract.values
    labels=dataframe.Label.values
    return abstracts, labels
def tokenize_abstracts(abstracts):
    """Prepends a '[CLS]' token to each abstract
    and appends a '[SEP]' token after each of its sentences.
    """
    t_abstracts=[]
    for abstract in abstracts:
        t_abstract="[CLS] "
        for sentence in tokenize.sent_tokenize(abstract):
            t_abstract=t_abstract + sentence + " [SEP] "
        t_abstracts.append(t_abstract)
    return t_abstracts
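# Illustrative (a made-up two-sentence abstract):
#   tokenize_abstracts(["Oceans warm. Coral reefs die."])
#   # -> ["[CLS] Oceans warm. [SEP] Coral reefs die. [SEP] "]
# Note: nltk's sent_tokenize needs the "punkt" data to be available,
# e.g. via nltk.download("punkt").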
tokenizer=BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
def b_tokenize_abstracts(t_abstracts, max_len=512):
    """Tokenizes texts with the
    'bert-base-multilingual-uncased' tokenizer,
    truncating each to max_len tokens.
    """
    b_t_abstracts=[tokenizer.tokenize(_)[:max_len] for _ in t_abstracts]
    return b_t_abstracts
def convert_to_ids(b_t_abstracts):
    """Converts tokens to their
    IDs in the BERT vocabulary.
    """
    input_ids=[tokenizer.convert_tokens_to_ids(_) for _ in b_t_abstracts]
    return input_ids
def abstracts_to_ids(abstracts):
    """Tokenizes abstracts and converts
    tokens to their IDs
    in the BERT vocabulary.
    """
    tokenized_abstracts=tokenize_abstracts(abstracts)
    b_tokenized_abstracts=b_tokenize_abstracts(tokenized_abstracts)
    ids=convert_to_ids(b_tokenized_abstracts)
    return ids
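# Illustrative end-to-end call (a sketch with made-up text):
#   abstracts_to_ids(["Clean water access improves health."])
#   # -> one list of integer token IDs per abstract, starting with the
#   #    ID of [CLS] and ending with the ID of [SEP].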
def pad_ids(input_ids, max_len=512):
    """Pads sequences of IDs
    to a fixed length of max_len.
    """
    p_input_ids=pad_sequences(input_ids, maxlen=max_len, dtype="long", truncating="post", padding="post")
    return p_input_ids
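# Illustrative (max_len lowered to 8 for readability; IDs are made up):
#   pad_ids([[101, 2474, 102]], max_len=8)
#   # -> array([[101, 2474, 102, 0, 0, 0, 0, 0]])
# "post" padding/truncating keeps the leading [CLS] token at position 0.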
def create_attention_masks(inputs):
    """Creates attention masks
    for given sequences: 1.0 for real
    tokens, 0.0 for padding.
    """
    masks=[]
    for seq in inputs:
        seq_mask=[float(i>0) for i in seq]
        masks.append(seq_mask)
    return masks
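# Illustrative, continuing the padded example above:
#   create_attention_masks([[101, 2474, 102, 0, 0, 0, 0, 0]])
#   # -> [[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]]
# BERT then attends only to real tokens, never to the zero padding.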
# CREATE MODEL
def create_model(label):
    """Builds a binary classifier for a given target label:
    a multilingual BERT encoder with a single sigmoid
    unit on top of its pooled output.
    """
    config=BertConfig.from_pretrained(
        "bert-base-multilingual-uncased",
        num_labels=2,
        hidden_dropout_prob=0.2,
        attention_probs_dropout_prob=0.2)
    bert=TFBertModel.from_pretrained(
        "bert-base-multilingual-uncased",
        config=config)
    bert_layer=bert.layers[0]
    input_ids_layer=Input(
        shape=(512,),
        name="input_ids",
        dtype="int32")
    input_attention_masks_layer=Input(
        shape=(512,),
        name="attention_masks",
        dtype="int32")
    bert_model=bert_layer(
        input_ids_layer,
        input_attention_masks_layer)
    # bert_model[1] is the pooled [CLS] representation.
    target_layer=Dense(
        units=1,
        kernel_initializer=TruncatedNormal(stddev=config.initializer_range),
        name="target_layer",
        activation="sigmoid")(bert_model[1])
    model=Model(
        inputs=[input_ids_layer, input_attention_masks_layer],
        outputs=target_layer,
        name="model_"+label.replace(".", "_"))
    optimizer=Adam(
        learning_rate=5e-05,
        epsilon=1e-08,
        decay=0.01,
        clipnorm=1.0)
    model.compile(
        optimizer=optimizer,
        loss="binary_crossentropy",
        metrics=[BinaryAccuracy(), Precision(), Recall()])
    return model
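# Illustrative usage (a sketch; "SDG.1" is a hypothetical target name):
#   model=create_model("SDG.1")
#   model.summary()   # mBERT encoder + a 1-unit sigmoid head, "model_SDG_1"
# Design note: the script builds one independent binary model per target
# rather than a single multi-label head, hence "many berts".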
# THE LOOP
histories=[]
test_scores=[]
elapsed_times=[]
for _ in tab.columns[4:]:
    print(f"PROCESSING TARGET {_}...")
    start_time=time.process_time()
    data=slice_data(tab, _)
    print("Data sliced.")
    abstracts, labels=data_to_values(data)
    ids=abstracts_to_ids(abstracts)
    print("Abstracts tokenized, tokens converted to ids.")
    padded_ids=pad_ids(ids)
    print("Sequences padded.")
    # 70/15/15 train/validation/test split.
    train_inputs, temp_inputs, train_labels, temp_labels=train_test_split(padded_ids, labels, random_state=1993, test_size=0.3)
    validation_inputs, test_inputs, validation_labels, test_labels=train_test_split(temp_inputs, temp_labels, random_state=1993, test_size=0.5)
    print("Data split into train, validation, test sets.")
    train_masks, validation_masks, test_masks=[create_attention_masks(_) for _ in [train_inputs, validation_inputs, test_inputs]]
    print("Attention masks created.")
    train_inputs, validation_inputs, test_inputs=[convert_to_tensor(_) for _ in [train_inputs, validation_inputs, test_inputs]]
    print("Inputs converted to tensors.")
    train_labels, validation_labels, test_labels=[convert_to_tensor(_) for _ in [train_labels, validation_labels, test_labels]]
    print("Labels converted to tensors.")
    train_masks, validation_masks, test_masks=[convert_to_tensor(_) for _ in [train_masks, validation_masks, test_masks]]
    print("Masks converted to tensors.")
    model=create_model(_)
    print("Model initialized.")
    history=model.fit([train_inputs, train_masks], train_labels,
                      batch_size=3,
                      epochs=3,
                      validation_data=([validation_inputs, validation_masks], validation_labels))
    histories.append(history)
    print(f"Model for {_} target trained.")
    model.save(SAVE_MODELS_TO+_.replace(".", "_")+".h5")
    print(f"Model for target {_} saved.")
    test_score=model.evaluate([test_inputs, test_masks], test_labels,
                              batch_size=3)
    elapsed_times.append(time.process_time()-start_time)
    test_scores.append(test_score)
    print(f"""Model for target {_} tested.
.
.
.""")
# SAVE STATISTICS
stats=pd.DataFrame(test_scores, columns=["loss", "accuracy", "precision", "recall"])
stats.insert(loc=0, column="target", value=tab.columns[4:])
stats.insert(loc=5, column="elapsed_time", value=elapsed_times)
stats.to_excel(SAVE_MODELS_TO+"_stats.xlsx", index=False)
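# Resulting spreadsheet layout, one row per target trained above:
#   target | loss | accuracy | precision | recall | elapsed_time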