Skip to content
Aurora-Network-Global /sdgs_many_berts Type # for issues and pull requests, > for commands, and ? for help Type # for issues, pull requests, and projects, > for commands, and ? for help Type # for issues, pull requests, and projects, / for files, and > for commands
We’ve encountered an error and some results aren't available at this time. Type a new search or try again later.
No results matched your search
Search for issues and pull requests # Search for issues, pull requests, discussions, and projects # Search for organizations, repositories, and users @ Search for projects ! Search for files / Activate command mode > Search your issues, pull requests, and discussions # author:@me Search your issues, pull requests, and discussions # author:@me Filter to pull requests # is:pr Filter to issues # is:issue Filter to discussions # is:discussion Filter to projects # is:project Filter to open issues, pull requests, and discussions # is:open
  • Watch

    Notifications

  • Fork

    Fork sdgs_many_berts

    Loading

    If this dialog fails to load, you can visit the fork page directly.

Open in github.dev
Permalink
@rbrtjwrk
Latest commit 0961408 Dec 4, 2021 History
1 contributor

Users who have contributed to this file

Loading
169 lines (126 sloc) 5.13 KB
# IMPORTS
import pandas as pd
import glob
from nltk import tokenize
from transformers import BertTokenizer, TFBertModel, BertConfig
from transformers.utils.dummy_tf_objects import TFBertMainLayer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow import convert_to_tensor
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import BinaryAccuracy, Precision, Recall
# SET PARAMETERS
# NOTE(review): the ".../" values are placeholders — set real paths before running.
# Each path is used with simple f-string concatenation below, so it must end
# with a path separator.
DATA=".../"  # directory holding the input texts/abstracts
MODELS=".../"  # directory containing the trained .h5 Keras models
SAVE_PREDICTIONS_TO=".../"  # directory where predictions.xlsx is written
# PREPROCESS TEXTS
def tokenize_abstracts(abstracts):
    """Wrap each abstract's sentences in BERT special tokens.

    Prepends '[CLS] ' once per abstract and appends ' [SEP] ' after every
    sentence found by NLTK's sentence tokenizer, yielding one string per
    abstract.
    """
    wrapped = []
    for abstract in abstracts:
        sentences = tokenize.sent_tokenize(abstract)
        wrapped.append("[CLS] " + "".join(s + " [SEP] " for s in sentences))
    return wrapped
# Multilingual uncased BERT wordpiece tokenizer, shared by the helpers below.
tokenizer=BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
def b_tokenize_abstracts(t_abstracts, max_len=512):
    """Run the BERT wordpiece tokenizer over each prepared abstract.

    Each token list is clipped to at most *max_len* entries so the
    sequences fit the model's maximum input length.
    """
    return [tokenizer.tokenize(abstract)[:max_len] for abstract in t_abstracts]
def convert_to_ids(b_t_abstracts):
    """Map each wordpiece token list to its IDs in the BERT vocabulary."""
    id_lists = []
    for tokens in b_t_abstracts:
        id_lists.append(tokenizer.convert_tokens_to_ids(tokens))
    return id_lists
def abstracts_to_ids(abstracts):
    """Full text-to-IDs pipeline.

    Sentence-wraps the abstracts with [CLS]/[SEP], wordpiece-tokenizes
    them, and converts the tokens to BERT vocabulary IDs.
    """
    return convert_to_ids(b_tokenize_abstracts(tokenize_abstracts(abstracts)))
def pad_ids(input_ids, max_len=512):
    """Pad (or truncate) every ID sequence to exactly *max_len* entries.

    Zero-padding and truncation both happen at the end of each sequence
    ("post"), so the leading [CLS] token is always preserved.
    """
    return pad_sequences(
        input_ids,
        maxlen=max_len,
        dtype="long",
        truncating="post",
        padding="post",
    )
def create_attention_masks(inputs):
    """Build attention masks for padded ID sequences.

    Positions holding a real token (ID > 0) get 1.0; zero padding gets
    0.0. Returns a list of float lists, one per input sequence.
    """
    return [[float(token_id > 0) for token_id in seq] for seq in inputs]
# PREDICT
def float_to_percent(float, decimal=3):
    """Format a probability from [0, 1] as a percentage string.

    Args:
        float: the value to convert. (Parameter name kept for backward
            compatibility even though it shadows the builtin.)
        decimal: number of decimal places in the output (default 3).

    Returns:
        A string such as '50.000%' for an input of 0.5.

    The original string-slicing approach (str(x*100)[:decimal+3]) yielded a
    variable number of decimals depending on the magnitude of the value and
    produced scientific-notation fragments for very small inputs; proper
    rounding via format() fixes both.
    """
    return f"{float * 100:.{decimal}f}%"
def models_predict(directory, inputs, attention_masks, float_to_percent=False):
    """Load every .h5 Keras model in *directory* and predict on the inputs.

    Args:
        directory: path (must end with a separator) scanned for '*.h5' files.
        inputs: tensor of token IDs.
        attention_masks: tensor of attention masks.
        float_to_percent: when True, format each probability as a percentage
            string via the module-level float_to_percent() helper.

    Returns:
        {model.name: [probability of text N matching the model's target, ...], ...}
    """
    # BUG FIX: the boolean flag shadows the module-level float_to_percent()
    # function inside this scope, so the original code crashed with
    # "TypeError: 'bool' object is not callable" whenever the flag was True.
    # Grab the real function from the module namespace before it is needed.
    to_percent = globals()["float_to_percent"]
    model_paths = glob.glob(f"{directory}*.h5")
    predictions_dict = {}
    for path in model_paths:
        model = tf.keras.models.load_model(path)
        predictions = model.predict_step([inputs, attention_masks])
        predictions = [float(p) for p in predictions]
        if float_to_percent:
            predictions = [to_percent(p) for p in predictions]
        predictions_dict[model.name] = predictions
        # Free the model before loading the next one to cap memory use.
        del predictions, model
    return predictions_dict
def predictions_dict_to_df(predictions_dictionary):
    """Turn {'model/target N': [probabilities, ...], ...} into a dataframe.

    Column names are cleaned up (the 'model_' prefix is dropped and
    remaining underscores become dots) and a leading 'text' column
    enumerates the rows.
    """
    frame = pd.DataFrame(predictions_dictionary)
    frame.columns = [
        name.replace("model_", "").replace("_", ".") for name in frame.columns
    ]
    frame.insert(0, column="text", value=list(range(len(frame))))
    return frame
def predictions_above_treshold(predictions_dataframe, treshold=0.95):
    """Collect, per text, the targets whose probability exceeds *treshold*.

    Expects the dataframe shape produced by predictions_dict_to_df (first
    column is 'text', remaining columns are per-target probabilities).
    Returns {row position: [column names whose value > treshold], ...}.
    (Function/parameter spelling kept for interface compatibility.)
    """
    hits = predictions_dataframe.iloc[:, 1:].apply(
        lambda row: row[row > treshold].index, axis=1
    )
    return {pos: list(hits[pos]) for pos in range(len(hits))}
# RUN
# abstracts = ...  # TODO: load the list of texts/abstracts from DATA
ids = abstracts_to_ids(abstracts)
padded_ids = pad_ids(ids)
masks = create_attention_masks(padded_ids)
masks = convert_to_tensor(masks)
inputs = convert_to_tensor(padded_ids)
# BUG FIX: the original call `models_predict(directory=MODELS, inputs, masks)`
# was a SyntaxError (positional arguments cannot follow a keyword argument).
predictions = models_predict(MODELS, inputs, masks)
predictions_df = predictions_dict_to_df(predictions)
# BUG FIX: the output path previously used the literal string
# "SAVE_PREDICTIONS_TO/..." instead of the SAVE_PREDICTIONS_TO setting.
predictions_df.to_excel(f"{SAVE_PREDICTIONS_TO}predictions.xlsx", index=False)