20210407

Some huggingface/transformers processing scripts


Colab notebook to do NER training.


Converts the conll2003 dataset to json(l) using the datasets library from Hugging Face. Used to produce files that illustrate the expected json format when training from json files.

Related:

from datasets import load_dataset
import json

datasets = load_dataset("conll2003")

# map the integer ner_tags to their string names, e.g. 3 -> "B-ORG"
label_list = datasets["train"].features["ner_tags"].feature.names

with open("conll2003_train.json", "w") as outfile:
    for i in range(datasets["train"].num_rows):
        dictionary = {"tokens": datasets["train"][i]["tokens"],
                      "tags": [label_list[t] for t in datasets["train"][i]["ner_tags"]]}
        json.dump(dictionary, outfile)
        outfile.write('\n')

label_list = datasets["validation"].features["ner_tags"].feature.names

with open("conll2003_validation.json", "w") as outfile:
    for i in range(datasets["validation"].num_rows):
        dictionary = {"tokens": datasets["validation"][i]["tokens"],
                      "tags": [label_list[t] for t in datasets["validation"][i]["ner_tags"]]}
        json.dump(dictionary, outfile)
        outfile.write('\n')

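For reference, the first line of conll2003_train.json should then look something like this (the first conll2003 train sentence, if I remember the tags right):

{"tokens": ["EU", "rejects", "German", "call", "to", "boycott", "British", "lamb", "."], "tags": ["B-ORG", "O", "B-MISC", "O", "O", "O", "B-MISC", "O", "O"]}

That is, one json object per line, with the integer ner_tags replaced by their string names. The files can then be loaded back with the same library:

from datasets import load_dataset

datasets = load_dataset("json", data_files={"train": "conll2003_train.json",
                                            "validation": "conll2003_validation.json"})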

Converts from the 'token tag' format - the one we have in swenerc - to json(l) that can be used with huggingface/transformers. Sentences are arbitrarily truncated at 255 tokens to avoid errors like 'The size of tensor a (631) must match the size of tensor b (512) at non-singleton dimension 1': the models max out at 512 subword tokens, and since each word can be split into several subwords, 255 words is a rough conservative bound.

# coding=utf-8

import sys
import json

if len(sys.argv) == 3:
    ifarg = sys.argv[1]
    ofarg = sys.argv[2]
else:
    ifarg = "dum.txt"
    ofarg = "dum.json"


print("In:", ifarg, "Out:", ofarg)

with open(ifarg, encoding="utf-8", mode="r") as infile, open(ofarg, encoding="utf-8", mode="w") as outfile:
    tokens = []
    ner_tags = []
    for line in infile:
        if len(line.strip()) == 0:
            if tokens:
                if len(tokens) > 255:
                    print("Too long, truncating...")
                    print(tokens)
                    tokens = tokens[:255]
                    ner_tags = ner_tags[:255]
                    print(tokens)
                dictionary = {"tokens": tokens, "ner_tags": ner_tags}
                json.dump(dictionary, outfile, ensure_ascii=False)
                outfile.write('\n')

                tokens = []
                ner_tags = []
        else:
            # token and tag are separated by a single space (conll-style)
            splits = line.split(" ")
            tokens.append(splits[0])
            ner_tags.append(splits[1].rstrip())
    # last example, in case the file does not end with a blank line
    if tokens:
        if len(tokens) > 255:
            print("Too long, truncating...")
            tokens = tokens[:255]
            ner_tags = ner_tags[:255]
        dictionary = {"tokens": tokens, "ner_tags": ner_tags}
        json.dump(dictionary, outfile, ensure_ascii=False)
        outfile.write('\n')
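As an example, given an input file with one 'token tag' pair per line and a blank line between sentences (the tags here are just made up for illustration):

Kalle B-PER
Anka I-PER
bor O
i O
Stockholm B-LOC

the script writes one json object per sentence:

{"tokens": ["Kalle", "Anka", "bor", "i", "Stockholm"], "ner_tags": ["B-PER", "I-PER", "O", "O", "B-LOC"]}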