tokenizer("Hello, this is one sentence!")tokenizer(["Hello", ",", "this", "is", "one", "sentence", "split", "into", "words", "."], is_split_into_words=True)
# `example` is a single dataset row; tokenize its pre-split words and use
# word_ids() to align the word-level tags with the generated subword tokens.
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
word_ids = tokenized_input.word_ids()
aligned_labels = [-100 if i is None else example[f"{task}_tags"][i] for i in word_ids]
print(len(aligned_labels), len(tokenized_input["input_ids"]))
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs
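# A minimal sketch of applying the alignment function to every split at once
# with Dataset.map. The name `datasets` (a DatasetDict loaded earlier) is an
# assumption; adjust it to whatever your dataset object is called.
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)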
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))
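# The evaluation cells below assume a Trainer has already been set up and run,
# and that `predictions` / `labels` come from trainer.predict on the validation
# set. A minimal, illustrative sketch (the argument values and the
# `tokenized_datasets` / `data_collator` names are assumptions, not prescriptions):
import numpy as np
from transformers import DataCollatorForTokenClassification

# Pads both the inputs and the label sequences to the same length within a batch.
data_collator = DataCollatorForTokenClassification(tokenizer)

args = TrainingArguments(
    "test-token-classification",   # output directory; the name is arbitrary
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)
trainer.train()

# Raw predictions are logits of shape (num_examples, seq_len, num_labels);
# take the argmax over the last dimension before re-aligning with the words.
predictions, labels, _ = trainer.predict(tokenized_datasets["validation"])
predictions = np.argmax(predictions, axis=2)

# `metric` is assumed to be the seqeval metric, which scores predictions at the
# entity level (older notebooks load it with datasets.load_metric("seqeval")).
import evaluate
metric = evaluate.load("seqeval")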
true_predictions = [
[label_list[p] for (p, l) in zip(prediction, label) if l != -100]
for prediction, label in zip(predictions, labels)
]
true_labels = [
[label_list[l] for (p, l) in zip(prediction, label) if l != -100]
for prediction, label in zip(predictions, labels)
]
results = metric.compute(predictions=true_predictions, references=true_labels)
results
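# The same re-alignment logic can also be wrapped into a compute_metrics
# function and passed to the Trainer, so entity-level precision/recall/F1 are
# reported at every evaluation. A sketch, assuming the seqeval metric and
# `label_list` defined above:
import numpy as np

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }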