The model was fine-tuned on the Bulgarian subset.
Import the libraries:

```python
from typing import Dict, List

import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
```
First, you need to define the following helper methods, since the model uses a subword tokenizer:
```python
def predict(
    text: str,
    model: torch.nn.Module,
    tokenizer: AutoTokenizer,
    labels_tags: Dict[int, str] = {
        0: "O",
        1: "B-PER", 2: "I-PER",
        3: "B-ORG", 4: "I-ORG",
        5: "B-LOC", 6: "I-LOC"
    }
) -> List[Dict[str, str]]:
    # Tokenize the text and recover the subword token strings.
    tokens_data = tokenizer(text)
    tokens = tokenizer.convert_ids_to_tokens(tokens_data["input_ids"])
    # Merge the subwords back into whole words.
    words = subwords_to_words(tokens)

    input_ids = torch.LongTensor(tokens_data["input_ids"]).unsqueeze(0)
    attention_mask = torch.LongTensor(tokens_data["attention_mask"]).unsqueeze(0)

    # Forward pass; no gradients are needed at inference time.
    with torch.no_grad():
        out = model(input_ids, attention_mask=attention_mask).logits
    # Pick the most probable class id per token and map it to its BIO tag.
    out = out.argmax(-1).squeeze(0).tolist()
    prediction = [labels_tags[idx] if idx in labels_tags else idx for idx in out]

    return merge_words_and_predictions(words, prediction)
```
```python
def subwords_to_words(tokens: List[str]) -> List[str]:
    out_tokens = []
    curr_token = ""

    for token in tokens:
        if token == "[SEP]":
            # End of sequence - flush the last word and stop.
            curr_token = curr_token.replace("▁", "")
            out_tokens.append(curr_token)
            out_tokens.append("[SEP]")
            break

        if "▁" in token and curr_token == "":
            # "▁" marks the start of a new word.
            curr_token += token
        elif "▁" in token and curr_token != "":
            # A new word starts - flush the previous one first.
            curr_token = curr_token.replace("▁", "")
            out_tokens.append(curr_token)
            curr_token = token
        elif "▁" not in token:
            # Continuation subword - append it to the current word.
            curr_token += token

    return out_tokens
```
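A quick sanity check of `subwords_to_words`. The subword strings below are illustrative; the actual splits depend on the model's SentencePiece vocabulary:

```python
# Illustrative subword sequence (real splits depend on the tokenizer's vocabulary).
example = ["[CLS]", "▁Барух", "▁Спин", "оза", "[SEP]"]
print(subwords_to_words(example))
# ['[CLS]', 'Барух', 'Спиноза', '[SEP]']
```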
```python
def merge_words_and_predictions(words: List[str], entities: List[str]) -> List[Dict[str, str]]:
    result = []
    curr_word = []

    # Skip the leading [CLS] token. Note that entities[i] is the tag of the
    # *previous* word, i.e. the tag of the entity currently being flushed.
    for i, (word, entity) in enumerate(zip(words[1:], entities[1:])):
        if "B-" in entity:
            # A new entity begins - flush the one collected so far, if any.
            if curr_word:
                result.append({
                    "word": " ".join(curr_word),
                    "entity_group": entities[i][2:]
                })
                curr_word = [word]
            else:
                curr_word.append(word)
        if "I-" in entity:
            # Continuation of the current entity.
            curr_word.append(word)
        if "O" == entity:
            # Outside any entity - flush the collected entity, if any.
            if curr_word:
                result.append({
                    "word": " ".join(curr_word),
                    "entity_group": entities[i][2:]
                })
                curr_word = []

    return result
```
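To see how the BIO merging behaves, here is a small check with hand-written tags (not actual model output):

```python
# Hand-written BIO tags, aligned with the words below (not model output).
words = ["[CLS]", "Барух", "Спиноза", "е", "роден", "в", "Амстердам", "[SEP]"]
tags = ["O", "B-PER", "I-PER", "O", "O", "O", "B-LOC", "O"]
print(merge_words_and_predictions(words, tags))
# [{'word': 'Барух Спиноза', 'entity_group': 'PER'},
#  {'word': 'Амстердам', 'entity_group': 'LOC'}]
```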
Then, you should initialize the AutoTokenizer and AutoModelForTokenClassification objects:
```python
MODEL_ID = "auhide/bert-bg-ner"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForTokenClassification.from_pretrained(MODEL_ID)
```
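If the checkpoint ships its label names in the config, the hard-coded `labels_tags` mapping in `predict()` can in principle be read from it instead. Whether `config.id2label` holds the BIO tags or only generic placeholders depends on how the model was exported, so this is an assumption worth checking:

```python
# Assumption: the checkpoint's config may expose the BIO tags directly.
# Some exports only contain generic names such as "LABEL_1", in which case
# the manual labels_tags dictionary in predict() is still required.
print(model.config.id2label)
```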
Finally, you can call the `predict()` method defined above:
```python
text = "Барух Спиноза е роден в Амстердам"

print(f"Input: {text}")
print("NERs:", predict(text, model=model, tokenizer=tokenizer))
```
```
Input: Барух Спиноза е роден в Амстердам
NERs: [{'word': 'Барух Спиноза', 'entity_group': 'PER'}, {'word': 'Амстердам', 'entity_group': 'LOC'}]
```
Note: There are three types of entities - PER (person), ORG (organization), and LOC (location).
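As an alternative to the helper functions above, the generic transformers `pipeline` API can also aggregate subword predictions. This is a sketch, not the method shown above: its entity grouping relies on the checkpoint's `config.id2label`, so if the config only carries generic `LABEL_N` names, the output groups will reflect that:

```python
# Sketch: the token-classification pipeline with built-in subword aggregation.
# Its entity grouping depends on config.id2label being set to the BIO tags.
ner = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)
print(ner("Барух Спиноза е роден в Амстердам"))
```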