数据集:
diwank/silicone-merged
许可:
Merged and simplified dialog act datasets from the silicone collection
原始集合的所有子集均已经过过滤(剔除错误和含义模糊的类别)、合并,并按相邻的两个对话轮次成对分组。我们假设:在训练对话行为分类器时加入前一句话语,可以帮助模型获取额外的上下文线索,从而在推理时表现更好,尤其是在输入为一对话轮的情况下。
from datasets import load_dataset
from simpletransformers.classification import (
ClassificationModel, ClassificationArgs
)
# Get data.
# load_dataset returns Hugging Face Dataset splits, while simpletransformers'
# train_model works on pandas DataFrames, so each split is converted with
# Dataset.to_pandas() before training.
silicone_merged = load_dataset("diwank/silicone-merged")
train_df = silicone_merged["train"].to_pandas()
eval_df = silicone_merged["validation"].to_pandas()

# Training configuration.
model_args = ClassificationArgs(
    num_train_epochs=8,
    model_type="deberta",
    model_name="microsoft/deberta-large",
    use_multiprocessing=False,
    # Evaluation during training needs an eval_df, which is passed to
    # train_model below.
    evaluate_during_training=True,
)

# Create a ClassificationModel
model = ClassificationModel(
    "deberta",
    "microsoft/deberta-large",
    num_labels=11,  # 11 labels in this dataset
    args=model_args,
)

# Train model
model.train_model(train_df, eval_df=eval_df)
注意:该数据集的类别分布高度不平衡,建议在训练之前先使用 imbalanced-learn 之类的库对数据进行重采样(平衡处理)。
由于平衡处理可能比较复杂且耗费资源,我们提供了一个已经平衡的训练集变体,它是使用 imbalanced-learn 库通过过采样生成的。平衡过程采用 SMOTEN 算法来处理类别型数据,并在一台 16 核、60GB 内存的机器上完成重采样。您可以通过以下方式加载该变体:
# Load the "balanced" configuration of the dataset (the pre-oversampled
# training-set variant). The returned object is not bound to a name here.
load_dataset("diwank/silicone-merged", "balanced")
[
(0, 'acknowledge'),
(1, 'answer'),
(2, 'backchannel'),
(3, 'reply_yes'),
(4, 'exclaim'),
(5, 'say'),
(6, 'reply_no'),
(7, 'hold'),
(8, 'ask'),
(9, 'intent'),
(10, 'ask_yes_no'),
]
# Label-merging table: for every merged dialog-act label (outer key), the
# labels that are folded into it, listed per source key ("swda", "mrda",
# "oasis", "maptask", "dyda_da"). An empty list means nothing under that
# source key maps to the merged label.
mapping = {
    "acknowledge": {
        "swda": ["aap_am", "b", "bk"],
        "mrda": [],
        "oasis": ["ackn", "accept", "complete"],
        "maptask": ["acknowledge", "align"],
        "dyda_da": ["commissive"],
    },
    "answer": {
        "swda": ["bf"],
        "mrda": [],
        "oasis": [
            "answ", "informCont", "inform",
            "answElab", "directElab", "refer",
        ],
        "maptask": ["reply_w", "explain"],
        "dyda_da": ["inform"],
    },
    "backchannel": {
        "swda": ["ad", "bh", "bd", "b^m"],
        "mrda": ["b"],
        "oasis": ["backch", "selfTalk", "init"],
        "maptask": ["ready"],
        "dyda_da": [],
    },
    "reply_yes": {
        "swda": ["na", "aa"],
        "mrda": [],
        "oasis": ["confirm"],
        "maptask": ["reply_y"],
        "dyda_da": [],
    },
    "exclaim": {
        "swda": ["ft", "fa", "fc", "fp"],
        "mrda": [],
        "oasis": [
            "appreciate", "bye", "exclaim", "greet",
            "thank", "pardon", "thank-identitySelf", "expressRegret",
        ],
        "maptask": [],
        "dyda_da": [],
    },
    "say": {
        "swda": ["qh", "sd"],
        "mrda": ["s"],
        "oasis": ["expressPossibility", "expressOpinion", "suggest"],
        "maptask": [],
        "dyda_da": [],
    },
    "reply_no": {
        "swda": ["nn", "ng", "ar"],
        "mrda": [],
        "oasis": ["refuse", "negate"],
        "maptask": ["reply_n"],
        "dyda_da": [],
    },
    "hold": {
        "swda": ["^h", "t1"],
        "mrda": ["f"],
        "oasis": ["hold"],
        "maptask": [],
        "dyda_da": [],
    },
    "ask": {
        "swda": ["qw", "qo", "qw^d", "br", "qrr"],
        "mrda": ["q"],
        "oasis": ["reqInfo", "reqDirect", "offer"],
        "maptask": ["query_w"],
        "dyda_da": ["question"],
    },
    "intent": {
        "swda": [],
        "mrda": [],
        "oasis": [
            "informIntent", "informIntent-hold", "expressWish",
            "direct", "raiseIssue", "correct",
        ],
        "maptask": ["instruct", "clarify"],
        "dyda_da": ["directive"],
    },
    "ask_yes_no": {
        "swda": ["qy^d", "^g"],
        "mrda": [],
        "oasis": ["reqModal"],
        "maptask": ["query_yn", "check"],
        "dyda_da": [],
    },
}