具体使用方法可以看我的博客:https://blog.csdn.net/weixin_40015791/article/details/90410083
下面也会简单介绍一下:在 BERT 开源代码的 run_classifier.py 中找到 processors 字典,并在其中注册新的处理器:
# Task registry: each key is matched against --task_name and maps to the
# processor class that loads that task's data.
processors = {
"cola": ColaProcessor,
"mnli": MnliProcessor,
"mrpc": MrpcProcessor,
"xnli": XnliProcessor,
"intentdetection":IntentDetectionProcessor,
"emotion":EmotionProcessor, # newly added entry for the emotion task
}
然后在该文件中增加一个class:
class EmotionProcessor(DataProcessor):
  """Processor for the seven-class emotion classification data set.

  Reads the `.tsv` files through the base class's `_read_tsv`. Rows of the
  train and dev files are read as (label, text); rows of the test file carry
  the text in the first column and are given the placeholder label "0".
  The first row of every file is skipped (assumed to be a header row).
  """

  def get_train_examples(self, data_dir):
    """See base class."""
    # The file name must match the training file inside `data_dir`.
    return self._create_examples(
        self._read_tsv(os.path.join(data_dir, "fine_tuning_train_data.tsv")),
        "train")

  def get_dev_examples(self, data_dir):
    """See base class."""
    return self._create_examples(
        self._read_tsv(os.path.join(data_dir, "fine_tuning_val_data.tsv")),
        "dev")

  def get_test_examples(self, data_dir):
    """See base class."""
    return self._create_examples(
        self._read_tsv(os.path.join(data_dir, "fine_tuning_test_data.tsv")),
        "test")

  def get_labels(self):
    """See base class."""
    # Seven-class task, so the labels run from "0" to "6".
    return ["0", "1", "2", "3", "4", "5", "6"]

  def _create_examples(self, lines, set_type):
    """Creates examples for the training, dev and test sets.

    Args:
      lines: Sequence of rows, each an indexable sequence of column strings.
        The first row is skipped.
      set_type: One of "train", "dev" or "test"; used as the guid prefix and
        to select the column layout.

    Returns:
      A list of `InputExample` objects, one per non-skipped row.
    """
    examples = []
    for (i, line) in enumerate(lines):
      # Skip the first row of the file.
      if i == 0:
        continue
      guid = "%s-%s" % (set_type, i)
      if set_type == "test":
        # Test rows have no label column: the text is in column 0 and a
        # placeholder label is used.
        label = "0"
        text_a = tokenization.convert_to_unicode(line[0])
      else:
        label = tokenization.convert_to_unicode(line[0])
        text_a = tokenization.convert_to_unicode(line[1])
      examples.append(
          InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
    return examples
最后直接调用即可,运行的命令如下:
python run_classifier.py \
--task_name=emotion \
--do_train=true \
--do_eval=true \
--data_dir=data \ #把数据解压到同一级的文件夹中,此处是该文件夹名字data
--vocab_file=chinese_L-12_H-768_A-12/vocab.txt \ #中文数据要微调的原始bert模型
--bert_config_file=chinese_L-12_
1