-
Notifications
You must be signed in to change notification settings - Fork 1
Description
I'd like to train a Chinese model. I converted the OntoNotes 5 Chinese corpus into jsonlines files, as shown below:
{"doc_key": "bc_cctv_00_cctv_0000", "sentences": [["EMPTY"], ["二零零五年", "的", "夏天", ",", "一", "个", "被", "人们", "期待", "已", "久", "的", "画面", "开始", "在", "香港", "的", "各", "大", "媒体", "频繁", "出现", ","], ["这些", "被", "人们", "所", "熟知", "的", "卡通", "形象", "以", "其", "独有", "的", "魅力", "再", "一", "次", "让", "世人", "的", "目光", "聚集", "到", "香港", ","], ["全球", "第五", "个", "迪斯尼", "乐园", "即将", "在", "这里", "向", "公众", "开放", "。"], ["迪斯尼", "呢", "最", "重要", "的", "是", ",", "它", "是", "世界", "的", "品牌", "嘛", "。"], ["那么", "这", "几", "年", "呐", ",", "虽然", "它", "建造", "的", "时候", ",", "呃", "还", ",", "还", "没有", "开幕", "呢", ",", "已经", "有", "很多", "的", "人", "对", "香港", "啊", ",", "可以", "说", "是", "另眼相看", "呐", "。"], ["<", "English", ">", "Then", "welcome", "to", "the", "official", "writing", "ceremony", "of", "Hongkong", "DiskneyLand", "<", "English", ">", "."], ["香港", "迪斯尼", "乐园", "的", "建设", "开始", "于", "两", "年", "前", "的", "二零零三年", ","], ["这", "年", "一月", ",", "香港", "政府", "将", "大屿山", "下", "的", "这", "片", "近年", "来", "最", "大", "的", "填海", "工程", "所", "得到", "的", "二百", "公顷", "土地", ",", "交给", "了", "迪斯尼", "公司", "。"], ["<", "English", ">", "One", "<", "English", ">", "."], ["EMPTY"], ["从", "那时", "开始", "这里", "就", "成", "了", "香港", "的", "一", "个", "禁区", "。"], ["同", "在", "大屿山", "与", "之", "相邻", "的", "香港", "国际", "机场", ",", "调整", "了", "航线", ","], ["使", "这里", "成为", "一", "个", "禁飞区", "。"], ["第一", "次", "在", "中国", "土地", "上", "落户", "的", "米奇", "老鼠", "的", "新", "家", ",", "引起", "了", "全球", "的", "关注", "。"], ["EMPTY"], ["现在", "距离", "香港", "迪斯尼", "乐园", "九月", "十二号", "的", "开业", "只", "有", "一", "个", "月", "的", "时间", "了", ","], ["通往", "迪斯尼", "的", "地铁", "也", "已经", "建好", "。"], ["地铁站", "里", "不时", "会", "有", "乘客", "在", "售票机", "上", "点击", "迪斯尼", "一", "站", ","], ["试图", "买", "票", "去", "先睹为快", "。"], ["但是", "迪斯尼", "地铁站", "的", "开通日", "却", "被", "定在", "开业", "当天", "。"], ["两", "年", "来", ",", "迪斯尼", "一直", "保持", "着", "它", "的", "神秘", ","], ["没有", "任何", "一", "家", "媒体", "被", "允许", "进入", "拍摄", "。"], ["我们", "乘坐", "出租车", "沿着", "通往", "迪斯尼", "方向", "的", "公路", "一路", "向前", ","], ["试图", "近距离", 
"去", "开始", "于", "两", "年", "前", "的", "二零零三年", ","], ["但是", "在", "迪斯尼", "的", "任何", "标志", "都", "还", "没有", "进入", "我们", "的", "视线", "时", ",", "车子", "就", "在", "去往", "迪斯尼", "的", "岔路口", "被", "保安", "人员", "拦", "了", "下来", ","], ["回来", "的", "路", "上", ",", "出租车", "司机", "在", "了解", "我们", "的", "意图", "后", ",", "给", "我们", "做", "了", "这样", "的", "解释", "。"], ["<", "Cantonese", ">", "呃", "据", "保安", "说", "是", "全部", "暂时", "未", "正式", "开", "<", "Cantonese", ">"], ["<", "Cantonese", ">", "开放", "之前", "呢", "就", "全部", ",", "任何", "车辆", ",", "除了", "特别", "有", "批准", "之外", "呢", "才", "可以", "进入", "<", "/", "Cantonese", ">"], ["<", "Cantonese", ">", "如果", "不", "是", "的话", "全部", "都", "不", "可以", "进入", "<", "/", "Cantonese", ">"], ["<", "Cantonese", ">", "尤其是", "不", "可以", "摄录机", "拍摄", "<", "/", "Cantonese", ">"], ["<", "Cantonese", ">", "啊", ",", "全部", "是", "高度", "机密", "<", "/", "Cantonese", ">"], ["<", "Cantonese", ">", "如果", "是", "啊", ",", "未", "批准", "拍摄", "呢", "<", "/", "Cantonese", ">"], ["<", "Cantonese", ">", "可", "分", ",", "就", "是", "随时", "呢", ",", "就", ",", "就", "接受", "法律", "追究", "的", ",", "很", "麻烦", "的", "<", "/", "Cantonese", ">"], ["迪斯尼", "公司", "虽然", "把", "中国", "的", "迪斯尼", "公园", "选址", "在", "香港", ",", "但是", "最", "让", "他们", "心动", "的", "却", "是", "中国", "内地", "的", "游客", "市场", "。"], ["自从", "香港", "和", "内地", "开通", "自由", "行", "后", ",", "来", "香港", "旅游", "的", "内地", "旅客", "越来越", "多", "。"], ["开始", "到", "现在", "啊", ",", "已经", "有", ",", "已经", "有", "七百多万", "的", "个人游", "的", "旅客", "来", "香港", "了", ","], ["那么", "现在", "呢", "我们", "呃", "相信", "哪", "这个", "是", "会", "越来越", "多", "啦", ","], ["现在", "差不多", "两", "年", "了", "嘛", ","], ["还有", "现在", "三十四", "个", "城市", "呢", "会", "增加", "的", "。"], ["香港", "是", "由", "百", "年", "前", "的", "一", "个", "渔港", "发展", "成", "今天", "的", "国际", "大", "都会", ","], ["这里", "东", "西", "方", "文化", "荟萃", ",", "新", "旧", "事物", "交织", "共", "融", "。"], ["来到", "香港", ",", "你", "可以", "在", "高楼", "大厦", "间", "穿梭", "闲逛", ","], ["于", "商场", "名店", "中", "尽情", "地", "搜购", "来自", "各", "国", "的", "商品", "。"], ["在", "茶餐厅", "或者", "旺角", "的", 
"街头", "品尝", "来自", "世界", "各", "地", "的", "美食", "小吃", "。"], ["EMPTY"], ["来到", "汇聚", "了", "各", "路", "神仙", "的", "浅水湾", "烧香", "许愿", ","], ["感受", "香港", "最", "有", "魅力", "的", "阳光", "沙滩", "。"], ["EMPTY"], ["登上", "太平山", "顶", ",", "将", "香港岛", "和", "维多利亚湾", "的", "美丽", "风光", "尽收眼底", "。"], ["EMPTY"]], "mention_clusters": [[[1, 15, 16], [2, 22, 23], [3, 7, 8], [5, 26, 27], [6, 11, 12], [7, 0, 1], [8, 4, 5], [11, 7, 8], [16, 2, 3], [34, 10, 11], [35, 1, 2], [35, 10, 11], [36, 16, 17], [40, 0, 1], [41, 0, 1], [42, 1, 2], [47, 1, 2]], [[2, 9, 10]], [[4, 0, 1], [4, 7, 8]], [[5, 7, 8], [6, 11, 13], [7, 0, 3], [14, 12, 13], [16, 2, 5], [17, 1, 2], [18, 10, 11], [20, 1, 2], [21, 4, 5], [21, 8, 9], [23, 5, 6], [25, 2, 3], [25, 19, 20], [34, 4, 8]], [[8, 7, 8], [12, 2, 3]], [[8, 7, 25], [11, 3, 4], [12, 4, 5], [13, 1, 2]], [[11, 1, 2]], [[14, 3, 4], [34, 4, 5], [34, 20, 21]], [[18, 10, 13], [20, 1, 3]], [[20, 1, 5]], [[23, 0, 1], [25, 10, 11], [26, 9, 10], [26, 15, 16]], [[23, 2, 3], [25, 15, 16]], [[27, 5, 6]], [[34, 15, 16]], [[36, 2, 3], [37, 1, 2], [38, 0, 1], [39, 1, 2]], [[37, 7, 8]]]}
But when I ran `python xcore/train.py`, it failed with the errors below:
logging:
log: true
wandb_arg:
_target_: pytorch_lightning.loggers.WandbLogger
name: ${train.model_name}
project: ${train.project_name}
save_dir: ./
log_model: false
mode: online
watch:
log: all
log_freq: 100
Seed set to 30
[10:32:26] Starting training for xcore/IDEA-CCNL/Erlangshen-DeBERTa-v2-97M-Chinese model train.py:34
Instantiating the Data Module train.py:50
/home/mike/xcore/xcore/common/util.py:50: FutureWarning: Passing literal json to 'read_json' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.
df = pd.read_json(hydra.utils.get_original_cwd() + "/" + file_path, lines=True)
Error executing job with overrides: []
Traceback (most recent call last):
File "/home/mike/xcore/xcore/data/datasets.py", line 34, in __init__
self.set = load_from_disk(hydra.utils.get_original_cwd() + "/" + processed_dataset_path + "_" + type + "/")
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/mike/miniconda3/envs/xcore/lib/python3.12/site-packages/datasets/load.py", line 1566, in load_from_disk
raise FileNotFoundError(f"Directory {dataset_path} not found")
FileNotFoundError: Directory /home/mike/xcore/data/cache_1770/IDEA-CCNL/Erlangshen-DeBERTa-v2-97M-Chinese/ontonotes/val_book/ not found
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/mike/miniconda3/envs/xcore/lib/python3.12/site-packages/hydra/_internal/instantiate/_instantiate2.py", line 92, in _call_target
return _target_(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/mike/xcore/xcore/data/datasets.py", line 37, in __init__
self.set = dt.from_pandas(util.ontonotes_to_dataframe(path))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/mike/xcore/xcore/common/util.py", line 50, in ontonotes_to_dataframe
df = pd.read_json(hydra.utils.get_original_cwd() + "/" + file_path, lines=True)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/mike/miniconda3/envs/xcore/lib/python3.12/site-packages/pandas/io/json/_json.py", line 815, in read_json
return json_reader.read()
^^^^^^^^^^^^^^^^^^
File "/home/mike/miniconda3/envs/xcore/lib/python3.12/site-packages/pandas/io/json/_json.py", line 1012, in read
obj = self._get_object_parser(self._combine_lines(data_lines))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/mike/miniconda3/envs/xcore/lib/python3.12/site-packages/pandas/io/json/_json.py", line 1040, in _get_object_parser
obj = FrameParser(json, **kwargs).parse()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/mike/miniconda3/envs/xcore/lib/python3.12/site-packages/pandas/io/json/_json.py", line 1176, in parse
self._parse()
File "/home/mike/miniconda3/envs/xcore/lib/python3.12/site-packages/pandas/io/json/_json.py", line 1392, in _parse
ujson_loads(json, precise_float=self.precise_float), dtype=None
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: Expected object or value
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/mike/xcore/xcore/train.py", line 120, in <module>
main()
File "/home/mike/miniconda3/envs/xcore/lib/python3.12/site-packages/hydra/main.py", line 90, in decorated_main
_run_hydra(
File "/home/mike/miniconda3/envs/xcore/lib/python3.12/site-packages/hydra/_internal/utils.py", line 394, in _run_hydra
_run_app(
File "/home/mike/miniconda3/envs/xcore/lib/python3.12/site-packages/hydra/_internal/utils.py", line 457, in _run_app
run_and_report(
File "/home/mike/miniconda3/envs/xcore/lib/python3.12/site-packages/hydra/_internal/utils.py", line 222, in run_and_report
raise ex
File "/home/mike/miniconda3/envs/xcore/lib/python3.12/site-packages/hydra/_internal/utils.py", line 219, in run_and_report
return func()
^^^^^^
File "/home/mike/miniconda3/envs/xcore/lib/python3.12/site-packages/hydra/_internal/utils.py", line 458, in <lambda>
lambda: hydra.run(
^^^^^^^^^^
File "/home/mike/miniconda3/envs/xcore/lib/python3.12/site-packages/hydra/_internal/hydra.py", line 132, in run
_ = ret.return_value
^^^^^^^^^^^^^^^^
File "/home/mike/miniconda3/envs/xcore/lib/python3.12/site-packages/hydra/core/utils.py", line 260, in return_value
raise self._return_value
File "/home/mike/miniconda3/envs/xcore/lib/python3.12/site-packages/hydra/core/utils.py", line 186, in run_job
ret.return_value = task_function(task_cfg)
^^^^^^^^^^^^^^^^^^^^^^^
File "/home/mike/xcore/xcore/train.py", line 116, in main
train(conf)
File "/home/mike/xcore/xcore/train.py", line 55, in train
pl_data_module.setup("fit")
File "/home/mike/xcore/xcore/data/pl_data_modules.py", line 34, in setup
self.val_dataset = hydra.utils.instantiate(self.dataset.val)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/mike/miniconda3/envs/xcore/lib/python3.12/site-packages/hydra/_internal/instantiate/_instantiate2.py", line 226, in instantiate
return instantiate_node(
^^^^^^^^^^^^^^^^^
File "/home/mike/miniconda3/envs/xcore/lib/python3.12/site-packages/hydra/_internal/instantiate/_instantiate2.py", line 347, in instantiate_node
return _call_target(_target_, partial, args, kwargs, full_key)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/mike/miniconda3/envs/xcore/lib/python3.12/site-packages/hydra/_internal/instantiate/_instantiate2.py", line 97, in _call_target
raise InstantiationException(msg) from e
hydra.errors.InstantiationException: Error in call to target 'xcore.data.datasets.xcoreDataset':
ValueError('Expected object or value')
full_key: dataset.val
My Python version: 3.12 (Miniconda)