Skip to content

What's the format of the training corpus? #2

@fishfree

Description

@fishfree

I'd like to train a Chinese model. I converted the OntoNotes 5 Chinese corpus into jsonlines files as below:
{"doc_key": "bc_cctv_00_cctv_0000", "sentences": [["EMPTY"], ["二零零五年", "的", "夏天", ",", "一", "个", "被", "人们", "期待", "已", "久", "的", "画面", "开始", "在", "香港", "的", "各", "大", "媒体", "频繁", "出现", ","], ["这些", "被", "人们", "所", "熟知", "的", "卡通", "形象", "以", "其", "独有", "的", "魅力", "再", "一", "次", "让", "世人", "的", "目光", "聚集", "到", "香港", ","], ["全球", "第五", "个", "迪斯尼", "乐园", "即将", "在", "这里", "向", "公众", "开放", "。"], ["迪斯尼", "呢", "最", "重要", "的", "是", ",", "它", "是", "世界", "的", "品牌", "嘛", "。"], ["那么", "这", "几", "年", "呐", ",", "虽然", "它", "建造", "的", "时候", ",", "呃", "还", ",", "还", "没有", "开幕", "呢", ",", "已经", "有", "很多", "的", "人", "对", "香港", "啊", ",", "可以", "说", "是", "另眼相看", "呐", "。"], ["<", "English", ">", "Then", "welcome", "to", "the", "official", "writing", "ceremony", "of", "Hongkong", "DiskneyLand", "<", "English", ">", "."], ["香港", "迪斯尼", "乐园", "的", "建设", "开始", "于", "两", "年", "前", "的", "二零零三年", ","], ["这", "年", "一月", ",", "香港", "政府", "将", "大屿山", "下", "的", "这", "片", "近年", "来", "最", "大", "的", "填海", "工程", "所", "得到", "的", "二百", "公顷", "土地", ",", "交给", "了", "迪斯尼", "公司", "。"], ["<", "English", ">", "One", "<", "English", ">", "."], ["EMPTY"], ["从", "那时", "开始", "这里", "就", "成", "了", "香港", "的", "一", "个", "禁区", "。"], ["同", "在", "大屿山", "与", "之", "相邻", "的", "香港", "国际", "机场", ",", "调整", "了", "航线", ","], ["使", "这里", "成为", "一", "个", "禁飞区", "。"], ["第一", "次", "在", "中国", "土地", "上", "落户", "的", "米奇", "老鼠", "的", "新", "家", ",", "引起", "了", "全球", "的", "关注", "。"], ["EMPTY"], ["现在", "距离", "香港", "迪斯尼", "乐园", "九月", "十二号", "的", "开业", "只", "有", "一", "个", "月", "的", "时间", "了", ","], ["通往", "迪斯尼", "的", "地铁", "也", "已经", "建好", "。"], ["地铁站", "里", "不时", "会", "有", "乘客", "在", "售票机", "上", "点击", "迪斯尼", "一", "站", ","], ["试图", "买", "票", "去", "先睹为快", "。"], ["但是", "迪斯尼", "地铁站", "的", "开通日", "却", "被", "定在", "开业", "当天", "。"], ["两", "年", "来", ",", "迪斯尼", "一直", "保持", "着", "它", "的", "神秘", ","], ["没有", "任何", "一", "家", "媒体", "被", "允许", "进入", "拍摄", "。"], ["我们", "乘坐", "出租车", "沿着", "通往", "迪斯尼", "方向", "的", "公路", "一路", "向前", ","], ["试图", "近距离", 
"去", "开始", "于", "两", "年", "前", "的", "二零零三年", ","], ["但是", "在", "迪斯尼", "的", "任何", "标志", "都", "还", "没有", "进入", "我们", "的", "视线", "时", ",", "车子", "就", "在", "去往", "迪斯尼", "的", "岔路口", "被", "保安", "人员", "拦", "了", "下来", ","], ["回来", "的", "路", "上", ",", "出租车", "司机", "在", "了解", "我们", "的", "意图", "后", ",", "给", "我们", "做", "了", "这样", "的", "解释", "。"], ["<", "Cantonese", ">", "呃", "据", "保安", "说", "是", "全部", "暂时", "未", "正式", "开", "<", "Cantonese", ">"], ["<", "Cantonese", ">", "开放", "之前", "呢", "就", "全部", ",", "任何", "车辆", ",", "除了", "特别", "有", "批准", "之外", "呢", "才", "可以", "进入", "<", "/", "Cantonese", ">"], ["<", "Cantonese", ">", "如果", "不", "是", "的话", "全部", "都", "不", "可以", "进入", "<", "/", "Cantonese", ">"], ["<", "Cantonese", ">", "尤其是", "不", "可以", "摄录机", "拍摄", "<", "/", "Cantonese", ">"], ["<", "Cantonese", ">", "啊", ",", "全部", "是", "高度", "机密", "<", "/", "Cantonese", ">"], ["<", "Cantonese", ">", "如果", "是", "啊", ",", "未", "批准", "拍摄", "呢", "<", "/", "Cantonese", ">"], ["<", "Cantonese", ">", "可", "分", ",", "就", "是", "随时", "呢", ",", "就", ",", "就", "接受", "法律", "追究", "的", ",", "很", "麻烦", "的", "<", "/", "Cantonese", ">"], ["迪斯尼", "公司", "虽然", "把", "中国", "的", "迪斯尼", "公园", "选址", "在", "香港", ",", "但是", "最", "让", "他们", "心动", "的", "却", "是", "中国", "内地", "的", "游客", "市场", "。"], ["自从", "香港", "和", "内地", "开通", "自由", "行", "后", ",", "来", "香港", "旅游", "的", "内地", "旅客", "越来越", "多", "。"], ["开始", "到", "现在", "啊", ",", "已经", "有", ",", "已经", "有", "七百多万", "的", "个人游", "的", "旅客", "来", "香港", "了", ","], ["那么", "现在", "呢", "我们", "呃", "相信", "哪", "这个", "是", "会", "越来越", "多", "啦", ","], ["现在", "差不多", "两", "年", "了", "嘛", ","], ["还有", "现在", "三十四", "个", "城市", "呢", "会", "增加", "的", "。"], ["香港", "是", "由", "百", "年", "前", "的", "一", "个", "渔港", "发展", "成", "今天", "的", "国际", "大", "都会", ","], ["这里", "东", "西", "方", "文化", "荟萃", ",", "新", "旧", "事物", "交织", "共", "融", "。"], ["来到", "香港", ",", "你", "可以", "在", "高楼", "大厦", "间", "穿梭", "闲逛", ","], ["于", "商场", "名店", "中", "尽情", "地", "搜购", "来自", "各", "国", "的", "商品", "。"], ["在", "茶餐厅", "或者", "旺角", "的", 
"街头", "品尝", "来自", "世界", "各", "地", "的", "美食", "小吃", "。"], ["EMPTY"], ["来到", "汇聚", "了", "各", "路", "神仙", "的", "浅水湾", "烧香", "许愿", ","], ["感受", "香港", "最", "有", "魅力", "的", "阳光", "沙滩", "。"], ["EMPTY"], ["登上", "太平山", "顶", ",", "将", "香港岛", "和", "维多利亚湾", "的", "美丽", "风光", "尽收眼底", "。"], ["EMPTY"]], "mention_clusters": [[[1, 15, 16], [2, 22, 23], [3, 7, 8], [5, 26, 27], [6, 11, 12], [7, 0, 1], [8, 4, 5], [11, 7, 8], [16, 2, 3], [34, 10, 11], [35, 1, 2], [35, 10, 11], [36, 16, 17], [40, 0, 1], [41, 0, 1], [42, 1, 2], [47, 1, 2]], [[2, 9, 10]], [[4, 0, 1], [4, 7, 8]], [[5, 7, 8], [6, 11, 13], [7, 0, 3], [14, 12, 13], [16, 2, 5], [17, 1, 2], [18, 10, 11], [20, 1, 2], [21, 4, 5], [21, 8, 9], [23, 5, 6], [25, 2, 3], [25, 19, 20], [34, 4, 8]], [[8, 7, 8], [12, 2, 3]], [[8, 7, 25], [11, 3, 4], [12, 4, 5], [13, 1, 2]], [[11, 1, 2]], [[14, 3, 4], [34, 4, 5], [34, 20, 21]], [[18, 10, 13], [20, 1, 3]], [[20, 1, 5]], [[23, 0, 1], [25, 10, 11], [26, 9, 10], [26, 15, 16]], [[23, 2, 3], [25, 15, 16]], [[27, 5, 6]], [[34, 15, 16]], [[36, 2, 3], [37, 1, 2], [38, 0, 1], [39, 1, 2]], [[37, 7, 8]]]}

But when I ran `python xcore/train.py`, it showed the errors below:

logging:
  log: true
  wandb_arg:
    _target_: pytorch_lightning.loggers.WandbLogger
    name: ${train.model_name}
    project: ${train.project_name}
    save_dir: ./
    log_model: false
    mode: online
  watch:
    log: all
    log_freq: 100

Seed set to 30
[10:32:26] Starting training for xcore/IDEA-CCNL/Erlangshen-DeBERTa-v2-97M-Chinese model                                                                                   train.py:34
           Instantiating the Data Module                                                                                                                                   train.py:50
/home/mike/xcore/xcore/common/util.py:50: FutureWarning: Passing literal json to 'read_json' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.
  df = pd.read_json(hydra.utils.get_original_cwd() + "/" + file_path, lines=True)
Error executing job with overrides: []
Traceback (most recent call last):
  File "/home/mike/xcore/xcore/data/datasets.py", line 34, in __init__
    self.set = load_from_disk(hydra.utils.get_original_cwd() + "/" + processed_dataset_path + "_" + type + "/")
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/mike/miniconda3/envs/xcore/lib/python3.12/site-packages/datasets/load.py", line 1566, in load_from_disk
    raise FileNotFoundError(f"Directory {dataset_path} not found")
FileNotFoundError: Directory /home/mike/xcore/data/cache_1770/IDEA-CCNL/Erlangshen-DeBERTa-v2-97M-Chinese/ontonotes/val_book/ not found

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/mike/miniconda3/envs/xcore/lib/python3.12/site-packages/hydra/_internal/instantiate/_instantiate2.py", line 92, in _call_target
    return _target_(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/mike/xcore/xcore/data/datasets.py", line 37, in __init__
    self.set = dt.from_pandas(util.ontonotes_to_dataframe(path))
                              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/mike/xcore/xcore/common/util.py", line 50, in ontonotes_to_dataframe
    df = pd.read_json(hydra.utils.get_original_cwd() + "/" + file_path, lines=True)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/mike/miniconda3/envs/xcore/lib/python3.12/site-packages/pandas/io/json/_json.py", line 815, in read_json
    return json_reader.read()
           ^^^^^^^^^^^^^^^^^^
  File "/home/mike/miniconda3/envs/xcore/lib/python3.12/site-packages/pandas/io/json/_json.py", line 1012, in read
    obj = self._get_object_parser(self._combine_lines(data_lines))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/mike/miniconda3/envs/xcore/lib/python3.12/site-packages/pandas/io/json/_json.py", line 1040, in _get_object_parser
    obj = FrameParser(json, **kwargs).parse()
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/mike/miniconda3/envs/xcore/lib/python3.12/site-packages/pandas/io/json/_json.py", line 1176, in parse
    self._parse()
  File "/home/mike/miniconda3/envs/xcore/lib/python3.12/site-packages/pandas/io/json/_json.py", line 1392, in _parse
    ujson_loads(json, precise_float=self.precise_float), dtype=None
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: Expected object or value

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/mike/xcore/xcore/train.py", line 120, in <module>
    main()
  File "/home/mike/miniconda3/envs/xcore/lib/python3.12/site-packages/hydra/main.py", line 90, in decorated_main
    _run_hydra(
  File "/home/mike/miniconda3/envs/xcore/lib/python3.12/site-packages/hydra/_internal/utils.py", line 394, in _run_hydra
    _run_app(
  File "/home/mike/miniconda3/envs/xcore/lib/python3.12/site-packages/hydra/_internal/utils.py", line 457, in _run_app
    run_and_report(
  File "/home/mike/miniconda3/envs/xcore/lib/python3.12/site-packages/hydra/_internal/utils.py", line 222, in run_and_report
    raise ex
  File "/home/mike/miniconda3/envs/xcore/lib/python3.12/site-packages/hydra/_internal/utils.py", line 219, in run_and_report
    return func()
           ^^^^^^
  File "/home/mike/miniconda3/envs/xcore/lib/python3.12/site-packages/hydra/_internal/utils.py", line 458, in <lambda>
    lambda: hydra.run(
            ^^^^^^^^^^
  File "/home/mike/miniconda3/envs/xcore/lib/python3.12/site-packages/hydra/_internal/hydra.py", line 132, in run
    _ = ret.return_value
        ^^^^^^^^^^^^^^^^
  File "/home/mike/miniconda3/envs/xcore/lib/python3.12/site-packages/hydra/core/utils.py", line 260, in return_value
    raise self._return_value
  File "/home/mike/miniconda3/envs/xcore/lib/python3.12/site-packages/hydra/core/utils.py", line 186, in run_job
    ret.return_value = task_function(task_cfg)
                       ^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/mike/xcore/xcore/train.py", line 116, in main
    train(conf)
  File "/home/mike/xcore/xcore/train.py", line 55, in train
    pl_data_module.setup("fit")
  File "/home/mike/xcore/xcore/data/pl_data_modules.py", line 34, in setup
    self.val_dataset = hydra.utils.instantiate(self.dataset.val)
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/mike/miniconda3/envs/xcore/lib/python3.12/site-packages/hydra/_internal/instantiate/_instantiate2.py", line 226, in instantiate
    return instantiate_node(
           ^^^^^^^^^^^^^^^^^
  File "/home/mike/miniconda3/envs/xcore/lib/python3.12/site-packages/hydra/_internal/instantiate/_instantiate2.py", line 347, in instantiate_node
    return _call_target(_target_, partial, args, kwargs, full_key)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/mike/miniconda3/envs/xcore/lib/python3.12/site-packages/hydra/_internal/instantiate/_instantiate2.py", line 97, in _call_target
    raise InstantiationException(msg) from e
hydra.errors.InstantiationException: Error in call to target 'xcore.data.datasets.xcoreDataset':
ValueError('Expected object or value')
full_key: dataset.val

My Python version: 3.12 (Miniconda environment)

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions