Commit 1b42650 (parent 78b77e3)
Authored by you-n-g and SunsetWolf

feat: data improve, support parquet (#1966)

* refactor: relocate CLI modules to qlib.cli and update references
* refactor: introduce read_as_df and rename csv_path to data_path
* lint
* refactor: rename csv_path to data_path and use QSettings.provider_uri
* fix pylint error
* fix get_data command
* add comments to CI yaml
* update docs

Co-authored-by: Linlang <[email protected]>

File tree: 21 files changed, +105 −62 lines

.github/workflows/test_qlib_from_pip.yml
Lines changed: 2 additions & 0 deletions

@@ -60,6 +60,8 @@ jobs:
           brew unlink libomp
           brew install libomp.rb
+          # When the new version is released it should be changed to:
+          # python -m qlib.cli.data qlib_data --target_dir ~/.qlib/qlib_data/cn_data --region cn
       - name: Downloads dependencies data
         run: |
           cd ..

.github/workflows/test_qlib_from_source.yml
Lines changed: 1 addition & 1 deletion

@@ -104,7 +104,7 @@ jobs:
       - name: Test workflow by config (install from source)
         run: |
           python -m pip install numba
-          python qlib/workflow/cli.py examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml
+          python qlib/cli/run.py examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml

       - name: Unit tests with Pytest
         uses: nick-fields/retry@v2

README.md
Lines changed: 4 additions & 4 deletions

@@ -229,10 +229,10 @@ Load and prepare data by running the following code:
 ### Get with module
 ```bash
 # get 1d data
-python -m qlib.run.get_data qlib_data --target_dir ~/.qlib/qlib_data/cn_data --region cn
+python -m qlib.cli.data qlib_data --target_dir ~/.qlib/qlib_data/cn_data --region cn

 # get 1min data
-python -m qlib.run.get_data qlib_data --target_dir ~/.qlib/qlib_data/cn_data_1min --region cn --interval 1min
+python -m qlib.cli.data qlib_data --target_dir ~/.qlib/qlib_data/cn_data_1min --region cn --interval 1min
 ```

@@ -329,7 +329,7 @@ We recommend users to prepare their own data if they have a high-quality dataset
 3. At this point you are in the docker environment and can run the qlib scripts. An example:
    ```bash
    >>> python scripts/get_data.py qlib_data --name qlib_data_simple --target_dir ~/.qlib/qlib_data/cn_data --interval 1d --region cn
-   >>> python qlib/workflow/cli.py examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml
+   >>> python qlib/cli/run.py examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml
    ```
 4. Exit the container
    ```bash

@@ -359,7 +359,7 @@ Qlib provides a tool named `qrun` to run the whole workflow automatically (inclu
 ```
 If users want to use `qrun` under debug mode, please use the following command:
 ```bash
-python -m pdb qlib/workflow/cli.py examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml
+python -m pdb qlib/cli/run.py examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml
 ```
 The result of `qrun` is as follows, please refer to [docs](https://qlib.readthedocs.io/en/latest/component/strategy.html#result) for more explanations about the result.

docs/component/data.rst
Lines changed: 16 additions & 16 deletions

@@ -108,10 +108,10 @@ Automatic update of daily frequency data


-Converting CSV Format into Qlib Format
---------------------------------------
+Converting CSV and Parquet Format into Qlib Format
+--------------------------------------------------

-``Qlib`` has provided the script ``scripts/dump_bin.py`` to convert **any** data in CSV format into `.bin` files (``Qlib`` format) as long as they are in the correct format.
+``Qlib`` has provided the script ``scripts/dump_bin.py`` to convert **any** data in CSV or Parquet format into `.bin` files (``Qlib`` format) as long as they are in the correct format.

 Besides downloading the prepared demo data, users could download demo data directly from the Collector as follows for reference to the CSV format.
 Here are some example:

@@ -126,17 +126,17 @@ for 1min data:

     python scripts/data_collector/yahoo/collector.py download_data --source_dir ~/.qlib/stock_data/source/cn_1min --region CN --start 2021-05-20 --end 2021-05-23 --delay 0.1 --interval 1min --limit_nums 10

-Users can also provide their own data in CSV format. However, the CSV data **must satisfies** following criterions:
+Users can also provide their own data in CSV or Parquet format. However, the data **must satisfies** following criterions:

-- CSV file is named after a specific stock *or* the CSV file includes a column of the stock name
+- CSV or Parquet file is named after a specific stock *or* the CSV or Parquet file includes a column of the stock name

-    - Name the CSV file after a stock: `SH600000.csv`, `AAPL.csv` (not case sensitive).
+    - Name the CSV or Parquet file after a stock: `SH600000.csv`, `AAPL.csv` or `SH600000.parquet`, `AAPL.parquet` (not case sensitive).

-    - CSV file includes a column of the stock name. User **must** specify the column name when dumping the data. Here is an example:
+    - CSV or Parquet file includes a column of the stock name. User **must** specify the column name when dumping the data. Here is an example:

       .. code-block:: bash

-          python scripts/dump_bin.py dump_all ... --symbol_field_name symbol
+          python scripts/dump_bin.py dump_all ... --symbol_field_name symbol --file_suffix <.csv or .parquet>

      where the data are in the following format:

@@ -146,11 +146,11 @@ Users can also provide their own data in CSV format. However, the CSV data **mus
      | SH600000 | 120   |
      +-----------+-------+

-- CSV file **must** include a column for the date, and when dumping the data, user must specify the date column name. Here is an example:
+- CSV or Parquet file **must** include a column for the date, and when dumping the data, user must specify the date column name. Here is an example:

   .. code-block:: bash

-      python scripts/dump_bin.py dump_all ... --date_field_name date
+      python scripts/dump_bin.py dump_all ... --date_field_name date --file_suffix <.csv or .parquet>

  where the data are in the following format:

@@ -163,23 +163,23 @@ Users can also provide their own data in CSV format. However, the CSV data **mus
  +---------+------------+-------+------+----------+


-Supposed that users prepare their CSV format data in the directory ``~/.qlib/csv_data/my_data``, they can run the following command to start the conversion.
+Supposed that users prepare their CSV or Parquet format data in the directory ``~/.qlib/my_data``, they can run the following command to start the conversion.

 .. code-block:: bash

-    python scripts/dump_bin.py dump_all --csv_path ~/.qlib/csv_data/my_data --qlib_dir ~/.qlib/qlib_data/my_data --include_fields open,close,high,low,volume,factor
+    python scripts/dump_bin.py dump_all --data_path ~/.qlib/my_data --qlib_dir ~/.qlib/qlib_data/ --include_fields open,close,high,low,volume,factor --file_suffix <.csv or .parquet>

 For other supported parameters when dumping the data into `.bin` file, users can refer to the information by running the following commands:

 .. code-block:: bash

-    python dump_bin.py dump_all --help
+    python scripts/dump_bin.py dump_all --help

-After conversion, users can find their Qlib format data in the directory `~/.qlib/qlib_data/my_data`.
+After conversion, users can find their Qlib format data in the directory `~/.qlib/qlib_data/`.

 .. note::

-    The arguments of `--include_fields` should correspond with the column names of CSV files. The columns names of dataset provided by ``Qlib`` should include open, close, high, low, volume and factor at least.
+    The arguments of `--include_fields` should correspond with the column names of CSV or Parquet files. The columns names of dataset provided by ``Qlib`` should include open, close, high, low, volume and factor at least.

 - `open`
     The adjusted opening price

@@ -195,7 +195,7 @@ After conversion, users can find their Qlib format data in the directory `~/.qli
     The Restoration factor. Normally, ``factor = adjusted_price / original_price``, `adjusted price` reference: `split adjusted <https://www.investopedia.com/terms/s/splitadjusted.asp>`_

 In the convention of `Qlib` data processing, `open, close, high, low, volume, money and factor` will be set to NaN if the stock is suspended.
-If you want to use your own alpha-factor which can't be calculate by OCHLV, like PE, EPS and so on, you could add it to the CSV files with OHCLV together and then dump it to the Qlib format data.
+If you want to use your own alpha-factor which can't be calculate by OCHLV, like PE, EPS and so on, you could add it to the CSV or Parquet files with OHCLV together and then dump it to the Qlib format data.

 Checking the health of the data
 -------------------------------
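The input layout this diff documents (a symbol column, a date column, and at least the OHLCV and factor fields) can be produced with pandas. A minimal sketch, with illustrative ticker and price values; the resulting file should be something `scripts/dump_bin.py dump_all` accepts with the matching `--file_suffix`:

```python
import pandas as pd

# Columns follow the documented requirements: a symbol column, a date
# column, and at least open, close, high, low, volume, factor.
df = pd.DataFrame(
    {
        "symbol": ["SH600000", "SH600000"],
        "date": ["2021-05-20", "2021-05-21"],
        "open": [10.0, 10.2],
        "close": [10.1, 10.3],
        "high": [10.3, 10.4],
        "low": [9.9, 10.1],
        "volume": [120, 130],
        "factor": [1.0, 1.0],
    }
)

df.to_csv("SH600000.csv", index=False)  # for --file_suffix .csv
# df.to_parquet("SH600000.parquet", index=False)  # needs pyarrow; for --file_suffix .parquet
```

Since the file is named after the stock, `--symbol_field_name` could be omitted; keeping the `symbol` column also works with `--symbol_field_name symbol`.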

docs/component/workflow.rst
Lines changed: 1 addition & 1 deletion

@@ -110,7 +110,7 @@ If users want to use ``qrun`` under debug mode, please use the following command

 .. code-block:: bash

-    python -m pdb qlib/workflow/cli.py examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml
+    python -m pdb qlib/cli/run.py examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml

 .. note::

docs/developer/how_to_build_image.rst
Lines changed: 1 addition & 1 deletion

@@ -52,7 +52,7 @@ How to use qlib images
    .. code-block:: bash

        >>> python scripts/get_data.py qlib_data --name qlib_data_simple --target_dir ~/.qlib/qlib_data/cn_data --interval 1d --region cn
-       >>> python qlib/workflow/cli.py examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml
+       >>> python qlib/cli/run.py examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml

 3. Exit the container

examples/rl_order_execution/README.md
Lines changed: 1 addition & 1 deletion

@@ -7,7 +7,7 @@ This folder comprises an example of Reinforcement Learning (RL) workflows for or
 ### Get Data

 ```
-python -m qlib.run.get_data qlib_data qlib_data --target_dir ./data/bin --region hs300 --interval 5min
+python -m qlib.cli.data qlib_data --target_dir ./data/bin --region hs300 --interval 5min
 ```

 ### Generate Pickle-Style Data

pyproject.toml
Lines changed: 1 addition & 1 deletion

@@ -103,4 +103,4 @@ packages = [
 ]

 [project.scripts]
-qrun = "qlib.workflow.cli:run"
+qrun = "qlib.cli.run:run"
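A `[project.scripts]` entry maps a console command to a `module:callable` string, so after this change the installed `qrun` command invokes the `run` function in `qlib.cli.run`. A small sketch of how such a string is resolved (demonstrated here with a stdlib callable, not Qlib itself):

```python
import importlib


def resolve_entry_point(spec: str):
    """Resolve a 'module:attr' entry-point string, the same shape
    used by [project.scripts] entries like 'qlib.cli.run:run'."""
    module_name, _, attr = spec.partition(":")
    module = importlib.import_module(module_name)
    return getattr(module, attr)


# Resolves to os.path.join; 'qlib.cli.run:run' would resolve the same way
# once qlib is installed.
func = resolve_entry_point("os.path:join")
print(func("a", "b"))
```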

qlib/backtest/exchange.py
Lines changed: 1 addition & 0 deletions

@@ -897,6 +897,7 @@ def _calc_trade_info_by_order(
         # if we don't know current position, we choose to sell all
         # Otherwise, we clip the amount based on current position
         if position is not None:
+            # TODO: make the trading shortable
             current_amount = (
                 position.get_stock_amount(order.stock_id) if position.check_stock(order.stock_id) else 0
             )
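The TODO added here marks the branch where a sell order is clipped to the amount currently held, which is exactly what rules out shorting. A minimal sketch of that clipping logic, with simplified names (a plain dict stands in for Qlib's actual `Position` object):

```python
from typing import Optional


def clip_sell_amount(order_amount: float, position: Optional[dict], stock_id: str) -> float:
    """Clip a sell order to the currently held amount.

    If the position is unknown (None), the requested amount is kept
    (the source comment says "we choose to sell all"); otherwise never
    sell more than is held. Allowing the result to exceed the holding
    is what "make the trading shortable" would require.
    """
    if position is None:
        return order_amount
    current_amount = position.get(stock_id, 0)
    return min(order_amount, current_amount)


print(clip_sell_amount(150.0, {"SH600000": 100.0}, "SH600000"))  # clipped to 100.0
```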
File renamed without changes.
