Csi500 example (#1126)

you-n-g · web-flow · commit 58540f76eed4 · 2022-06-15T10:18:13.000+08:00
* Stage code

* Update results and scripts
diff --git a/examples/benchmarks/LightGBM/requirements.txt b/examples/benchmarks/LightGBM/requirements.txt
@@ -1,3 +1,3 @@
 pandas==1.1.2
 numpy==1.21.0
-lightgbm==3.1.0
+lightgbm
diff --git a/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158_csi500.yaml b/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158_csi500.yaml
@@ -0,0 +1,72 @@
+qlib_init:
+    provider_uri: "~/.qlib/qlib_data/cn_data"
+    region: cn
+market: &market csi500
+benchmark: &benchmark SH000905
+data_handler_config: &data_handler_config
+    start_time: 2008-01-01
+    end_time: 2020-08-01
+    fit_start_time: 2008-01-01
+    fit_end_time: 2014-12-31
+    instruments: *market
+port_analysis_config: &port_analysis_config
+    strategy:
+        class: TopkDropoutStrategy
+        module_path: qlib.contrib.strategy
+        kwargs:
+            model: <MODEL> 
+            dataset: <DATASET>
+            topk: 50
+            n_drop: 5
+    backtest:
+        start_time: 2017-01-01
+        end_time: 2020-08-01
+        account: 100000000
+        benchmark: *benchmark
+        exchange_kwargs:
+            limit_threshold: 0.095
+            deal_price: close
+            open_cost: 0.0005
+            close_cost: 0.0015
+            min_cost: 5
+task:
+    model:
+        class: LGBModel
+        module_path: qlib.contrib.model.gbdt
+        kwargs:
+            loss: mse
+            colsample_bytree: 0.8879
+            learning_rate: 0.2
+            subsample: 0.8789
+            lambda_l1: 205.6999
+            lambda_l2: 580.9768
+            max_depth: 8
+            num_leaves: 210
+            num_threads: 20
+    dataset:
+        class: DatasetH
+        module_path: qlib.data.dataset
+        kwargs:
+            handler:
+                class: Alpha158
+                module_path: qlib.contrib.data.handler
+                kwargs: *data_handler_config
+            segments:
+                train: [2008-01-01, 2014-12-31]
+                valid: [2015-01-01, 2016-12-31]
+                test: [2017-01-01, 2020-08-01]
+    record: 
+        - class: SignalRecord
+          module_path: qlib.workflow.record_temp
+          kwargs: 
+            model: <MODEL>
+            dataset: <DATASET>
+        - class: SigAnaRecord
+          module_path: qlib.workflow.record_temp
+          kwargs: 
+            ana_long_short: False
+            ann_scaler: 252
+        - class: PortAnaRecord
+          module_path: qlib.workflow.record_temp
+          kwargs: 
+            config: *port_analysis_config
diff --git a/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha360_csi500.yaml b/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha360_csi500.yaml
@@ -0,0 +1,80 @@
+qlib_init:
+    provider_uri: "~/.qlib/qlib_data/cn_data"
+    region: cn
+market: &market csi500
+benchmark: &benchmark SH000905
+data_handler_config: &data_handler_config
+    start_time: 2008-01-01
+    end_time: 2020-08-01
+    fit_start_time: 2008-01-01
+    fit_end_time: 2014-12-31
+    instruments: *market
+    infer_processors: []
+    learn_processors:
+        - class: DropnaLabel
+        - class: CSRankNorm
+          kwargs:
+              fields_group: label
+    label: ["Ref($close, -2) / Ref($close, -1) - 1"]
+port_analysis_config: &port_analysis_config
+    strategy:
+        class: TopkDropoutStrategy
+        module_path: qlib.contrib.strategy
+        kwargs:
+            signal:
+                - <MODEL> 
+                - <DATASET>
+            topk: 50
+            n_drop: 5
+    backtest:
+        start_time: 2017-01-01
+        end_time: 2020-08-01
+        account: 100000000
+        benchmark: *benchmark
+        exchange_kwargs:
+            limit_threshold: 0.095
+            deal_price: close
+            open_cost: 0.0005
+            close_cost: 0.0015
+            min_cost: 5
+task:
+    model:
+        class: LGBModel
+        module_path: qlib.contrib.model.gbdt
+        kwargs:
+            loss: mse
+            colsample_bytree: 0.8879
+            learning_rate: 0.0421
+            subsample: 0.8789
+            lambda_l1: 205.6999
+            lambda_l2: 580.9768
+            max_depth: 8
+            num_leaves: 210
+            num_threads: 20
+    dataset:
+        class: DatasetH
+        module_path: qlib.data.dataset
+        kwargs:
+            handler:
+                class: Alpha360
+                module_path: qlib.contrib.data.handler
+                kwargs: *data_handler_config
+            segments:
+                train: [2008-01-01, 2014-12-31]
+                valid: [2015-01-01, 2016-12-31]
+                test: [2017-01-01, 2020-08-01]
+    record: 
+        - class: SignalRecord
+          module_path: qlib.workflow.record_temp
+          kwargs: 
+            model: <MODEL>
+            dataset: <DATASET>
+        - class: SigAnaRecord
+          module_path: qlib.workflow.record_temp
+          kwargs: 
+            ana_long_short: False
+            ann_scaler: 252
+        - class: PortAnaRecord
+          module_path: qlib.workflow.record_temp
+          kwargs: 
+            config: *port_analysis_config
diff --git a/examples/benchmarks/README.md b/examples/benchmarks/README.md
@@ -20,7 +20,9 @@ The numbers shown below demonstrate the performance of the entire `workflow` of
 > NOTE:
 > We have very limited resources to implement and finetune the models. We tried our best effort to fairly compare these models.  But some models may have greater potential than what it looks like in the table below.  Your contribution is highly welcomed to explore their potential.
 
-## Alpha158 dataset
+## Results on CSI300
+
+### Alpha158 dataset
 
 | Model Name                               | Dataset                             | IC          | ICIR        | Rank IC     | Rank ICIR   | Annualized Return | Information Ratio | Max Drawdown |
 |------------------------------------------|-------------------------------------|-------------|-------------|-------------|-------------|-------------------|-------------------|--------------|
@@ -44,7 +46,7 @@ The numbers shown below demonstrate the performance of the entire `workflow` of
 | DoubleEnsemble(Chuheng Zhang, et al.)    | Alpha158                            | 0.0544±0.00 | 0.4340±0.00 | 0.0523±0.00 | 0.4284±0.01 | 0.1168±0.01       | 1.3384±0.12       | -0.1036±0.01 |
 
 
-## Alpha360 dataset
+### Alpha360 dataset
 
 | Model Name                                | Dataset  | IC          | ICIR        | Rank IC     | Rank ICIR   | Annualized Return | Information Ratio | Max Drawdown |
 |-------------------------------------------|----------|-------------|-------------|-------------|-------------|-------------------|-------------------|--------------|
@@ -79,6 +81,38 @@ The numbers shown below demonstrate the performance of the entire `workflow` of
    - Signal-based evaluation:  IC, ICIR, Rank IC, Rank ICIR
    - Portfolio-based metrics:  Annualized Return, Information Ratio, Max Drawdown
 
+## Results on CSI500
+The results on CSI500 are not complete. PRs for models on CSI500 are welcome!
+
+Transferring previous models from CSI300 to CSI500 is quite easy.  You can try models with just a few commands below.
+```
+cd examples/benchmarks/LightGBM
+pip install -r requirements.txt
+
+# create new config and set the benchmark to csi500
+cp workflow_config_lightgbm_Alpha158.yaml workflow_config_lightgbm_Alpha158_csi500.yaml
+sed -i "s/csi300/csi500/g"  workflow_config_lightgbm_Alpha158_csi500.yaml
+sed -i "s/SH000300/SH000905/g"  workflow_config_lightgbm_Alpha158_csi500.yaml
+
+# you can either run the model once
+qrun workflow_config_lightgbm_Alpha158_csi500.yaml
+
+# or run it multiple times automatically and get the summarized results.
+cd  ../../
+python run_all_model.py run 3 lightgbm Alpha158 csi500  # for models with randomness, please run it 20 times.
+```
+
+### Alpha158 dataset
+
+| Model Name | Dataset  | IC          | ICIR        | Rank IC     | Rank ICIR   | Annualized Return | Information Ratio | Max Drawdown |
+|------------|----------|-------------|-------------|-------------|-------------|-------------------|-------------------|--------------|
+| LightGBM   | Alpha158 | 0.0377±0.00 | 0.3860±0.00 | 0.0448±0.00 | 0.4675±0.00 | 0.1151±0.00       | 1.3884±0.00       | -0.0898±0.00 |
+
+### Alpha360 dataset
+| Model Name | Dataset  | IC          | ICIR        | Rank IC     | Rank ICIR   | Annualized Return | Information Ratio | Max Drawdown |
+|------------|----------|-------------|-------------|-------------|-------------|-------------------|-------------------|--------------|
+| LightGBM   | Alpha360 | 0.0400±0.00 | 0.3605±0.00 | 0.0536±0.00 | 0.5431±0.00 | 0.0505±0.00       | 0.7658±0.02       | -0.1880±0.00 |
+
 
 # Contributing
 
diff --git a/examples/run_all_model.py b/examples/run_all_model.py
@@ -117,8 +117,10 @@ def get_all_folders(models, exclude) -> dict:
 
 
 # function to get all the files under the model folder
-def get_all_files(folder_path, dataset) -> (str, str):
-    yaml_path = str(Path(f"{folder_path}") / f"*{dataset}*.yaml")
+def get_all_files(folder_path, dataset, universe="") -> (str, str):
+    if universe != "":
+        universe = f"_{universe}"
+    yaml_path = str(Path(f"{folder_path}") / f"*{dataset}{universe}.yaml")
     req_path = str(Path(f"{folder_path}") / f"*.txt")
     yaml_file = glob.glob(yaml_path)
     req_file = glob.glob(req_path)
@@ -224,6 +226,7 @@ def run(
         times=1,
         models=None,
         dataset="Alpha360",
+        universe="",
         exclude=False,
         qlib_uri: str = "git+https://github.com/microsoft/qlib#egg=pyqlib",
         exp_folder_name: str = "run_all_model_records",
@@ -245,6 +248,9 @@ def run(
             determines whether the model being used is excluded or included.
         dataset : str
             determines the dataset to be used for each model.
+        universe  : str
+            the stock universe of the dataset.
+            default "" indicates that no universe suffix is appended, i.e. the config named `*{dataset}.yaml` is used.
         qlib_uri : str
             the uri to install qlib with pip
             it could be url on the we or local path (NOTE: the local path must be a absolute path)
@@ -259,6 +265,15 @@ def run(
         -------
         Here are some use cases of the function in the bash:
 
+        The run_all_model script will decide which config to run based on `models`, `dataset` and `universe`.
+        Example 1):
+
+            models="lightgbm", dataset="Alpha158", universe="" will result in running the following config
+            examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml
+
+            models="lightgbm", dataset="Alpha158", universe="csi500" will result in running the following config
+            examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158_csi500.yaml
+
         .. code-block:: bash
 
             # Case 1 - run all models multiple times
@@ -279,6 +294,9 @@ def run(
             # Case 6 - run other models except those are given as arguments for one time
             python run_all_model.py run --models=[mlp,tft,sfm] --exclude=True
 
+            # Case 7 - run lightgbm model on csi500.
+            python run_all_model.py run 3 lightgbm Alpha158 csi500
+
         """
         self._init_qlib(exp_folder_name)
 
@@ -290,7 +308,7 @@ def run(
         for fn in folders:
             # get all files
             sys.stderr.write("Retrieving files...\n")
-            yaml_path, req_path = get_all_files(folders[fn], dataset)
+            yaml_path, req_path = get_all_files(folders[fn], dataset, universe=universe)
             if yaml_path is None:
                 sys.stderr.write(f"There is no {dataset}.yaml file in {folders[fn]}")
                 continue