22# Licensed under the MIT License.
33
44# coding=utf-8
5+ from abc import abstractmethod
56import warnings
67from typing import Callable , Union , Tuple , List , Iterator , Optional
78
1920from . import loader as data_loader_module
2021
2122
22- # TODO: A more general handler interface which does not relies on internal pd.DataFrame is needed.
23- class DataHandler (Serializable ):
# Allowed values for the `data_key` argument of the handlers' fetch methods:
#   "raw"   -> unprocessed data; "infer" -> data prepared for inference;
#   "learn" -> data prepared for model learning.
# See the DK_R / DK_I / DK_L constants, which are annotated with this alias.
DATA_KEY_TYPE = Literal ["raw" , "infer" , "learn" ]
24+
25+
class DataHandlerABC(Serializable):
    """
    Abstract interface for data handlers.

    This class makes no assumption about how a handler stores its data
    internally; it only fixes the fetch-oriented interface exposed to external
    users (the fetch result is a ``pd.DataFrame``).

    Guidelines for future refactoring of the concrete implementations — a
    handler is a composition of several components:

    - [data loader] -> internal representation of the data -> data
      preprocessing -> interface adaptor for the fetch interface
    - plus a workflow that combines them all.  Such workflows can get very
      complicated; ``DataHandlerLP`` is one practice, but it cannot satisfy
      every requirement, so leaving workflow flexibility to the user is the
      more reasonable choice.
    """

    # Sentinels accepted by ``fetch(col_set=...)``.
    CS_ALL = "__all"  # return all columns with single-level index column
    CS_RAW = "__raw"  # return raw data with multi-level index column

    # Sentinels accepted by ``fetch(data_key=...)``.
    DK_R: DATA_KEY_TYPE = "raw"
    DK_I: DATA_KEY_TYPE = "infer"
    DK_L: DATA_KEY_TYPE = "learn"

    def __init__(self, *args, **kwargs):
        """
        Subclasses should make themselves ready for fetching here; this base
        implementation only forwards to ``Serializable``.
        """
        super().__init__(*args, **kwargs)

    # NOTE(review): @abstractmethod is only enforced at instantiation time if
    # Serializable's metaclass is (or derives from) ABCMeta — confirm.
    @abstractmethod
    def fetch(
        self,
        selector: Union[pd.Timestamp, slice, str, pd.Index] = slice(None, None),
        level: Union[str, int] = "datetime",
        col_set: Union[str, List[str]] = CS_ALL,
        data_key: DATA_KEY_TYPE = DK_I,
    ) -> pd.DataFrame:
        pass
66+
67+
68+ class DataHandler (DataHandlerABC ):
2469 """
70+ The motivation of DataHandler:
71+
72+ - It provides an implementation of BaseDataHandler that we implement with:
73+ - Handling responses with an internal loaded DataFrame
74+ - The DataFrame is loaded by a data loader.
75+
2576 The steps to using a handler
2677 1. initialized data handler (call by `init`).
2778 2. use the data.
@@ -144,16 +195,14 @@ def setup_data(self, enable_cache: bool = False):
144195 self ._data = lazy_sort_index (self .data_loader .load (self .instruments , self .start_time , self .end_time ))
145196 # TODO: cache
146197
147- CS_ALL = "__all" # return all columns with single-level index column
148- CS_RAW = "__raw" # return raw data with multi-level index column
149-
150198 def fetch (
151199 self ,
152200 selector : Union [pd .Timestamp , slice , str , pd .Index ] = slice (None , None ),
153201 level : Union [str , int ] = "datetime" ,
154- col_set : Union [str , List [str ]] = CS_ALL ,
202+ col_set : Union [str , List [str ]] = DataHandlerABC .CS_ALL ,
203+ data_key : DATA_KEY_TYPE = DataHandlerABC .DK_I ,
155204 squeeze : bool = False ,
156- proc_func : Callable = None ,
205+ proc_func : Optional [ Callable ] = None ,
157206 ) -> pd .DataFrame :
158207 """
159208 fetch data from underlying data source
@@ -216,6 +265,8 @@ def fetch(
216265 -------
217266 pd.DataFrame.
218267 """
268+ # DataHandler is an example with only one dataframe, so data_key is not used.
269+ _ = data_key # avoid linting errors (e.g., unused-argument)
219270 return self ._fetch_data (
220271 data_storage = self ._data ,
221272 selector = selector ,
@@ -230,7 +281,7 @@ def _fetch_data(
230281 data_storage ,
231282 selector : Union [pd .Timestamp , slice , str , pd .Index ] = slice (None , None ),
232283 level : Union [str , int ] = "datetime" ,
233- col_set : Union [str , List [str ]] = CS_ALL ,
284+ col_set : Union [str , List [str ]] = DataHandlerABC . CS_ALL ,
234285 squeeze : bool = False ,
235286 proc_func : Callable = None ,
236287 ):
@@ -261,16 +312,9 @@ def _fetch_data(
261312 data_df = fetch_df_by_col (data_df , col_set )
262313 data_df = fetch_df_by_index (data_df , selector , level , fetch_orig = self .fetch_orig )
263314 elif isinstance (data_storage , BaseHandlerStorage ):
264- if not data_storage .is_proc_func_supported ():
265- if proc_func is not None :
266- raise ValueError (f"proc_func is not supported by the storage { type (data_storage )} " )
267- data_df = data_storage .fetch (
268- selector = selector , level = level , col_set = col_set , fetch_orig = self .fetch_orig
269- )
270- else :
271- data_df = data_storage .fetch (
272- selector = selector , level = level , col_set = col_set , fetch_orig = self .fetch_orig , proc_func = proc_func
273- )
315+ if proc_func is not None :
316+ raise ValueError (f"proc_func is not supported by the storage { type (data_storage )} " )
317+ data_df = data_storage .fetch (selector = selector , level = level , col_set = col_set , fetch_orig = self .fetch_orig )
274318 else :
275319 raise TypeError (f"data_storage should be pd.DataFrame|HashingStockStorage, not { type (data_storage )} " )
276320
@@ -282,7 +326,7 @@ def _fetch_data(
282326 data_df = data_df .reset_index (level = level , drop = True )
283327 return data_df
284328
285- def get_cols (self , col_set = CS_ALL ) -> list :
329+ def get_cols (self , col_set = DataHandlerABC . CS_ALL ) -> list :
286330 """
287331 get the column names
288332
@@ -336,11 +380,12 @@ def get_range_iterator(
336380 yield cur_date , self .fetch (selector , ** kwargs )
337381
338382
339- DATA_KEY_TYPE = Literal ["raw" , "infer" , "learn" ]
340-
341-
342383class DataHandlerLP (DataHandler ):
343384 """
385+ Motivation:
386+ - For cases where we want to use different processor workflows for learning and inference;
387+
388+
344389 DataHandler with **(L)earnable (P)rocessor**
345390
346391 This handler will produce three pieces of data in pd.DataFrame format.
@@ -374,12 +419,8 @@ class DataHandlerLP(DataHandler):
374419 _infer : pd .DataFrame # data for inference
375420 _learn : pd .DataFrame # data for learning models
376421
377- # data key
378- DK_R : DATA_KEY_TYPE = "raw"
379- DK_I : DATA_KEY_TYPE = "infer"
380- DK_L : DATA_KEY_TYPE = "learn"
381422 # map data_key to attribute name
382- ATTR_MAP = {DK_R : "_data" , DK_I : "_infer" , DK_L : "_learn" }
423+ ATTR_MAP = {DataHandler . DK_R : "_data" , DataHandler . DK_I : "_infer" , DataHandler . DK_L : "_learn" }
383424
384425 # process type
385426 PTYPE_I = "independent"
@@ -622,7 +663,7 @@ def setup_data(self, init_type: str = IT_FIT_SEQ, **kwargs):
622663
623664 # TODO: Be able to cache handler data. Save the memory for data processing
624665
625- def _get_df_by_key (self , data_key : DATA_KEY_TYPE = DK_I ) -> pd .DataFrame :
666+ def _get_df_by_key (self , data_key : DATA_KEY_TYPE = DataHandlerABC . DK_I ) -> pd .DataFrame :
626667 if data_key == self .DK_R and self .drop_raw :
627668 raise AttributeError (
628669 "DataHandlerLP has not attribute _data, please set drop_raw = False if you want to use raw data"
@@ -635,7 +676,7 @@ def fetch(
635676 selector : Union [pd .Timestamp , slice , str ] = slice (None , None ),
636677 level : Union [str , int ] = "datetime" ,
637678 col_set = DataHandler .CS_ALL ,
638- data_key : DATA_KEY_TYPE = DK_I ,
679+ data_key : DATA_KEY_TYPE = DataHandler . DK_I ,
639680 squeeze : bool = False ,
640681 proc_func : Callable = None ,
641682 ) -> pd .DataFrame :
@@ -669,7 +710,7 @@ def fetch(
669710 proc_func = proc_func ,
670711 )
671712
672- def get_cols (self , col_set = DataHandler .CS_ALL , data_key : DATA_KEY_TYPE = DK_I ) -> list :
713+ def get_cols (self , col_set = DataHandler .CS_ALL , data_key : DATA_KEY_TYPE = DataHandlerABC . DK_I ) -> list :
673714 """
674715 get the column names
675716
0 commit comments