66import logging
77import re
88from io import BytesIO
9- from typing import cast
109
1110import pandas as pd
1211
13- from graphrag .index . config .input import PipelineCSVInputConfig , PipelineInputConfig
12+ from graphrag .config .models . input_config import InputConfig
1413from graphrag .index .utils .hashing import gen_sha512_hash
1514from graphrag .logger .base import ProgressLogger
1615from graphrag .storage .pipeline_storage import PipelineStorage
2322
2423
2524async def load (
26- config : PipelineInputConfig ,
25+ config : InputConfig ,
2726 progress : ProgressLogger | None ,
2827 storage : PipelineStorage ,
2928) -> pd .DataFrame :
3029 """Load csv inputs from a directory."""
31- csv_config = cast ("PipelineCSVInputConfig" , config )
32- log .info ("Loading csv files from %s" , csv_config .base_dir )
30+ log .info ("Loading csv files from %s" , config .base_dir )
3331
3432 async def load_file (path : str , group : dict | None ) -> pd .DataFrame :
3533 if group is None :
@@ -43,51 +41,49 @@ async def load_file(path: str, group: dict | None) -> pd.DataFrame:
4341 )
4442 if "id" not in data .columns :
4543 data ["id" ] = data .apply (lambda x : gen_sha512_hash (x , x .keys ()), axis = 1 )
46- if csv_config .source_column is not None and "source" not in data .columns :
47- if csv_config .source_column not in data .columns :
44+ if config .source_column is not None and "source" not in data .columns :
45+ if config .source_column not in data .columns :
4846 log .warning (
4947 "source_column %s not found in csv file %s" ,
50- csv_config .source_column ,
48+ config .source_column ,
5149 path ,
5250 )
5351 else :
54- data ["source" ] = data .apply (
55- lambda x : x [csv_config .source_column ], axis = 1
56- )
57- if csv_config .text_column is not None and "text" not in data .columns :
58- if csv_config .text_column not in data .columns :
52+ data ["source" ] = data .apply (lambda x : x [config .source_column ], axis = 1 )
53+ if config .text_column is not None and "text" not in data .columns :
54+ if config .text_column not in data .columns :
5955 log .warning (
6056 "text_column %s not found in csv file %s" ,
61- csv_config .text_column ,
57+ config .text_column ,
6258 path ,
6359 )
6460 else :
65- data ["text" ] = data .apply (lambda x : x [csv_config .text_column ], axis = 1 )
66- if csv_config .title_column is not None and "title" not in data .columns :
67- if csv_config .title_column not in data .columns :
61+ data ["text" ] = data .apply (lambda x : x [config .text_column ], axis = 1 )
62+ if config .title_column is not None and "title" not in data .columns :
63+ if config .title_column not in data .columns :
6864 log .warning (
6965 "title_column %s not found in csv file %s" ,
70- csv_config .title_column ,
66+ config .title_column ,
7167 path ,
7268 )
7369 else :
74- data ["title" ] = data .apply (lambda x : x [csv_config .title_column ], axis = 1 )
70+ data ["title" ] = data .apply (lambda x : x [config .title_column ], axis = 1 )
7571
76- if csv_config .timestamp_column is not None :
77- fmt = csv_config .timestamp_format
72+ if config .timestamp_column is not None :
73+ fmt = config .timestamp_format
7874 if fmt is None :
7975 msg = "Must specify timestamp_format if timestamp_column is specified"
8076 raise ValueError (msg )
8177
82- if csv_config .timestamp_column not in data .columns :
78+ if config .timestamp_column not in data .columns :
8379 log .warning (
8480 "timestamp_column %s not found in csv file %s" ,
85- csv_config .timestamp_column ,
81+ config .timestamp_column ,
8682 path ,
8783 )
8884 else :
8985 data ["timestamp" ] = pd .to_datetime (
90- data [csv_config .timestamp_column ], format = fmt
86+ data [config .timestamp_column ], format = fmt
9187 )
9288
9389 # TODO: There's probably a less gross way to do this
0 commit comments