- "value": "from copy import deepcopy\nfrom typing import Any\n\nfrom langflow.base.data.base_file import BaseFileComponent\nfrom langflow.base.data.utils import TEXT_FILE_TYPES, parallel_load_data, parse_text_file_to_data\nfrom langflow.io import BoolInput, FileInput, IntInput, Output\nfrom langflow.schema.data import Data\n\n\nclass FileComponent(BaseFileComponent):\n \"\"\"Handles loading and processing of individual or zipped text files.\n\n This component supports processing multiple valid files within a zip archive,\n resolving paths, validating file types, and optionally using multithreading for processing.\n \"\"\"\n\n display_name = \"File\"\n description = \"Loads content from one or more files as a DataFrame.\"\n documentation: str = \"https://docs.langflow.org/components-data#file\"\n icon = \"file-text\"\n name = \"File\"\n\n VALID_EXTENSIONS = TEXT_FILE_TYPES\n\n _base_inputs = deepcopy(BaseFileComponent._base_inputs)\n\n for input_item in _base_inputs:\n if isinstance(input_item, FileInput) and input_item.name == \"path\":\n input_item.real_time_refresh = True\n break\n\n inputs = [\n *_base_inputs,\n BoolInput(\n name=\"use_multithreading\",\n display_name=\"[Deprecated] Use Multithreading\",\n advanced=True,\n value=True,\n info=\"Set 'Processing Concurrency' greater than 1 to enable multithreading.\",\n ),\n IntInput(\n name=\"concurrency_multithreading\",\n display_name=\"Processing Concurrency\",\n advanced=True,\n info=\"When multiple files are being processed, the number of files to process concurrently.\",\n value=1,\n ),\n ]\n\n outputs = [\n Output(display_name=\"Raw Content\", name=\"message\", method=\"load_files_message\"),\n ]\n\n def update_outputs(self, frontend_node: dict, field_name: str, field_value: Any) -> dict:\n \"\"\"Dynamically show only the relevant output based on the number of files processed.\"\"\"\n if field_name == \"path\":\n # Add outputs based on the number of files in the path\n if len(field_value) == 0:\n return frontend_node\n\n frontend_node[\"outputs\"] = []\n\n if len(field_value) == 1:\n # We need to check if the file is structured content\n file_path = frontend_node[\"template\"][\"path\"][\"file_path\"][0]\n if file_path.endswith((\".csv\", \".xlsx\", \".parquet\")):\n frontend_node[\"outputs\"].append(\n Output(display_name=\"Structured Content\", name=\"dataframe\", method=\"load_files_structured\"),\n )\n elif file_path.endswith(\".json\"):\n frontend_node[\"outputs\"].append(\n Output(display_name=\"Structured Content\", name=\"json\", method=\"load_files_json\"),\n )\n\n # All files get the raw content and path outputs\n frontend_node[\"outputs\"].append(\n Output(display_name=\"Raw Content\", name=\"message\", method=\"load_files_message\"),\n )\n frontend_node[\"outputs\"].append(\n Output(display_name=\"File Path\", name=\"path\", method=\"load_files_path\"),\n )\n else:\n # For multiple files, we only show the files output\n frontend_node[\"outputs\"].append(\n Output(display_name=\"Files\", name=\"dataframe\", method=\"load_files\"),\n )\n\n return frontend_node\n\n def process_files(self, file_list: list[BaseFileComponent.BaseFile]) -> list[BaseFileComponent.BaseFile]:\n \"\"\"Processes files either sequentially or in parallel, depending on concurrency settings.\n\n Args:\n file_list (list[BaseFileComponent.BaseFile]): List of files to process.\n\n Returns:\n list[BaseFileComponent.BaseFile]: Updated list of files with merged data.\n \"\"\"\n\n def process_file(file_path: str, *, silent_errors: bool = False) -> Data | None:\n \"\"\"Processes a single file and returns its Data object.\"\"\"\n try:\n return parse_text_file_to_data(file_path, silent_errors=silent_errors)\n except FileNotFoundError as e:\n msg = f\"File not found: {file_path}. Error: {e}\"\n self.log(msg)\n if not silent_errors:\n raise\n return None\n except Exception as e:\n msg = f\"Unexpected error processing {file_path}: {e}\"\n self.log(msg)\n if not silent_errors:\n raise\n return None\n\n if not file_list:\n msg = \"No files to process.\"\n raise ValueError(msg)\n\n concurrency = 1 if not self.use_multithreading else max(1, self.concurrency_multithreading)\n file_count = len(file_list)\n\n parallel_processing_threshold = 2\n if concurrency < parallel_processing_threshold or file_count < parallel_processing_threshold:\n if file_count > 1:\n self.log(f\"Processing {file_count} files sequentially.\")\n processed_data = [process_file(str(file.path), silent_errors=self.silent_errors) for file in file_list]\n else:\n self.log(f\"Starting parallel processing of {file_count} files with concurrency: {concurrency}.\")\n file_paths = [str(file.path) for file in file_list]\n processed_data = parallel_load_data(\n file_paths,\n silent_errors=self.silent_errors,\n load_function=process_file,\n max_concurrency=concurrency,\n )\n\n # Use rollup_basefile_data to merge processed data with BaseFile objects\n return self.rollup_data(file_list, processed_data)\n"
0 commit comments