diff --git a/configurations/clients/sample/tools.yaml b/configurations/clients/sample/tools.yaml index d61b14ef..2d0dee7d 100644 --- a/configurations/clients/sample/tools.yaml +++ b/configurations/clients/sample/tools.yaml @@ -70,6 +70,7 @@ tools: fake_call: tool_call_id: *available_datasets_call_id version: full + include_indicator_count: true - type: dataset_structure <<: *shared_settings diff --git a/statgpt/app/chains/datasets_meta/available_datasets_tool.py b/statgpt/app/chains/datasets_meta/available_datasets_tool.py index 2e34f9f2..c6093eb1 100644 --- a/statgpt/app/chains/datasets_meta/available_datasets_tool.py +++ b/statgpt/app/chains/datasets_meta/available_datasets_tool.py @@ -27,9 +27,19 @@ async def _arun(self, inputs: dict) -> tuple[str, ToolArtifact]: versioned_datasets = await data_service.list_available_datasets(auth_context) datasets = [ds.data for ds in versioned_datasets] + indicator_counts: dict[str, int] | None = None + if self._tool_config.details.include_indicator_count: + indicator_counts = await data_service.get_indicator_counts( + auth_context, versioned_datasets + ) + formatter = DatasetsListFormatter(self._dataset_formatter_config, auth_context=auth_context) response = await formatter.format( - datasets, sort_by_name=True, add_stats=True, group_by_provider=True + datasets, + sort_by_name=True, + add_stats=True, + group_by_provider=True, + indicator_counts=indicator_counts, ) target = ChainParameters.get_target(inputs) diff --git a/statgpt/app/services/chat_facade.py b/statgpt/app/services/chat_facade.py index ad948228..2ab724bd 100644 --- a/statgpt/app/services/chat_facade.py +++ b/statgpt/app/services/chat_facade.py @@ -411,6 +411,21 @@ async def _load_datasets(self, auth_context: AuthContext) -> list[VersionedDataS async def list_available_datasets(self, auth_context: AuthContext) -> list[VersionedDataSet]: return await self._load_datasets(auth_context) + async def get_indicator_counts( + self, auth_context: AuthContext, versioned_datasets: list[VersionedDataSet] + ) -> dict[str, int]: + """Get indicator count per dataset `entity_id`.""" + vector_store = await self._get_indicators_vector_store(auth_context) + version_to_entity = { + ds.version.version_data_id: ds.data.entity_id for ds in versioned_datasets + } + counts_by_version = await vector_store.get_size_per_version(set(version_to_entity.keys())) + return { + version_to_entity[vid]: count + for vid, count in counts_by_version.items() + if vid in version_to_entity + } + async def get_dataset_hierarchy(self, auth_context: AuthContext) -> DatasetHierarchy | None: """Get first available dataset hierarchy from the channel data sources.""" diff --git a/statgpt/app/utils/formatters/dataset_base.py b/statgpt/app/utils/formatters/dataset_base.py index a753fd56..74a1b28d 100644 --- a/statgpt/app/utils/formatters/dataset_base.py +++ b/statgpt/app/utils/formatters/dataset_base.py @@ -58,5 +58,5 @@ async def _get_dataset_update_at(self, dataset: DataSet) -> str: return last_updated @abstractmethod - async def format(self, dataset: DataSet) -> str: + async def format(self, dataset: DataSet, indicator_count: int | None = None) -> str: pass diff --git a/statgpt/app/utils/formatters/dataset_detailed.py b/statgpt/app/utils/formatters/dataset_detailed.py index 528fd5b4..7a52c48b 100644 --- a/statgpt/app/utils/formatters/dataset_detailed.py +++ b/statgpt/app/utils/formatters/dataset_detailed.py @@ -93,9 +93,9 @@ def _append_attributes(self, dataset: DataSet, result: list[str]) -> None: f'{attribute_details_tabs}- {self._("Description")}: {attr.description}' ) - async def format(self, dataset: DataSet) -> str: + async def format(self, dataset: DataSet, indicator_count: int | None = None) -> str: result: list[str] = [] - await self._append_basic_info(dataset, result) + await self._append_basic_info(dataset, result, indicator_count=indicator_count) self._append_dimensions(dataset, result) self._append_attributes(dataset, result) return "\n".join(result) diff --git a/statgpt/app/utils/formatters/dataset_simple.py b/statgpt/app/utils/formatters/dataset_simple.py index 27048c3f..61b5a688 100644 --- a/statgpt/app/utils/formatters/dataset_simple.py +++ b/statgpt/app/utils/formatters/dataset_simple.py @@ -6,7 +6,9 @@ class SimpleDatasetFormatter(BaseDatasetFormatter): - async def _append_basic_info(self, dataset: DataSet, result: list[str]) -> None: + async def _append_basic_info( + self, dataset: DataSet, result: list[str], indicator_count: int | None = None + ) -> None: if self.config.include_name: name_str = f'**{dataset.name}**' if self.config.highlight_name_in_bold else dataset.name if self.config.official_dataset_label and dataset.config.is_official: @@ -50,7 +52,10 @@ async def _append_basic_info(self, dataset: DataSet, result: list[str]) -> None: ).format(citation) result.append(formatted_citation) - async def format(self, dataset: DataSet) -> str: + if indicator_count is not None: + result.append(f'{item_tabs}* {self._("Number of indicators")}: {indicator_count}') + + async def format(self, dataset: DataSet, indicator_count: int | None = None) -> str: result: list[str] = [] - await self._append_basic_info(dataset, result) + await self._append_basic_info(dataset, result, indicator_count=indicator_count) return "\n".join(result) diff --git a/statgpt/app/utils/formatters/datasets_list_formatter.py b/statgpt/app/utils/formatters/datasets_list_formatter.py index 441442f0..0fc69f28 100644 --- a/statgpt/app/utils/formatters/datasets_list_formatter.py +++ b/statgpt/app/utils/formatters/datasets_list_formatter.py @@ -34,6 +34,7 @@ async def format( sort_by_name: bool = False, add_stats: bool = False, group_by_provider: bool = False, + indicator_counts: dict[str, int] | None = None, ) -> str: if sort_by_id and sort_by_name: raise ValueError(self._("Cannot sort by both id and name.")) @@ -50,7 +51,8 @@ async def format( # Format each dataset for dataset in iterable: - entry = await self._formatter.format(dataset) + count = indicator_counts.get(dataset.entity_id) if indicator_counts else None + entry = await self._formatter.format(dataset, indicator_count=count) provider = dataset.config.citation.provider if dataset.config.citation else None dataset_entries[provider].append(entry) @@ -80,6 +82,10 @@ async def format( # Count unique providers providers = [p for p in dataset_entries.keys() if p is not None] stats_header += f'\n{self._("Total providers")}: {len(providers)}' + if indicator_counts is not None: + stats_header += ( + f'\n{self._("Total number of indicators")}: {sum(indicator_counts.values())}' + ) result = f'{stats_header}\n\n{datasets_list}' else: result = datasets_list diff --git a/statgpt/app/utils/formatters/locales/dataset.pot b/statgpt/app/utils/formatters/locales/dataset.pot index 81bb2762..dc98cbc4 100644 --- a/statgpt/app/utils/formatters/locales/dataset.pot +++ b/statgpt/app/utils/formatters/locales/dataset.pot @@ -8,7 +8,7 @@ msgid "" msgstr "" "Project-Id-Version: PACKAGE VERSION\n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2025-11-05 17:19+0200\n" +"POT-Creation-Date: 2026-03-06 14:48+0200\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language-Team: LANGUAGE \n" @@ -17,23 +17,27 @@ msgstr "" "Content-Type: text/plain; charset=CHARSET\n" "Content-Transfer-Encoding: 8bit\n" -#: dataset_simple.py:16 +#: dataset_simple.py:18 msgid "[Official]" msgstr "" -#: dataset_simple.py:28 dataset_query.py:125 dataset_query.py:247 +#: dataset_simple.py:30 dataset_query.py:125 dataset_query.py:247 #: dataset_availablity_query.py:105 msgid "ID" msgstr "" -#: dataset_simple.py:32 +#: dataset_simple.py:34 msgid "Internal ID" msgstr "" -#: dataset_simple.py:41 dataset_detailed.py:59 dataset_detailed.py:93 +#: dataset_simple.py:43 dataset_detailed.py:59 dataset_detailed.py:93 msgid "Description" msgstr "" +#: dataset_simple.py:56 +msgid "Number of indicators" +msgstr "" + #: dataset_detailed.py:21 dataset_detailed.py:31 msgid "Total" msgstr "" @@ -63,40 +67,44 @@ msgstr "" msgid "Attributes" msgstr "" -#: datasets_list_formatter.py:39 +#: datasets_list_formatter.py:40 msgid "Cannot sort by both id and name." msgstr "" -#: datasets_list_formatter.py:62 datasets_list_formatter.py:64 citation.py:28 +#: datasets_list_formatter.py:64 datasets_list_formatter.py:66 citation.py:28 msgid "Provider" msgstr "" -#: datasets_list_formatter.py:64 datasets_list_formatter.py:110 +#: datasets_list_formatter.py:66 datasets_list_formatter.py:116 msgid "Unknown" msgstr "" -#: datasets_list_formatter.py:67 +#: datasets_list_formatter.py:69 msgid "Total datasets from this provider" msgstr "" -#: datasets_list_formatter.py:78 datasets_list_formatter.py:100 -#: datasets_list_formatter.py:105 +#: datasets_list_formatter.py:80 datasets_list_formatter.py:106 +#: datasets_list_formatter.py:111 msgid "Total datasets" msgstr "" -#: datasets_list_formatter.py:82 +#: datasets_list_formatter.py:84 msgid "Total providers" msgstr "" -#: datasets_list_formatter.py:101 +#: datasets_list_formatter.py:87 +msgid "Total number of indicators" +msgstr "" + +#: datasets_list_formatter.py:107 msgid "Official" msgstr "" -#: datasets_list_formatter.py:102 +#: datasets_list_formatter.py:108 msgid "Unofficial" msgstr "" -#: datasets_list_formatter.py:114 +#: datasets_list_formatter.py:120 msgid "By provider" msgstr "" diff --git a/statgpt/app/utils/formatters/locales/en/LC_MESSAGES/dataset.mo b/statgpt/app/utils/formatters/locales/en/LC_MESSAGES/dataset.mo index 3ead0507..693f271d 100644 Binary files a/statgpt/app/utils/formatters/locales/en/LC_MESSAGES/dataset.mo and b/statgpt/app/utils/formatters/locales/en/LC_MESSAGES/dataset.mo differ diff --git a/statgpt/app/utils/formatters/locales/en/LC_MESSAGES/dataset.po b/statgpt/app/utils/formatters/locales/en/LC_MESSAGES/dataset.po index a6b978a9..6e1a5bb7 100644 --- a/statgpt/app/utils/formatters/locales/en/LC_MESSAGES/dataset.po +++ b/statgpt/app/utils/formatters/locales/en/LC_MESSAGES/dataset.po @@ -6,7 +6,7 @@ msgid "" msgstr "" "Project-Id-Version: statgpt 1.0\n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2025-11-05 17:19+0200\n" +"POT-Creation-Date: 2026-03-06 14:48+0200\n" "PO-Revision-Date: 2024-01-01 00:00+0000\n" "Last-Translator: \n" "Language-Team: English\n" @@ -15,24 +15,28 @@ msgstr "" "Content-Type: text/plain; charset=UTF-8\n" "Content-Transfer-Encoding: 8bit\n" -#: dataset_simple.py:16 +#: dataset_simple.py:18 msgid "[Official]" msgstr "[Official]" # Basic labels -#: dataset_simple.py:28 dataset_query.py:125 dataset_query.py:247 +#: dataset_simple.py:30 dataset_query.py:125 dataset_query.py:247 #: dataset_availablity_query.py:105 msgid "ID" msgstr "ID" -#: dataset_simple.py:32 +#: dataset_simple.py:34 msgid "Internal ID" msgstr "Internal ID" -#: dataset_simple.py:41 dataset_detailed.py:59 dataset_detailed.py:93 +#: dataset_simple.py:43 dataset_detailed.py:59 dataset_detailed.py:93 msgid "Description" msgstr "Description" +#: dataset_simple.py:56 +msgid "Number of indicators" +msgstr "Number of indicators" + #: dataset_detailed.py:21 dataset_detailed.py:31 msgid "Total" msgstr "Total" @@ -63,40 +67,44 @@ msgid "Attributes" msgstr "Attributes" # List formatter strings -#: datasets_list_formatter.py:39 +#: datasets_list_formatter.py:40 msgid "Cannot sort by both id and name." msgstr "Cannot sort by both id and name." -#: datasets_list_formatter.py:62 datasets_list_formatter.py:64 citation.py:28 +#: datasets_list_formatter.py:64 datasets_list_formatter.py:66 citation.py:28 msgid "Provider" msgstr "Provider" -#: datasets_list_formatter.py:64 datasets_list_formatter.py:110 +#: datasets_list_formatter.py:66 datasets_list_formatter.py:116 msgid "Unknown" msgstr "Unknown" -#: datasets_list_formatter.py:67 +#: datasets_list_formatter.py:69 msgid "Total datasets from this provider" msgstr "Total datasets from this provider" -#: datasets_list_formatter.py:78 datasets_list_formatter.py:100 -#: datasets_list_formatter.py:105 +#: datasets_list_formatter.py:80 datasets_list_formatter.py:106 +#: datasets_list_formatter.py:111 msgid "Total datasets" msgstr "Total datasets" -#: datasets_list_formatter.py:82 +#: datasets_list_formatter.py:84 msgid "Total providers" msgstr "Total providers" -#: datasets_list_formatter.py:101 +#: datasets_list_formatter.py:87 +msgid "Total number of indicators" +msgstr "Total number of indicators" + +#: datasets_list_formatter.py:107 msgid "Official" msgstr "Official" -#: datasets_list_formatter.py:102 +#: datasets_list_formatter.py:108 msgid "Unofficial" msgstr "Unofficial" -#: datasets_list_formatter.py:114 +#: datasets_list_formatter.py:120 msgid "By provider" msgstr "By provider" diff --git a/statgpt/app/utils/formatters/locales/uk/LC_MESSAGES/dataset.mo b/statgpt/app/utils/formatters/locales/uk/LC_MESSAGES/dataset.mo index eb626624..f1f49ab4 100644 Binary files a/statgpt/app/utils/formatters/locales/uk/LC_MESSAGES/dataset.mo and b/statgpt/app/utils/formatters/locales/uk/LC_MESSAGES/dataset.mo differ diff --git a/statgpt/app/utils/formatters/locales/uk/LC_MESSAGES/dataset.po b/statgpt/app/utils/formatters/locales/uk/LC_MESSAGES/dataset.po index 5fc0cd9f..df33594a 100644 --- a/statgpt/app/utils/formatters/locales/uk/LC_MESSAGES/dataset.po +++ b/statgpt/app/utils/formatters/locales/uk/LC_MESSAGES/dataset.po @@ -6,7 +6,7 @@ msgid "" msgstr "" "Project-Id-Version: statgpt 1.0\n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2025-11-05 17:19+0200\n" +"POT-Creation-Date: 2026-03-06 14:48+0200\n" "PO-Revision-Date: 2024-01-01 00:00+0000\n" "Last-Translator: \n" "Language-Team: Ukrainian\n" @@ -15,24 +15,28 @@ msgstr "" "Content-Type: text/plain; charset=UTF-8\n" "Content-Transfer-Encoding: 8bit\n" -#: dataset_simple.py:16 +#: dataset_simple.py:18 msgid "[Official]" msgstr "[Офіційний]" # Basic labels -#: dataset_simple.py:28 dataset_query.py:125 dataset_query.py:247 +#: dataset_simple.py:30 dataset_query.py:125 dataset_query.py:247 #: dataset_availablity_query.py:105 msgid "ID" msgstr "Ідентифікатор" -#: dataset_simple.py:32 +#: dataset_simple.py:34 msgid "Internal ID" msgstr "Внутрішній ID" -#: dataset_simple.py:41 dataset_detailed.py:59 dataset_detailed.py:93 +#: dataset_simple.py:43 dataset_detailed.py:59 dataset_detailed.py:93 msgid "Description" msgstr "Опис" +#: dataset_simple.py:56 +msgid "Number of indicators" +msgstr "Кількість індикаторів" + #: dataset_detailed.py:21 dataset_detailed.py:31 msgid "Total" msgstr "Всього" @@ -63,40 +67,44 @@ msgid "Attributes" msgstr "Атрибути" # List formatter strings -#: datasets_list_formatter.py:39 +#: datasets_list_formatter.py:40 msgid "Cannot sort by both id and name." msgstr "Неможливо сортувати одночасно за ID та назвою." -#: datasets_list_formatter.py:62 datasets_list_formatter.py:64 citation.py:28 +#: datasets_list_formatter.py:64 datasets_list_formatter.py:66 citation.py:28 msgid "Provider" msgstr "Постачальник" -#: datasets_list_formatter.py:64 datasets_list_formatter.py:110 +#: datasets_list_formatter.py:66 datasets_list_formatter.py:116 msgid "Unknown" msgstr "Невідомо" -#: datasets_list_formatter.py:67 +#: datasets_list_formatter.py:69 msgid "Total datasets from this provider" msgstr "Всього наборів даних від цього постачальника" -#: datasets_list_formatter.py:78 datasets_list_formatter.py:100 -#: datasets_list_formatter.py:105 +#: datasets_list_formatter.py:80 datasets_list_formatter.py:106 +#: datasets_list_formatter.py:111 msgid "Total datasets" msgstr "Всього наборів даних" -#: datasets_list_formatter.py:82 +#: datasets_list_formatter.py:84 msgid "Total providers" msgstr "Всього постачальників" -#: datasets_list_formatter.py:101 +#: datasets_list_formatter.py:87 +msgid "Total number of indicators" +msgstr "Загальна кількість індикаторів" + +#: datasets_list_formatter.py:107 msgid "Official" msgstr "Офіційні" -#: datasets_list_formatter.py:102 +#: datasets_list_formatter.py:108 msgid "Unofficial" msgstr "Неофіційні" -#: datasets_list_formatter.py:114 +#: datasets_list_formatter.py:120 msgid "By provider" msgstr "За постачальником" diff --git a/statgpt/common/schemas/tool_details.py b/statgpt/common/schemas/tool_details.py index a7090d4b..c6d7761f 100644 --- a/statgpt/common/schemas/tool_details.py +++ b/statgpt/common/schemas/tool_details.py @@ -185,6 +185,10 @@ class AvailableDatasetsDetails(BaseToolDetails): default=AvailableDatasetsVersion.short, description="The version of the available datasets tool", ) + include_indicator_count: bool = Field( + default=False, + description="Whether to include the number of indexed indicators per dataset and total.", + ) class OneShotToolDetails(BaseToolDetails): diff --git a/statgpt/common/vectorstore/base.py b/statgpt/common/vectorstore/base.py index b048f331..5858847e 100644 --- a/statgpt/common/vectorstore/base.py +++ b/statgpt/common/vectorstore/base.py @@ -66,6 +66,10 @@ async def get_total_size(self) -> int: async def get_size(self, version_ids: set[int]) -> int: """Returns the number of documents in the vector store for the specified version IDs.""" + @abstractmethod + async def get_size_per_version(self, version_ids: set[int]) -> dict[int, int]: + """Returns the number of documents per version_id.""" + @abstractmethod async def export_to_folder(self, folder_path: str, version_ids: set[int]) -> None: """Exports the vector store data to the specified folder.""" diff --git a/statgpt/common/vectorstore/pg_vector_store/pg_vector_store.py b/statgpt/common/vectorstore/pg_vector_store/pg_vector_store.py index 45df4669..965e2533 100644 --- a/statgpt/common/vectorstore/pg_vector_store/pg_vector_store.py +++ b/statgpt/common/vectorstore/pg_vector_store/pg_vector_store.py @@ -539,7 +539,6 @@ async def has_duplicates_in_versions(self, version_ids: set[int]) -> tuple[bool, return await self._get_duplicates(session, duplicate_check_query, params=params) async def get_total_size(self) -> int: - """Returns the total number of documents in the vector store.""" metadata_model = await self._get_metadata_model() async with self._lock_session() as session: @@ -555,7 +554,6 @@ async def get_total_size(self) -> int: return size async def get_size(self, version_ids: set[int]) -> int: - """Returns the number of documents in the vector store.""" metadata_model = await self._get_metadata_model() async with self._lock_session() as session: @@ -572,6 +570,25 @@ async def get_size(self, version_ids: set[int]) -> int: size = result.scalar_one() return size + async def get_size_per_version(self, version_ids: set[int]) -> dict[int, int]: + metadata_model = await self._get_metadata_model() + + async with self._lock_session() as session: + if not await self._check_if_table_exists(session, metadata_model.__tablename__): + return {} + + query = ( + select( + metadata_model.version_id, + func.count(func.distinct(metadata_model.document_id)), + ) + .select_from(metadata_model) + .where(metadata_model.version_id.in_(version_ids)) + .group_by(metadata_model.version_id) + ) + result = await session.execute(query) + return dict(result.all()) + async def deduplicate_by_document_content(self) -> None: """Removes and remaps duplicate documents based on `document` field content.