Commit 23c8197
Divide retrieval module to 'hybrid', 'semantic' and 'lexical' (#1135)

* add run_util.py so code can be reproduced easily
* add a new semantic_retrieval node and complete its feature (semantic retrieval now returns columns with a `_semantic` suffix, e.g. "retrieved_contents_semantic")
* add lexical retrieval
* refactor hybrid cc and hybrid rrf into the hybridretrieval node
* test non-overlap
* working hybrid cc
* working hybrid rrf
* cast retrieved_contents and related info
* finally test_evaluator simple.yaml passed!
* change test yaml files to the modified version
* change sample config yaml files to the modified version
* update documentation to the latest version
* delete 'retrieval' from support.py
* fix query expansion with new retrieval
* fix several errors
* recreate retrieval/base.py and run_util.py
* change path for evaluate_retrieval_node
* replace resources/result_project with the latest version
* re-add test_retrieval_base.py to the test files
* do not track tests/resources/result_project/resources/chroma
* re-add test_hybrid_base.py
* fix test code
* remove 'frequent error' test code in g_eval
* fix test code and errors
* re-add pseudo_project_dir
* fix hybrid retrieval errors
* fix test_restart_evaluate_leads_start_evaluate code
* re-fix test_restart_evaluate_leads_start_evaluate
* track LFS file "resources/chroma.sqlite3"
* skip test_restart_evaluate_leads_start_evaluate in GitHub Actions
* resolve import issue on GitHub
* edit README
* delete resources/chroma again
1 parent b97f680, commit 23c8197

File tree

204 files changed: +4274 / -3310 lines

README.md

Lines changed: 21 additions & 28 deletions

````diff
@@ -4,11 +4,10 @@ RAG AutoML tool for automatically finding an optimal RAG pipeline for your data.
 
 ![Thumbnail](https://github.com/user-attachments/assets/6bab243d-a4b3-431a-8ac0-fe17336ab4de)
 
-![Discord](https://img.shields.io/discord/1204010535272587264) ![PyPI - Downloads](https://img.shields.io/pypi/dm/AutoRAG)
+![PyPI - Downloads](https://img.shields.io/pypi/dm/AutoRAG)
 [![LinkedIn](https://img.shields.io/badge/LinkedIn-Connect-blue?style=flat-square&logo=linkedin)](https://www.linkedin.com/company/104375108/admin/dashboard/)
 ![X (formerly Twitter) Follow](https://img.shields.io/twitter/follow/AutoRAG_HQ)
 [![Hugging Face](https://img.shields.io/badge/Hugging%20Face-Follow-orange?style=flat-square&logo=huggingface)](https://huggingface.co/AutoRAG)
-[![Static Badge](https://img.shields.io/badge/Roadmap-5D3FD3)](https://github.com/orgs/Auto-RAG/projects/1/views/2)
 
 <img src=https://github.com/user-attachments/assets/9a4d0381-a161-457f-a787-e7eb3593ce00 width="251.5" height="55.2"/>
 
@@ -26,26 +25,10 @@ Try now and find the best RAG pipeline for your own use-case.
 
 Explore our 📖 [Document](https://marker-inc-korea.github.io/AutoRAG/)!!
 
----
-
-## AutoRAG GUI (beta)
-
-AutoRAG GUI is a web-based GUI for AutoRAG.
-If AutoRAG is a little bit complicated to you, try AutoRAG GUI.
-
-Your Optimized RAG pipeline is just a few clicks away.
-
-| Project Management | Easy Configuration | Parsed Page View |
-|:---:|:---:|:---:|
-| ![Image](https://github.com/user-attachments/assets/87289d84-ff65-4810-bc41-3f30b36b7ddf) | ![Image](https://github.com/user-attachments/assets/dbe0a49b-ebf2-4c9c-b17d-1be1c2cd1060) | ![Image](https://github.com/user-attachments/assets/d8a50512-3299-4b68-b48e-e2f49d688f01) |
-
-Click the docs to use the AutoRAG GUI beta version! [AutoRAG GUI Docs](https://marker-inc-korea.github.io/AutoRAG/gui/gui.html).
-
-### GUI Installation
-
-1. Clone the repository
-2. Run Docker Compose `docker compose up -d`
-3. Access the GUI at `http://localhost:3000`
+```
+Notice: We are no longer support "AutoRAG GUI"
+And we will focus to maintain only AutoRAG core library in the future. Thank you.
+```
 
 ---
 
@@ -293,23 +276,33 @@ We highly recommend using pre-made config YAML files for starter.
 - [Sample YAML Guide](https://marker-inc-korea.github.io/AutoRAG/optimization/sample_config.html)
 - [Make Custom YAML Guide](https://marker-inc-korea.github.io/AutoRAG/optimization/custom_config.html)
 
-Here is an example of the config YAML file to use `retrieval`, `prompt_maker`, and `generator` nodes.
+Here is an example of the config YAML file to use three retrieval nodes, `prompt_maker`, and `generator` nodes.
 
 ```yaml
 node_lines:
-- node_line_name: retrieve_node_line # Set Node Line (Arbitrary Name)
+- node_line_name: retrieve_node_line
   nodes:
-    - node_type: retrieval # Set Retrieval Node
+    - node_type: lexical_retrieval
+      strategy:
+        metrics: [ retrieval_f1, retrieval_recall, retrieval_ndcg, retrieval_mrr ]
+      top_k: 3
+      modules:
+        - module_type: bm25
+    - node_type: semantic_retrieval
       strategy:
-        metrics: [ retrieval_f1, retrieval_recall, retrieval_ndcg, retrieval_mrr ] # Set Retrieval Metrics
+        metrics: [ retrieval_f1, retrieval_recall, retrieval_ndcg, retrieval_mrr ]
       top_k: 3
      modules:
        - module_type: vectordb
          vectordb: default
-        - module_type: bm25
+    - node_type: hybrid_retrieval
+      strategy:
+        metrics: [ retrieval_f1, retrieval_recall, retrieval_ndcg, retrieval_mrr ]
+      top_k: 3
+      modules:
        - module_type: hybrid_rrf
          weight_range: (4,80)
-- node_line_name: post_retrieve_node_line # Set Node Line (Arbitrary Name)
+- node_line_name: post_retrieve_node_line
   nodes:
    - node_type: prompt_maker # Set Prompt Maker Node
      strategy:
````

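The `hybrid_rrf` module in the YAML above fuses the lexical and semantic rankings. As a rough illustration of reciprocal rank fusion (not AutoRAG's implementation; the function name and the smoothing constant `k=60` are assumptions for this sketch), each document is scored by the sum of reciprocal ranks across the two rankings:

```python
def rrf_fuse(semantic_ids, lexical_ids, k=60, top_k=3):
    """Toy reciprocal rank fusion: score(doc) = sum over rankings of 1 / (k + rank)."""
    scores = {}
    for ranking in (semantic_ids, lexical_ids):
        for rank, doc_id in enumerate(ranking, start=1):
            scores[doc_id] = scores.get(doc_id, 0.0) + 1.0 / (k + rank)
    # Highest fused score first, truncated to top_k.
    fused = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
    return [doc_id for doc_id, _ in fused[:top_k]]

# "a" is ranked first by both retrievers and "c" appears in both lists,
# so they beat documents found by only one retriever.
fused = rrf_fuse(["a", "b", "c"], ["a", "c", "d"], top_k=2)
```

A document retrieved by both BM25 and the vector index accumulates two reciprocal-rank contributions, which is why hybrid retrieval can outperform either module alone.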
autorag/autorag/data/legacy/qacreation/base.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -8,7 +8,7 @@
 from tqdm import tqdm
 
 import autorag
-from autorag.nodes.retrieval.vectordb import vectordb_ingest_api, vectordb_pure
+from autorag.nodes.semanticretrieval.vectordb import vectordb_ingest_api, vectordb_pure
 from autorag.utils.util import (
     save_parquet_safe,
     fetch_contents,
```

autorag/autorag/deploy/api.py

Lines changed: 12 additions & 2 deletions

```diff
@@ -248,9 +248,19 @@ def run_api_server(
         self.app.run(host=host, port=port, **kwargs)
 
     def extract_retrieve_passage(self, df: pd.DataFrame) -> List[RetrievedPassage]:
-        retrieved_ids: List[str] = df["retrieved_ids"].tolist()[0]
+        if "retrieved_ids" not in df.columns and "retrieved_ids_semantic" in df.columns:
+            retrieved_ids: List[str] = df["retrieved_ids_semantic"].tolist()[0]
+            scores = df["retrieve_scores_semantic"].tolist()[0]
+        elif (
+            "retrieved_ids" not in df.columns
+            and "retrieved_ids_semantic" not in df.columns
+        ):
+            retrieved_ids: List[str] = df["retrieved_ids_lexical"].tolist()[0]
+            scores = df["retrieve_scores_lexical"].tolist()[0]
+        else:
+            retrieved_ids: List[str] = df["retrieved_ids"].tolist()[0]
+            scores = df["retrieve_scores"].tolist()[0]
         contents = fetch_contents(self.corpus_df, [retrieved_ids])[0]
-        scores = df["retrieve_scores"].tolist()[0]
         if "path" in self.corpus_df.columns:
             paths = fetch_contents(self.corpus_df, [retrieved_ids], column_name="path")[
                 0
```

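The change above keeps the old plain `retrieved_ids`/`retrieve_scores` columns working while falling back to the new suffixed columns when only one retrieval node ran. The same preference order (plain, then `_semantic`, then `_lexical`) can be sketched as a standalone helper; the helper name is hypothetical and not part of AutoRAG:

```python
import pandas as pd

def pick_retrieval_columns(df: pd.DataFrame):
    """Return (ids, scores) for the first row, preferring the plain columns,
    then the _semantic ones, then the _lexical ones."""
    for suffix in ("", "_semantic", "_lexical"):
        ids_col = f"retrieved_ids{suffix}"
        scores_col = f"retrieve_scores{suffix}"
        if ids_col in df.columns:
            return df[ids_col].tolist()[0], df[scores_col].tolist()[0]
    raise KeyError("no retrieved_ids* column found")

# A result frame produced by a lexical-only pipeline: only suffixed columns exist.
df = pd.DataFrame({
    "retrieved_ids_lexical": [["doc1", "doc2"]],
    "retrieve_scores_lexical": [[1.2, 0.7]],
})
ids, scores = pick_retrieval_columns(df)
```

Centralizing the fallback this way keeps API consumers unaware of which retrieval node produced the final result.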
autorag/autorag/evaluator.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -12,8 +12,8 @@
 
 from autorag.node_line import run_node_line
 from autorag.nodes.retrieval.base import get_bm25_pkl_name
-from autorag.nodes.retrieval.bm25 import bm25_ingest
-from autorag.nodes.retrieval.vectordb import (
+from autorag.nodes.lexicalretrieval.bm25 import bm25_ingest
+from autorag.nodes.semanticretrieval.vectordb import (
     vectordb_ingest_api,
     filter_exist_ids,
     filter_exist_ids_from_retrieval_gt,
```
New file (name not shown)

Lines changed: 2 additions & 0 deletions

```diff
@@ -0,0 +1,2 @@
+from .hybrid_cc import HybridCC
+from .hybrid_rrf import HybridRRF
```
New file (name not shown)

Lines changed: 58 additions & 0 deletions

```diff
@@ -0,0 +1,58 @@
+import abc
+
+import pandas as pd
+
+from autorag.nodes.retrieval.base import BaseRetrieval
+from autorag.utils import result_to_dataframe
+from autorag.utils.util import pop_params, fetch_contents
+
+
+class HybridRetrieval(BaseRetrieval, metaclass=abc.ABCMeta):
+    def __init__(self, project_dir: str, *args, **kwargs):
+        super().__init__(project_dir)
+
+    @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
+    def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
+        previous_info = self.cast_to_run(previous_result, *args, **kwargs)
+        _pure_params = pop_params(self._pure, kwargs)
+        ids, scores = self._pure(previous_info, **_pure_params)
+        contents = fetch_contents(self.corpus_df, ids)
+        return contents, ids, scores
+
+    def cast_to_run(self, previous_result: pd.DataFrame, *args, **kwargs):
+        return hybrid_cast(previous_result)
+
+    @classmethod
+    def cast_to_run_class(cls, previous_result: pd.DataFrame):
+        return hybrid_cast(previous_result)
+
+
+def hybrid_cast(
+    previous_result: pd.DataFrame,
+):
+    assert "query" in previous_result.columns, "previous_result must have query column."
+    queries = previous_result["query"].tolist()
+
+    assert "retrieved_contents_semantic" in previous_result.columns
+    assert "retrieved_contents_lexical" in previous_result.columns
+    assert "retrieve_scores_semantic" in previous_result.columns
+    assert "retrieve_scores_lexical" in previous_result.columns
+    assert "retrieved_ids_semantic" in previous_result.columns
+    assert "retrieved_ids_lexical" in previous_result.columns
+
+    contents_semantic = previous_result["retrieved_contents_semantic"].tolist()
+    contents_lexical = previous_result["retrieved_contents_lexical"].tolist()
+    scores_semantic = previous_result["retrieve_scores_semantic"].tolist()
+    scores_lexical = previous_result["retrieve_scores_lexical"].tolist()
+    ids_semantic = previous_result["retrieved_ids_semantic"].tolist()
+    ids_lexical = previous_result["retrieved_ids_lexical"].tolist()
+
+    return {
+        "queries": queries,
+        "retrieved_contents_semantic": contents_semantic,
+        "retrieved_contents_lexical": contents_lexical,
+        "retrieve_scores_semantic": scores_semantic,
+        "retrieve_scores_lexical": scores_lexical,
+        "retrieved_ids_semantic": ids_semantic,
+        "retrieved_ids_lexical": ids_lexical,
+    }
```

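`hybrid_cast` is essentially a reshape: one DataFrame row per query becomes a dict of parallel lists keyed by the suffixed column names, with `query` renamed to `queries`. A toy equivalent of that reshaping (standalone sketch using only pandas, not importing AutoRAG):

```python
import pandas as pd

# A one-row previous_result shaped like the combined output of the
# semantic and lexical retrieval nodes (list-valued cells per query).
previous_result = pd.DataFrame({
    "query": ["what is RAG?"],
    "retrieved_contents_semantic": [["passage A"]],
    "retrieved_contents_lexical": [["passage B"]],
    "retrieve_scores_semantic": [[0.9]],
    "retrieve_scores_lexical": [[3.1]],
    "retrieved_ids_semantic": [["id-a"]],
    "retrieved_ids_lexical": [["id-b"]],
})

# The same reshaping hybrid_cast performs: DataFrame columns -> dict of lists,
# renaming "query" to "queries".
info = {col: previous_result[col].tolist() for col in previous_result.columns}
info["queries"] = info.pop("query")
```

The resulting `info` dict is what `HybridCC._pure` (below) indexes with keys like `retrieved_ids_semantic`.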
autorag/autorag/nodes/retrieval/hybrid_cc.py renamed to autorag/autorag/nodes/hybridretrieval/hybrid_cc.py

Lines changed: 46 additions & 33 deletions

```diff
@@ -1,12 +1,12 @@
-import os
 from pathlib import Path
 from typing import Tuple, List, Union
 
 import numpy as np
 import pandas as pd
 
-from autorag.nodes.retrieval.base import HybridRetrieval
-from autorag.utils.util import pop_params, fetch_contents, result_to_dataframe
+from autorag.nodes.hybridretrieval.base import HybridRetrieval
+from autorag.nodes.hybridretrieval.run import evaluate_retrieval_node
+from autorag.strategy import select_best
 
 
 def normalize_mm(scores: List[str], fixed_min_value: float = 0):
@@ -53,17 +53,16 @@ def normalize_dbsf(scores: List[str], fixed_min_value: float = 0):
 class HybridCC(HybridRetrieval):
     def _pure(
         self,
-        ids: Tuple,
-        scores: Tuple,
+        info: dict,
         top_k: int,
         weight: float,
         normalize_method: str = "mm",
         semantic_theoretical_min_value: float = -1.0,
         lexical_theoretical_min_value: float = 0.0,
     ):
         return hybrid_cc(
-            ids,
-            scores,
+            (info["retrieved_ids_semantic"], info["retrieved_ids_lexical"]),
+            (info["retrieve_scores_semantic"], info["retrieve_scores_lexical"]),
             top_k,
             weight,
             normalize_method,
@@ -79,34 +78,48 @@ def run_evaluator(
         *args,
         **kwargs,
     ):
-        if "ids" in kwargs and "scores" in kwargs:
-            data_dir = os.path.join(project_dir, "data")
-            corpus_df = pd.read_parquet(
-                os.path.join(data_dir, "corpus.parquet"), engine="pyarrow"
+        assert "strategy" in kwargs, "You must specify the strategy to use."
+        assert (
+            "input_metrics" in kwargs
+        ), "You must specify the input metrics to use, which is list of MetricInput."
+        strategies = kwargs.pop("strategy")
+        input_metrics = kwargs.pop("input_metrics")
+        weight_range = kwargs.pop("weight_range", (0.0, 1.0))
+        test_weight_size = kwargs.pop("test_weight_size", 101)
+        weight_candidates = np.linspace(
+            weight_range[0], weight_range[1], test_weight_size
+        ).tolist()
+
+        result_list = []
+        instance = cls(project_dir, *args, **kwargs)
+        for weight_value in weight_candidates:
+            result_df = instance.pure(previous_result, weight=weight_value, **kwargs)
+            result_list.append(result_df)
+
+        if strategies.get("metrics") is None:
+            raise ValueError("You must at least one metrics for retrieval evaluation.")
+        result_list = list(
+            map(
+                lambda x: evaluate_retrieval_node(
+                    x,
+                    input_metrics,
+                    strategies.get("metrics"),
+                ),
+                result_list,
             )
+        )
 
-            params = pop_params(hybrid_cc, kwargs)
-            assert (
-                "ids" in params and "scores" in params and "top_k" in params
-            ), "ids, scores, and top_k must be specified."
-
-            @result_to_dataframe(
-                ["retrieved_contents", "retrieved_ids", "retrieve_scores"]
-            )
-            def __cc(**cc_params):
-                ids, scores = hybrid_cc(**cc_params)
-                contents = fetch_contents(corpus_df, ids)
-                return contents, ids, scores
-
-            return __cc(**params)
-        else:
-            assert (
-                "target_modules" in kwargs and "target_module_params" in kwargs
-            ), "target_modules and target_module_params must be specified if there is not ids and scores."
-            instance = cls(project_dir, *args, **kwargs)
-            result = instance.pure(previous_result, *args, **kwargs)
-            del instance
-            return result
+        # select best result
+        best_result_df, best_weight = select_best(
+            result_list,
+            strategies.get("metrics"),
+            metadatas=weight_candidates,
+            strategy_name=strategies.get("strategy", "normalize_mean"),
+        )
+        return {
+            "best_result": best_result_df,
+            "best_weight": best_weight,
+        }
 
 
 def hybrid_cc(
```

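The new `run_evaluator` sweeps weight candidates and keeps the best convex combination of the two rankings. The fusion step itself, for a single query, can be sketched as follows (min-max normalization only; the names and defaults here are illustrative and do not match the library's `hybrid_cc` signature):

```python
def cc_fuse(ids, scores, weight, top_k):
    """Convex combination of two rankings for one query.
    ids and scores are (semantic, lexical) pairs of parallel lists."""
    def normalize_mm(s):
        # Min-max normalize to [0, 1]; constant lists collapse to 0.
        lo, hi = min(s), max(s)
        return [(x - lo) / (hi - lo) if hi > lo else 0.0 for x in s]

    semantic = dict(zip(ids[0], normalize_mm(scores[0])))
    lexical = dict(zip(ids[1], normalize_mm(scores[1])))
    # weight blends the two normalized score spaces; missing docs score 0.
    fused = {
        doc: weight * semantic.get(doc, 0.0) + (1 - weight) * lexical.get(doc, 0.0)
        for doc in {*semantic, *lexical}
    }
    ranked = sorted(fused.items(), key=lambda kv: kv[1], reverse=True)[:top_k]
    return [d for d, _ in ranked], [s for _, s in ranked]

# weight=1.0 trusts only the semantic ranking; weight=0.0 only the lexical one.
fused_ids, fused_scores = cc_fuse(
    (["a", "b"], ["b", "c"]), ([0.9, 0.3], [7.0, 1.0]), weight=0.6, top_k=2
)
```

The weight sweep in `run_evaluator` amounts to calling a fusion like this once per candidate weight, scoring each fused result with the retrieval metrics, and keeping the weight that scores best.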