googleapis
diff --git a/‎bigframes/operations/semantics.py
Lines changed: 124 additions & 6 deletions b/‎bigframes/operations/semantics.py
Lines changed: 124 additions & 6 deletions
@@ -15,6 +15,7 @@
 
 import re
 import typing
+from typing import List
 
 import bigframes
 
@@ -97,7 +98,7 @@ def map(self, instruction: str, output_column: str, model):
             >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001")
 
             >>> df = bpd.DataFrame({"ingredient_1": ["Burger Bun", "Soy Bean"], "ingredient_2": ["Beef Patty", "Bittern"]})
-            >>> df.semantics.map("What is the food made from {ingredient_1} and {ingredient_2}? One word only.", result_column_name="food", model=model)
+            >>> df.semantics.map("What is the food made from {ingredient_1} and {ingredient_2}? One word only.", output_column="food", model=model)
               ingredient_1 ingredient_2      food
             0   Burger Bun   Beef Patty  Burger
             <BLANKLINE>
@@ -148,11 +149,7 @@ def map(self, instruction: str, output_column: str, model):
         return concat([self._df, results.rename(output_column)], axis=1)
 
     def _make_prompt(self, user_instruction: str, output_instruction: str):
-        # Validate column references
-        columns = re.findall(r"(?<!{)\{(?!{)(.*?)\}(?!\})", user_instruction)
-
-        if not columns:
-            raise ValueError("No column references.")
+        columns = _parse_columns(user_instruction)
 
         for column in columns:
             if column not in self._df.columns:
@@ -170,9 +167,130 @@ def _make_prompt(self, user_instruction: str, output_instruction: str):
 
         return prompt_df["prompt"]
 
+    def join(self, other, instruction: str, model, max_rows: int = 1000):
+        """
+        Joines two dataframes by applying the instruction over each pair of rows from
+        the left and right table.
+
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> import bigframes
+            >>> bigframes.options.experiments.semantic_operators = True
+
+            >>> import bigframes.ml.llm as llm
+            >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001")
+
+            >>> cities = bpd.DataFrame({'city': ['Seattle', 'Ottawa', 'Berlin', 'Shanghai', 'New Delhi']})
+            >>> continents = bpd.DataFrame({'continent': ['North America', 'Africa', 'Asia']})
+
+            >>> cities.semantics.join(continents, "{city} is in {continent}", model)
+                    city      continent
+            0    Seattle  North America
+            1     Ottawa  North America
+            2   Shanghai           Asia
+            3  New Delhi           Asia
+            <BLANKLINE>
+            [4 rows x 2 columns]
+
+        Args:
+            other:
+                The other dataframe.
+
+            instruction:
+                An instruction on how left and right rows can be joined. This value must contain
+                column references by name. which should be wrapped in a pair of braces.
+                For example: "The {city} belongs to the {country}".
+                For column names that are shared between two dataframes, you need to add "_left"
+                and "_right" suffix for differentiation. This is especially important when you do
+                self joins. For example: "The {employee_name_left} reports to {employee_name_right}"
+                You must not add "_left" or "_right" suffix to non-overlapping columns.
+
+            model:
+                A GeminiTextGenerator provided by Bigframes ML package.
+
+            max_rows:
+                The maximum number of rows allowed to be sent to the model per call. If the result is too large, the method
+                call will end early with an error.
+
+        Returns:
+            The joined dataframe.
+
+        Raises:
+            ValueError if the amount of data that will be sent for LLM processing is larger than max_rows.
+        """
+        _validate_model(model)
+
+        joined_table_rows = len(self._df) * len(other)
+
+        if joined_table_rows > max_rows:
+            raise ValueError(
+                f"Number of rows that need processing is {joined_table_rows}, which exceeds row limit {max_rows}."
+            )
+
+        columns = _parse_columns(instruction)
+
+        left_columns = []
+        right_columns = []
+
+        for col in columns:
+            if col in self._df.columns and col in other.columns:
+                raise ValueError(f"Ambiguous column reference: {col}")
+
+            elif col in self._df.columns:
+                left_columns.append(col)
+
+            elif col in other.columns:
+                right_columns.append(col)
+
+            elif col.endswith("_left"):
+                original_col_name = col[: -len("_left")]
+                if (
+                    original_col_name in self._df.columns
+                    and original_col_name in other.columns
+                ):
+                    left_columns.append(col)
+                elif original_col_name in self._df.columns:
+                    raise ValueError(f"Unnecessary suffix for {col}")
+                else:
+                    raise ValueError(f"Column {col} not found")
+
+            elif col.endswith("_right"):
+                original_col_name = col[: -len("_right")]
+                if (
+                    original_col_name in self._df.columns
+                    and original_col_name in other.columns
+                ):
+                    right_columns.append(col)
+                elif original_col_name in other.columns:
+                    raise ValueError(f"Unnecessary suffix for {col}")
+                else:
+                    raise ValueError(f"Column {col} not found")
+
+            else:
+                raise ValueError(f"Column {col} not found")
+
+        if not left_columns or not right_columns:
+            raise ValueError()
+
+        joined_df = self._df.merge(other, how="cross", suffixes=("_left", "_right"))
+
+        return joined_df.semantics.filter(instruction, model).reset_index(drop=True)
+
 
 def _validate_model(model):
     from bigframes.ml.llm import GeminiTextGenerator
 
     if not isinstance(model, GeminiTextGenerator):
         raise ValueError("Model is not GeminiText Generator")
+
+
+def _parse_columns(instruction: str) -> List[str]:
+    columns = re.findall(r"(?<!{)\{(?!{)(.*?)\}(?!\})", instruction)
+
+    if not columns:
+        raise ValueError("No column references")
+
+    return columns