|
19 | 19 | from typing import Dict, Iterable, List, Optional, Sequence, Union
|
20 | 20 | import warnings
|
21 | 21 |
|
22 |
| -import numpy as np |
23 |
| - |
24 | 22 | from bigframes import dtypes, exceptions, options
|
25 | 23 | from bigframes.core import guid, log_adapter
|
26 | 24 |
|
@@ -586,207 +584,6 @@ def search(
|
586 | 584 |
|
587 | 585 | return typing.cast(bigframes.dataframe.DataFrame, search_result)
|
588 | 586 |
|
589 |
| - def top_k( |
590 |
| - self, |
591 |
| - instruction: str, |
592 |
| - model, |
593 |
| - k: int = 10, |
594 |
| - ground_with_google_search: bool = False, |
595 |
| - ): |
596 |
| - """ |
597 |
| - Ranks each tuple and returns the k best according to the instruction. |
598 |
| -
|
599 |
| - This method employs a quick select algorithm to efficiently compare the pivot |
600 |
| - with all other items. By leveraging an LLM (Large Language Model), it then |
601 |
| - identifies the top 'k' best answers from these comparisons. |
602 |
| -
|
603 |
| - **Examples:** |
604 |
| -
|
605 |
| - >>> import bigframes.pandas as bpd |
606 |
| - >>> bpd.options.display.progress_bar = None |
607 |
| - >>> bpd.options.experiments.ai_operators = True |
608 |
| - >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 |
609 |
| -
|
610 |
| - >>> import bigframes.ml.llm as llm |
611 |
| - >>> model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001") |
612 |
| -
|
613 |
| - >>> df = bpd.DataFrame( |
614 |
| - ... { |
615 |
| - ... "Animals": ["Dog", "Bird", "Cat", "Horse"], |
616 |
| - ... "Sounds": ["Woof", "Chirp", "Meow", "Neigh"], |
617 |
| - ... }) |
618 |
| - >>> df.ai.top_k("{Animals} are more popular as pets", model=model, k=2) |
619 |
| - Animals Sounds |
620 |
| - 0 Dog Woof |
621 |
| - 2 Cat Meow |
622 |
| - <BLANKLINE> |
623 |
| - [2 rows x 2 columns] |
624 |
| -
|
625 |
| - Args: |
626 |
| - instruction (str): |
627 |
| - An instruction on how to map the data. This value must contain |
628 |
| - column references by name enclosed in braces. |
629 |
| - For example, to reference a column named "Animals", use "{Animals}" in the |
630 |
| - instruction, like: "{Animals} are more popular as pets" |
631 |
| -
|
632 |
| - model (bigframes.ml.llm.GeminiTextGenerator): |
633 |
| - A GeminiTextGenerator provided by the Bigframes ML package. |
634 |
| -
|
635 |
| - k (int, default 10): |
636 |
| - The number of rows to return. |
637 |
| -
|
638 |
| - ground_with_google_search (bool, default False): |
639 |
| - Enables Grounding with Google Search for the GeminiTextGenerator model. |
640 |
| - When set to True, the model incorporates relevant information from Google |
641 |
| - Search results into its responses, enhancing their accuracy and factualness. |
642 |
| - Note: Using this feature may impact billing costs. Refer to the pricing |
643 |
| - page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models |
644 |
| - The default is `False`. |
645 |
| -
|
646 |
| - Returns: |
647 |
| - bigframes.dataframe.DataFrame: A new DataFrame with the top k rows. |
648 |
| -
|
649 |
| - Raises: |
650 |
| - NotImplementedError: when the AI operator experiment is off. |
651 |
| - ValueError: when the instruction refers to a non-existing column, or when no |
652 |
| - columns are referred to. |
653 |
| - """ |
654 |
| - if not options.experiments.ai_operators: |
655 |
| - raise NotImplementedError() |
656 |
| - |
657 |
| - import bigframes.dataframe |
658 |
| - import bigframes.series |
659 |
| - |
660 |
| - self._validate_model(model) |
661 |
| - columns = self._parse_columns(instruction) |
662 |
| - for column in columns: |
663 |
| - if column not in self._df.columns: |
664 |
| - raise ValueError(f"Column {column} not found.") |
665 |
| - if len(columns) > 1: |
666 |
| - raise NotImplementedError("AI top K are limited to a single column.") |
667 |
| - |
668 |
| - if ground_with_google_search: |
669 |
| - msg = exceptions.format_message( |
670 |
| - "Enables Grounding with Google Search may impact billing cost. See pricing " |
671 |
| - "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models" |
672 |
| - ) |
673 |
| - warnings.warn(msg, category=UserWarning) |
674 |
| - |
675 |
| - work_estimate = int(len(self._df) * (len(self._df) - 1) / 2) |
676 |
| - self._confirm_operation(work_estimate) |
677 |
| - |
678 |
| - df: bigframes.dataframe.DataFrame = self._df[columns].copy() |
679 |
| - column = columns[0] |
680 |
| - if df[column].dtype != dtypes.STRING_DTYPE: |
681 |
| - df[column] = df[column].astype(dtypes.STRING_DTYPE) |
682 |
| - |
683 |
| - # `index` is reserved for the `reset_index` below. |
684 |
| - if column == "index": |
685 |
| - raise ValueError( |
686 |
| - "Column name 'index' is reserved. Please choose a different name." |
687 |
| - ) |
688 |
| - |
689 |
| - if k < 1: |
690 |
| - raise ValueError("k must be an integer greater than or equal to 1.") |
691 |
| - |
692 |
| - user_instruction = self._format_instruction(instruction, columns) |
693 |
| - |
694 |
| - n = df.shape[0] |
695 |
| - if k >= n: |
696 |
| - return df |
697 |
| - |
698 |
| - # Create a unique index and duplicate it as the "index" column. This workaround |
699 |
| - # is needed for the select search algorithm due to unimplemented bigFrame methods. |
700 |
| - df = df.reset_index().rename(columns={"index": "old_index"}).reset_index() |
701 |
| - |
702 |
| - # Initialize a status column to track the selection status of each item. |
703 |
| - # - None: Unknown/not yet processed |
704 |
| - # - 1.0: Selected as part of the top-k items |
705 |
| - # - -1.0: Excluded from the top-k items |
706 |
| - status_column = guid.generate_guid("status") |
707 |
| - df[status_column] = bigframes.series.Series( |
708 |
| - None, dtype=dtypes.FLOAT_DTYPE, session=df._session |
709 |
| - ) |
710 |
| - |
711 |
| - num_selected = 0 |
712 |
| - while num_selected < k: |
713 |
| - df, num_new_selected = self._topk_partition( |
714 |
| - df, |
715 |
| - column, |
716 |
| - status_column, |
717 |
| - user_instruction, |
718 |
| - model, |
719 |
| - k - num_selected, |
720 |
| - ground_with_google_search, |
721 |
| - ) |
722 |
| - num_selected += num_new_selected |
723 |
| - |
724 |
| - result_df: bigframes.dataframe.DataFrame = self._df.copy() |
725 |
| - return result_df[df.set_index("old_index")[status_column] > 0.0] |
726 |
| - |
727 |
    @staticmethod
    def _topk_partition(
        df,
        column: str,
        status_column: str,
        user_instruction: str,
        model,
        k: int,
        ground_with_google_search: bool,
    ):
        """One quickselect partition pass for ``top_k``.

        Picks a random pivot row among the not-yet-classified rows, asks the
        LLM to compare the pivot against every other pending row, and records
        the outcome in ``status_column`` (1.0 = selected, -1.0 = excluded,
        None = still pending).

        Args:
            df: Working DataFrame produced by ``top_k``; must carry an
                ``"index"`` column (the duplicated unique index) and
                ``status_column``.
            column (str): Name of the single instruction column being ranked.
            status_column (str): Name of the selection-status column.
            user_instruction (str): The formatted ranking question.
            model: A GeminiTextGenerator used for the pairwise comparisons.
            k (int): Number of rows still to be selected in this pass.
            ground_with_google_search (bool): Forwarded to ``model.predict``.

        Returns:
            A tuple ``(df, num_newly_selected)`` where ``df`` has updated
            statuses and ``num_newly_selected`` is how many rows were marked
            selected by this pass (0 when the pivot partition overshoots k).
        """
        output_instruction = (
            "Given a question and two documents, choose the document that best answers "
            "the question. Respond with 'Document 1' or 'Document 2'. You must choose "
            "one, even if neither is ideal. "
        )

        # Random pivot selection for improved average quickselect performance.
        pending_df = df[df[status_column].isna()]
        pivot_iloc = np.random.randint(0, pending_df.shape[0])
        pivot_index = pending_df.iloc[pivot_iloc]["index"]
        pivot_df = pending_df[pending_df["index"] == pivot_index]

        # Build a prompt to compare the pivot item's relevance to other pending items.
        # Series + string concatenation broadcasts the shared prefix over every
        # pending row, yielding one prompt per comparison.
        prompt_s = pending_df[pending_df["index"] != pivot_index][column]
        prompt_s = (
            f"{output_instruction}\n\nQuestion: {user_instruction}\n"
            + f"\nDocument 1: {column} "
            + pivot_df.iloc[0][column]
            + f"\nDocument 2: {column} "
            + prompt_s  # type:ignore
        )

        import bigframes.dataframe

        predict_df = typing.cast(
            bigframes.dataframe.DataFrame,
            model.predict(
                prompt_s,
                temperature=0.0,
                ground_with_google_search=ground_with_google_search,
            ),
        )

        # A reply containing "2" means "Document 2" (the non-pivot row) beat
        # the pivot, i.e. that row ranks higher than the pivot.
        marks = predict_df["ml_generate_text_llm_result"].str.contains("2")
        more_relavant: bigframes.dataframe.DataFrame = df[marks]
        less_relavent: bigframes.dataframe.DataFrame = df[~marks]

        num_more_relavant = more_relavant.shape[0]
        if k < num_more_relavant:
            # Too many rows beat the pivot: everything at or below the pivot is
            # definitively out, but none of the winners can be confirmed yet —
            # the next pass recurses into the winners only.
            less_relavent[status_column] = -1.0
            pivot_df[status_column] = -1.0
            # combine_first fills only the previously-NA statuses, preserving
            # decisions made by earlier passes.
            df = df.combine_first(less_relavent).combine_first(pivot_df)
            return df, 0
        else:  # k >= num_more_relavant
            # Every row that beat the pivot fits within the remaining budget.
            more_relavant[status_column] = 1.0
            df = df.combine_first(more_relavant)
            if k >= num_more_relavant + 1:
                # Budget also covers the pivot itself.
                pivot_df[status_column] = 1.0
                df = df.combine_first(pivot_df)
                return df, num_more_relavant + 1
            else:
                # Budget is exactly exhausted by the winners; pivot stays
                # pending for the next pass.
                return df, num_more_relavant
| - |
790 | 587 | def sim_join(
|
791 | 588 | self,
|
792 | 589 | other,
|
|
0 commit comments