airbytehq
diff --git a/.github/workflows/python_lint.yml b/.github/workflows/python_lint.yml
Lines changed: 4 additions & 4 deletions
diff --git a/airbyte_cdk/cli/airbyte_cdk/_secrets.py b/airbyte_cdk/cli/airbyte_cdk/_secrets.py
Lines changed: 8 additions & 0 deletions
diff --git a/airbyte_cdk/config_observation.py b/airbyte_cdk/config_observation.py
Lines changed: 2 additions & 0 deletions
diff --git a/airbyte_cdk/connector_builder/test_reader/reader.py b/airbyte_cdk/connector_builder/test_reader/reader.py
Lines changed: 1 addition & 0 deletions
diff --git a/airbyte_cdk/destinations/vector_db_based/config.py b/airbyte_cdk/destinations/vector_db_based/config.py
Lines changed: 11 additions & 0 deletions
diff --git a/airbyte_cdk/destinations/vector_db_based/document_processor.py b/airbyte_cdk/destinations/vector_db_based/document_processor.py
Lines changed: 7 additions & 0 deletions
diff --git a/airbyte_cdk/destinations/vector_db_based/writer.py b/airbyte_cdk/destinations/vector_db_based/writer.py
Lines changed: 5 additions & 0 deletions
diff --git a/airbyte_cdk/entrypoint.py b/airbyte_cdk/entrypoint.py
Lines changed: 4 additions & 0 deletions
diff --git a/airbyte_cdk/logger.py b/airbyte_cdk/logger.py
Lines changed: 2 additions & 0 deletions
diff --git a/airbyte_cdk/sources/__init__.py b/airbyte_cdk/sources/__init__.py
Lines changed: 1 addition & 0 deletions
@@ -53,8 +53,8 @@ jobs:
       - name: Check code format
         run: poetry run ruff format --diff .
 
-  mypy-check:
-    name: MyPy Check
+  pyrefly-check:
+    name: Pyrefly Check
     runs-on: ubuntu-24.04
     steps:
       # Common steps:
@@ -74,5 +74,5 @@ jobs:
 
       # Job-specific step(s):
 
-      - name: Run mypy
-        run: poetry run mypy --config-file mypy.ini airbyte_cdk
+      - name: Run pyrefly
+        run: poetry run pyrefly check airbyte_cdk
@@ -261,12 +261,15 @@ def list_(
     table.add_column("Labels", justify="left", style="magenta", overflow="fold")
     table.add_column("Created", justify="left", style="blue", overflow="fold")
     for secret in secrets:
+        # pyrefly: ignore  # missing-attribute
         full_secret_name = secret.name
         secret_name = _extract_secret_name(full_secret_name)
         secret_url = _get_secret_url(secret_name, gcp_project_id)
         table.add_row(
             f"[link={secret_url}]{secret_name}[/link]",
+            # pyrefly: ignore  # missing-attribute
             "\n".join([f"{k}={v}" for k, v in secret.labels.items()]),
+            # pyrefly: ignore  # missing-attribute
             str(secret.create_time),
         )
 
@@ -360,6 +363,7 @@ def _write_secret_file(
     """
     # List all enabled versions of the secret.
     response = client.list_secret_versions(
+        # pyrefly: ignore  # missing-attribute
         request={"parent": secret.name, "filter": "state:ENABLED"}
     )
 
@@ -368,6 +372,7 @@ def _write_secret_file(
     versions = list(response)
 
     if not versions:
+        # pyrefly: ignore  # missing-attribute
         secret_name = _extract_secret_name(secret.name)
         raise ConnectorSecretWithNoValidVersionsError(
             connector_name=connector_name,
@@ -404,7 +409,9 @@ def _get_secret_filepath(
     secret: Secret,  # type: ignore
 ) -> Path:
     """Get the file path for a secret based on its labels."""
+    # pyrefly: ignore  # missing-attribute, unsupported-operation
     if secret.labels and "filename" in secret.labels:
+        # pyrefly: ignore  # index-error
         return secrets_dir / f"{secret.labels['filename']}.json"
 
     return secrets_dir / "config.json"  # Default filename
@@ -468,6 +475,7 @@ def _print_ci_secrets_masks_for_config(
             _print_ci_secrets_masks_for_config(item)
 
     if isinstance(config, dict):
+        # pyrefly: ignore  # bad-assignment
         for key, value in config.items():
             if _is_secret_property(key):
                 logger.debug(f"Masking secret for config key: {key}")
 
@@ -67,6 +67,7 @@ class ConfigObserver:
     """
 
     def set_config(self, config: ObservedDict) -> None:
+        # pyrefly: ignore  # implicitly-defined-attribute
         self.config = config
 
     def update(self) -> None:
@@ -99,6 +100,7 @@ def create_connector_config_control_message(config: MutableMapping[str, Any]) ->
     control_message = AirbyteControlMessage(
         type=OrchestratorType.CONNECTOR_CONFIG,
         emitted_at=time.time() * 1000,
+        # pyrefly: ignore  # bad-argument-type
         connectorConfig=AirbyteControlConnectorConfigMessage(config=config),
     )
     return AirbyteMessage(type=Type.CONTROL, control=control_message)
@@ -249,6 +249,7 @@ def _check_record_limit(self, record_limit: Optional[int] = None) -> int:
         if record_limit is None:
             record_limit = self._max_record_limit
         else:
+            # pyrefly: ignore  # no-matching-overload
             record_limit = min(record_limit, self._max_record_limit)
 
         return record_limit
 
@@ -24,6 +24,7 @@ class SeparatorSplitterConfigModel(BaseModel):
         description="Whether to keep the separator in the resulting chunks",
     )
 
+    # pyrefly: ignore  # bad-override
     class Config(OneOfOptionConfig):
         title = "By Separator"
         description = "Split the text by the list of separators until the chunk size is reached, using the earlier mentioned separators where possible. This is useful for splitting text fields by paragraphs, sentences, words, etc."
@@ -40,6 +41,7 @@ class MarkdownHeaderSplitterConfigModel(BaseModel):
         ge=1,
     )
 
+    # pyrefly: ignore  # bad-override
     class Config(OneOfOptionConfig):
         title = "By Markdown header"
         description = "Split the text by Markdown headers down to the specified header level. If the chunk size fits multiple sections, they will be combined into a single chunk."
@@ -71,6 +73,7 @@ class CodeSplitterConfigModel(BaseModel):
         ],
     )
 
+    # pyrefly: ignore  # bad-override
     class Config(OneOfOptionConfig):
         title = "By Programming Language"
         description = "Split the text by suitable delimiters based on the programming language. This is useful for splitting code into chunks."
@@ -129,6 +132,7 @@ class ProcessingConfigModel(BaseModel):
         description="List of fields to rename. Not applicable for nested fields, but can be used to rename fields already flattened via dot notation.",
     )
 
+    # pyrefly: ignore  # bad-override
     class Config:
         schema_extra = {"group": "processing"}
 
@@ -137,6 +141,7 @@ class OpenAIEmbeddingConfigModel(BaseModel):
     mode: Literal["openai"] = Field("openai", const=True)
     openai_key: str = Field(..., title="OpenAI API key", airbyte_secret=True)
 
+    # pyrefly: ignore  # bad-override
     class Config(OneOfOptionConfig):
         title = "OpenAI"
         description = "Use the OpenAI API to embed text. This option is using the text-embedding-ada-002 model with 1536 embedding dimensions."
@@ -164,6 +169,7 @@ class OpenAICompatibleEmbeddingConfigModel(BaseModel):
         examples=[1536, 384],
     )
 
+    # pyrefly: ignore  # bad-override
     class Config(OneOfOptionConfig):
         title = "OpenAI-compatible"
         description = "Use a service that's compatible with the OpenAI API to embed text."
@@ -191,6 +197,7 @@ class AzureOpenAIEmbeddingConfigModel(BaseModel):
         examples=["your-resource-name"],
     )
 
+    # pyrefly: ignore  # bad-override
     class Config(OneOfOptionConfig):
         title = "Azure OpenAI"
         description = "Use the Azure-hosted OpenAI API to embed text. This option is using the text-embedding-ada-002 model with 1536 embedding dimensions."
@@ -200,6 +207,7 @@ class Config(OneOfOptionConfig):
 class FakeEmbeddingConfigModel(BaseModel):
     mode: Literal["fake"] = Field("fake", const=True)
 
+    # pyrefly: ignore  # bad-override
     class Config(OneOfOptionConfig):
         title = "Fake"
         description = "Use a fake embedding made out of random vectors with 1536 embedding dimensions. This is useful for testing the data pipeline without incurring any costs."
@@ -221,6 +229,7 @@ class FromFieldEmbeddingConfigModel(BaseModel):
         examples=[1536, 384],
     )
 
+    # pyrefly: ignore  # bad-override
     class Config(OneOfOptionConfig):
         title = "From Field"
         description = "Use a field in the record as the embedding. This is useful if you already have an embedding for your data and want to store it in the vector store."
@@ -231,6 +240,7 @@ class CohereEmbeddingConfigModel(BaseModel):
     mode: Literal["cohere"] = Field("cohere", const=True)
     cohere_key: str = Field(..., title="Cohere API key", airbyte_secret=True)
 
+    # pyrefly: ignore  # bad-override
     class Config(OneOfOptionConfig):
         title = "Cohere"
         description = "Use the Cohere API to embed text."
@@ -273,6 +283,7 @@ class VectorDBConfigModel(BaseModel):
         description="Do not store the text that gets embedded along with the vector and the metadata in the destination. If set to true, only the vector and the metadata will be stored - in this case raw text for LLM use cases needs to be retrieved from another source.",
     )
 
+    # pyrefly: ignore  # bad-override
     class Config:
         title = "Destination Config"
         schema_extra = {
 
@@ -71,6 +71,7 @@ class DocumentProcessor:
     @staticmethod
     def check_config(config: ProcessingConfigModel) -> Optional[str]:
         if config.text_splitter is not None and config.text_splitter.mode == "separator":
+            # pyrefly: ignore  # missing-attribute
             for s in config.text_splitter.separators:
                 try:
                     separator = json.loads(s)
@@ -85,21 +86,25 @@ def _get_text_splitter(
         chunk_size: int,
         chunk_overlap: int,
         splitter_config: Optional[TextSplitterConfigModel],
+    # pyrefly: ignore  # bad-return
     ) -> RecursiveCharacterTextSplitter:
         if splitter_config is None:
             splitter_config = SeparatorSplitterConfigModel(mode="separator")
         if splitter_config.mode == "separator":
             return RecursiveCharacterTextSplitter.from_tiktoken_encoder(
                 chunk_size=chunk_size,
                 chunk_overlap=chunk_overlap,
+                # pyrefly: ignore  # missing-attribute
                 separators=[json.loads(s) for s in splitter_config.separators],
+                # pyrefly: ignore  # missing-attribute
                 keep_separator=splitter_config.keep_separator,
                 disallowed_special=(),
             )
         if splitter_config.mode == "markdown":
             return RecursiveCharacterTextSplitter.from_tiktoken_encoder(
                 chunk_size=chunk_size,
                 chunk_overlap=chunk_overlap,
+                # pyrefly: ignore  # missing-attribute
                 separators=headers_to_split_on[: splitter_config.split_level],
                 is_separator_regex=True,
                 keep_separator=True,
@@ -110,6 +115,7 @@ def _get_text_splitter(
                 chunk_size=chunk_size,
                 chunk_overlap=chunk_overlap,
                 separators=RecursiveCharacterTextSplitter.get_separators_for_language(
+                    # pyrefly: ignore  # bad-argument-type, missing-attribute
                     Language(splitter_config.language)
                 ),
                 disallowed_special=(),
@@ -218,6 +224,7 @@ def _remap_field_names(self, fields: Dict[str, Any]) -> Dict[str, Any]:
         new_fields = fields.copy()
         for mapping in self.field_name_mappings:
             if mapping.from_field in new_fields:
+                # pyrefly: ignore  # missing-attribute
                 new_fields[mapping.to_field] = new_fields.pop(mapping.from_field)
 
         return new_fields
@@ -42,8 +42,11 @@ def __init__(
         self._init_batch()
 
     def _init_batch(self) -> None:
+        # pyrefly: ignore  # implicitly-defined-attribute
         self.chunks: Dict[Tuple[str, str], List[Chunk]] = defaultdict(list)
+        # pyrefly: ignore  # implicitly-defined-attribute
         self.ids_to_delete: Dict[Tuple[str, str], List[str]] = defaultdict(list)
+        # pyrefly: ignore  # implicitly-defined-attribute
         self.number_of_chunks = 0
 
     def _convert_to_document(self, chunk: Chunk) -> Document:
@@ -73,6 +76,7 @@ def _process_batch(self) -> None:
     def write(
         self, configured_catalog: ConfiguredAirbyteCatalog, input_messages: Iterable[AirbyteMessage]
     ) -> Iterable[AirbyteMessage]:
+        # pyrefly: ignore  # implicitly-defined-attribute
         self.processor = DocumentProcessor(self.processing_config, configured_catalog)
         self.indexer.pre_sync(configured_catalog)
         for message in input_messages:
@@ -82,6 +86,7 @@ def write(
                 self._process_batch()
                 yield message
             elif message.type == Type.RECORD:
+                # pyrefly: ignore  # bad-argument-type
                 record_chunks, record_id_to_delete = self.processor.process(message.record)
                 self.chunks[
                     (  # type: ignore [index] # expected "tuple[str, str]", got "tuple[str | Any | None, str | Any]"
 
@@ -363,7 +363,9 @@ def extract_config(cls, args: List[str]) -> Optional[Any]:
         return None
 
     def _emit_queued_messages(self, source: Source) -> Iterable[AirbyteMessage]:
+        # pyrefly: ignore  # missing-attribute
         if hasattr(source, "message_repository") and source.message_repository:
+            # pyrefly: ignore  # missing-attribute
             yield from source.message_repository.consume_queue()
         return
 
@@ -373,6 +375,7 @@ def launch(source: Source, args: List[str]) -> None:
     parsed_args = source_entrypoint.parse_args(args)
     # temporarily removes the PrintBuffer because we're seeing weird print behavior for concurrent syncs
     # Refer to: https://github.com/airbytehq/oncall/issues/6235
+    # pyrefly: ignore  # bad-context-manager
     with PRINT_BUFFER:
         for message in source_entrypoint.run(parsed_args):
             # simply printing is creating issues for concurrent CDK as Python uses different two instructions to print: one for the message and
@@ -388,6 +391,7 @@ def _init_internal_request_filter() -> None:
 
     @wraps(wrapped_fn)
     def filtered_send(self: Any, request: PreparedRequest, **kwargs: Any) -> Response:
+        # pyrefly: ignore  # no-matching-overload
         parsed_url = urlparse(request.url)
 
         if parsed_url.scheme not in VALID_URL_SCHEMES:
 
@@ -79,6 +79,7 @@ def format(self, record: logging.LogRecord) -> str:
             message = super().format(record)
             message = filter_secrets(message)
             log_message = AirbyteMessage(
+                # pyrefly: ignore  # bad-argument-type
                 type=Type.LOG, log=AirbyteLogMessage(level=airbyte_level, message=message)
             )
             return orjson.dumps(AirbyteMessageSerializer.dump(log_message)).decode()
@@ -100,6 +101,7 @@ def log_by_prefix(msg: str, default_level: str) -> Tuple[int, str]:
     split_line = msg.split()
     first_word = next(iter(split_line), None)
     if first_word in valid_log_types:
+        # pyrefly: ignore  # no-matching-overload
         log_level = logging.getLevelName(first_word)
         rendered_message = " ".join(split_line[1:])
     else:
 
@@ -17,6 +17,7 @@
 # might not need dpath leading to longer initialization time.
 # There is a downside in using dpath as a library since the options are global: if we have two pieces of code that want different options,
 # this will not be thread-safe.
+# pyrefly: ignore  # bad-assignment
 dpath.options.ALLOW_EMPTY_STRING_KEYS = True
 
 __all__ = [