Skip to content

Commit 1d9fb17

Browse files
feat: replace mypy with pyrefly type checker for testing
- Replace mypy dependency with pyrefly ^0.26.0
- Update CI workflow from mypy-check to pyrefly-check
- Update type-check Poe task to use pyrefly
- Add pyrefly configuration with python_version and project_excludes
- Use pyrefly --suppress-errors to auto-add `# pyrefly: ignore` comments for legacy code
- Test integration shows 0 errors with 530 ignored violations

Co-Authored-By: AJ Steers <[email protected]>
1 parent a59d25f commit 1d9fb17

File tree

87 files changed

+515
-182
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

87 files changed

+515
-182
lines changed

.github/workflows/python_lint.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -53,8 +53,8 @@ jobs:
5353
- name: Check code format
5454
run: poetry run ruff format --diff .
5555

56-
mypy-check:
57-
name: MyPy Check
56+
pyrefly-check:
57+
name: Pyrefly Check
5858
runs-on: ubuntu-24.04
5959
steps:
6060
# Common steps:
@@ -74,5 +74,5 @@ jobs:
7474

7575
# Job-specific step(s):
7676

77-
- name: Run mypy
78-
run: poetry run mypy --config-file mypy.ini airbyte_cdk
77+
- name: Run pyrefly
78+
run: poetry run pyrefly check airbyte_cdk

airbyte_cdk/cli/airbyte_cdk/_secrets.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,12 +261,15 @@ def list_(
261261
table.add_column("Labels", justify="left", style="magenta", overflow="fold")
262262
table.add_column("Created", justify="left", style="blue", overflow="fold")
263263
for secret in secrets:
264+
# pyrefly: ignore # missing-attribute
264265
full_secret_name = secret.name
265266
secret_name = _extract_secret_name(full_secret_name)
266267
secret_url = _get_secret_url(secret_name, gcp_project_id)
267268
table.add_row(
268269
f"[link={secret_url}]{secret_name}[/link]",
270+
# pyrefly: ignore # missing-attribute
269271
"\n".join([f"{k}={v}" for k, v in secret.labels.items()]),
272+
# pyrefly: ignore # missing-attribute
270273
str(secret.create_time),
271274
)
272275

@@ -360,6 +363,7 @@ def _write_secret_file(
360363
"""
361364
# List all enabled versions of the secret.
362365
response = client.list_secret_versions(
366+
# pyrefly: ignore # missing-attribute
363367
request={"parent": secret.name, "filter": "state:ENABLED"}
364368
)
365369

@@ -368,6 +372,7 @@ def _write_secret_file(
368372
versions = list(response)
369373

370374
if not versions:
375+
# pyrefly: ignore # missing-attribute
371376
secret_name = _extract_secret_name(secret.name)
372377
raise ConnectorSecretWithNoValidVersionsError(
373378
connector_name=connector_name,
@@ -404,7 +409,9 @@ def _get_secret_filepath(
404409
secret: Secret, # type: ignore
405410
) -> Path:
406411
"""Get the file path for a secret based on its labels."""
412+
# pyrefly: ignore # missing-attribute, unsupported-operation
407413
if secret.labels and "filename" in secret.labels:
414+
# pyrefly: ignore # index-error
408415
return secrets_dir / f"{secret.labels['filename']}.json"
409416

410417
return secrets_dir / "config.json" # Default filename
@@ -468,6 +475,7 @@ def _print_ci_secrets_masks_for_config(
468475
_print_ci_secrets_masks_for_config(item)
469476

470477
if isinstance(config, dict):
478+
# pyrefly: ignore # bad-assignment
471479
for key, value in config.items():
472480
if _is_secret_property(key):
473481
logger.debug(f"Masking secret for config key: {key}")

airbyte_cdk/config_observation.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ class ConfigObserver:
6767
"""
6868

6969
def set_config(self, config: ObservedDict) -> None:
70+
# pyrefly: ignore # implicitly-defined-attribute
7071
self.config = config
7172

7273
def update(self) -> None:
@@ -99,6 +100,7 @@ def create_connector_config_control_message(config: MutableMapping[str, Any]) ->
99100
control_message = AirbyteControlMessage(
100101
type=OrchestratorType.CONNECTOR_CONFIG,
101102
emitted_at=time.time() * 1000,
103+
# pyrefly: ignore # bad-argument-type
102104
connectorConfig=AirbyteControlConnectorConfigMessage(config=config),
103105
)
104106
return AirbyteMessage(type=Type.CONTROL, control=control_message)

airbyte_cdk/connector_builder/test_reader/reader.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,7 @@ def _check_record_limit(self, record_limit: Optional[int] = None) -> int:
249249
if record_limit is None:
250250
record_limit = self._max_record_limit
251251
else:
252+
# pyrefly: ignore # no-matching-overload
252253
record_limit = min(record_limit, self._max_record_limit)
253254

254255
return record_limit

airbyte_cdk/destinations/vector_db_based/config.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ class SeparatorSplitterConfigModel(BaseModel):
2424
description="Whether to keep the separator in the resulting chunks",
2525
)
2626

27+
# pyrefly: ignore # bad-override
2728
class Config(OneOfOptionConfig):
2829
title = "By Separator"
2930
description = "Split the text by the list of separators until the chunk size is reached, using the earlier mentioned separators where possible. This is useful for splitting text fields by paragraphs, sentences, words, etc."
@@ -40,6 +41,7 @@ class MarkdownHeaderSplitterConfigModel(BaseModel):
4041
ge=1,
4142
)
4243

44+
# pyrefly: ignore # bad-override
4345
class Config(OneOfOptionConfig):
4446
title = "By Markdown header"
4547
description = "Split the text by Markdown headers down to the specified header level. If the chunk size fits multiple sections, they will be combined into a single chunk."
@@ -71,6 +73,7 @@ class CodeSplitterConfigModel(BaseModel):
7173
],
7274
)
7375

76+
# pyrefly: ignore # bad-override
7477
class Config(OneOfOptionConfig):
7578
title = "By Programming Language"
7679
description = "Split the text by suitable delimiters based on the programming language. This is useful for splitting code into chunks."
@@ -129,6 +132,7 @@ class ProcessingConfigModel(BaseModel):
129132
description="List of fields to rename. Not applicable for nested fields, but can be used to rename fields already flattened via dot notation.",
130133
)
131134

135+
# pyrefly: ignore # bad-override
132136
class Config:
133137
schema_extra = {"group": "processing"}
134138

@@ -137,6 +141,7 @@ class OpenAIEmbeddingConfigModel(BaseModel):
137141
mode: Literal["openai"] = Field("openai", const=True)
138142
openai_key: str = Field(..., title="OpenAI API key", airbyte_secret=True)
139143

144+
# pyrefly: ignore # bad-override
140145
class Config(OneOfOptionConfig):
141146
title = "OpenAI"
142147
description = "Use the OpenAI API to embed text. This option is using the text-embedding-ada-002 model with 1536 embedding dimensions."
@@ -164,6 +169,7 @@ class OpenAICompatibleEmbeddingConfigModel(BaseModel):
164169
examples=[1536, 384],
165170
)
166171

172+
# pyrefly: ignore # bad-override
167173
class Config(OneOfOptionConfig):
168174
title = "OpenAI-compatible"
169175
description = "Use a service that's compatible with the OpenAI API to embed text."
@@ -191,6 +197,7 @@ class AzureOpenAIEmbeddingConfigModel(BaseModel):
191197
examples=["your-resource-name"],
192198
)
193199

200+
# pyrefly: ignore # bad-override
194201
class Config(OneOfOptionConfig):
195202
title = "Azure OpenAI"
196203
description = "Use the Azure-hosted OpenAI API to embed text. This option is using the text-embedding-ada-002 model with 1536 embedding dimensions."
@@ -200,6 +207,7 @@ class Config(OneOfOptionConfig):
200207
class FakeEmbeddingConfigModel(BaseModel):
201208
mode: Literal["fake"] = Field("fake", const=True)
202209

210+
# pyrefly: ignore # bad-override
203211
class Config(OneOfOptionConfig):
204212
title = "Fake"
205213
description = "Use a fake embedding made out of random vectors with 1536 embedding dimensions. This is useful for testing the data pipeline without incurring any costs."
@@ -221,6 +229,7 @@ class FromFieldEmbeddingConfigModel(BaseModel):
221229
examples=[1536, 384],
222230
)
223231

232+
# pyrefly: ignore # bad-override
224233
class Config(OneOfOptionConfig):
225234
title = "From Field"
226235
description = "Use a field in the record as the embedding. This is useful if you already have an embedding for your data and want to store it in the vector store."
@@ -231,6 +240,7 @@ class CohereEmbeddingConfigModel(BaseModel):
231240
mode: Literal["cohere"] = Field("cohere", const=True)
232241
cohere_key: str = Field(..., title="Cohere API key", airbyte_secret=True)
233242

243+
# pyrefly: ignore # bad-override
234244
class Config(OneOfOptionConfig):
235245
title = "Cohere"
236246
description = "Use the Cohere API to embed text."
@@ -273,6 +283,7 @@ class VectorDBConfigModel(BaseModel):
273283
description="Do not store the text that gets embedded along with the vector and the metadata in the destination. If set to true, only the vector and the metadata will be stored - in this case raw text for LLM use cases needs to be retrieved from another source.",
274284
)
275285

286+
# pyrefly: ignore # bad-override
276287
class Config:
277288
title = "Destination Config"
278289
schema_extra = {

airbyte_cdk/destinations/vector_db_based/document_processor.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ class DocumentProcessor:
7171
@staticmethod
7272
def check_config(config: ProcessingConfigModel) -> Optional[str]:
7373
if config.text_splitter is not None and config.text_splitter.mode == "separator":
74+
# pyrefly: ignore # missing-attribute
7475
for s in config.text_splitter.separators:
7576
try:
7677
separator = json.loads(s)
@@ -85,21 +86,25 @@ def _get_text_splitter(
8586
chunk_size: int,
8687
chunk_overlap: int,
8788
splitter_config: Optional[TextSplitterConfigModel],
89+
# pyrefly: ignore # bad-return
8890
) -> RecursiveCharacterTextSplitter:
8991
if splitter_config is None:
9092
splitter_config = SeparatorSplitterConfigModel(mode="separator")
9193
if splitter_config.mode == "separator":
9294
return RecursiveCharacterTextSplitter.from_tiktoken_encoder(
9395
chunk_size=chunk_size,
9496
chunk_overlap=chunk_overlap,
97+
# pyrefly: ignore # missing-attribute
9598
separators=[json.loads(s) for s in splitter_config.separators],
99+
# pyrefly: ignore # missing-attribute
96100
keep_separator=splitter_config.keep_separator,
97101
disallowed_special=(),
98102
)
99103
if splitter_config.mode == "markdown":
100104
return RecursiveCharacterTextSplitter.from_tiktoken_encoder(
101105
chunk_size=chunk_size,
102106
chunk_overlap=chunk_overlap,
107+
# pyrefly: ignore # missing-attribute
103108
separators=headers_to_split_on[: splitter_config.split_level],
104109
is_separator_regex=True,
105110
keep_separator=True,
@@ -110,6 +115,7 @@ def _get_text_splitter(
110115
chunk_size=chunk_size,
111116
chunk_overlap=chunk_overlap,
112117
separators=RecursiveCharacterTextSplitter.get_separators_for_language(
118+
# pyrefly: ignore # bad-argument-type, missing-attribute
113119
Language(splitter_config.language)
114120
),
115121
disallowed_special=(),
@@ -218,6 +224,7 @@ def _remap_field_names(self, fields: Dict[str, Any]) -> Dict[str, Any]:
218224
new_fields = fields.copy()
219225
for mapping in self.field_name_mappings:
220226
if mapping.from_field in new_fields:
227+
# pyrefly: ignore # missing-attribute
221228
new_fields[mapping.to_field] = new_fields.pop(mapping.from_field)
222229

223230
return new_fields

airbyte_cdk/destinations/vector_db_based/writer.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,11 @@ def __init__(
4242
self._init_batch()
4343

4444
def _init_batch(self) -> None:
45+
# pyrefly: ignore # implicitly-defined-attribute
4546
self.chunks: Dict[Tuple[str, str], List[Chunk]] = defaultdict(list)
47+
# pyrefly: ignore # implicitly-defined-attribute
4648
self.ids_to_delete: Dict[Tuple[str, str], List[str]] = defaultdict(list)
49+
# pyrefly: ignore # implicitly-defined-attribute
4750
self.number_of_chunks = 0
4851

4952
def _convert_to_document(self, chunk: Chunk) -> Document:
@@ -73,6 +76,7 @@ def _process_batch(self) -> None:
7376
def write(
7477
self, configured_catalog: ConfiguredAirbyteCatalog, input_messages: Iterable[AirbyteMessage]
7578
) -> Iterable[AirbyteMessage]:
79+
# pyrefly: ignore # implicitly-defined-attribute
7680
self.processor = DocumentProcessor(self.processing_config, configured_catalog)
7781
self.indexer.pre_sync(configured_catalog)
7882
for message in input_messages:
@@ -82,6 +86,7 @@ def write(
8286
self._process_batch()
8387
yield message
8488
elif message.type == Type.RECORD:
89+
# pyrefly: ignore # bad-argument-type
8590
record_chunks, record_id_to_delete = self.processor.process(message.record)
8691
self.chunks[
8792
( # type: ignore [index] # expected "tuple[str, str]", got "tuple[str | Any | None, str | Any]"

airbyte_cdk/entrypoint.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -363,7 +363,9 @@ def extract_config(cls, args: List[str]) -> Optional[Any]:
363363
return None
364364

365365
def _emit_queued_messages(self, source: Source) -> Iterable[AirbyteMessage]:
366+
# pyrefly: ignore # missing-attribute
366367
if hasattr(source, "message_repository") and source.message_repository:
368+
# pyrefly: ignore # missing-attribute
367369
yield from source.message_repository.consume_queue()
368370
return
369371

@@ -373,6 +375,7 @@ def launch(source: Source, args: List[str]) -> None:
373375
parsed_args = source_entrypoint.parse_args(args)
374376
# temporarily removes the PrintBuffer because we're seeing weird print behavior for concurrent syncs
375377
# Refer to: https://github.com/airbytehq/oncall/issues/6235
378+
# pyrefly: ignore # bad-context-manager
376379
with PRINT_BUFFER:
377380
for message in source_entrypoint.run(parsed_args):
378381
# simply printing is creating issues for concurrent CDK as Python uses two different instructions to print: one for the message and
@@ -388,6 +391,7 @@ def _init_internal_request_filter() -> None:
388391

389392
@wraps(wrapped_fn)
390393
def filtered_send(self: Any, request: PreparedRequest, **kwargs: Any) -> Response:
394+
# pyrefly: ignore # no-matching-overload
391395
parsed_url = urlparse(request.url)
392396

393397
if parsed_url.scheme not in VALID_URL_SCHEMES:

airbyte_cdk/logger.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ def format(self, record: logging.LogRecord) -> str:
7979
message = super().format(record)
8080
message = filter_secrets(message)
8181
log_message = AirbyteMessage(
82+
# pyrefly: ignore # bad-argument-type
8283
type=Type.LOG, log=AirbyteLogMessage(level=airbyte_level, message=message)
8384
)
8485
return orjson.dumps(AirbyteMessageSerializer.dump(log_message)).decode()
@@ -100,6 +101,7 @@ def log_by_prefix(msg: str, default_level: str) -> Tuple[int, str]:
100101
split_line = msg.split()
101102
first_word = next(iter(split_line), None)
102103
if first_word in valid_log_types:
104+
# pyrefly: ignore # no-matching-overload
103105
log_level = logging.getLevelName(first_word)
104106
rendered_message = " ".join(split_line[1:])
105107
else:

airbyte_cdk/sources/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
# might not need dpath leading to longer initialization time.
1818
# There is a downside in using dpath as a library since the options are global: if we have two pieces of code that want different options,
1919
# this will not be thread-safe.
20+
# pyrefly: ignore # bad-assignment
2021
dpath.options.ALLOW_EMPTY_STRING_KEYS = True
2122

2223
__all__ = [

0 commit comments

Comments
 (0)