Skip to content

Commit 86c4291

Browse files
H-Huang and meta-codesync[bot]
authored and committed
Fix CI and open telemetry API usage (#306)
Summary: Pull Request resolved: #306 TorchFT CI is installing the latest `opentelemetry-sdk`, which caused breakages in some of the APIs used in otel.py. This PR updates `otel.py` to use the correct APIs. This PR also fixes TorchFT lint (the pyre check was failing). This should resolve the TorchFT CI failures: https://github.com/meta-pytorch/torchft/actions/runs/19983176027/job/57313269930 This is also causing downstream CI failures in torchtitan: https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_torchft.yaml Pull Request resolved: #305 Reviewed By: tianyu-l Differential Revision: D89743414 Pulled By: H-Huang fbshipit-source-id: c066fe535b332ba94b918f5b684595c8de8b6740
1 parent ee51839 commit 86c4291

File tree

4 files changed

+36
-19
lines changed

4 files changed

+36
-19
lines changed

pyproject.toml

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -14,9 +14,9 @@ dynamic = ["version"]
1414
readme = "README.md"
1515
dependencies = [
1616
"torch>=2.7",
17-
"opentelemetry-exporter-otlp-proto-http>=1.37.0",
18-
"opentelemetry-sdk>=1.37.0",
19-
"opentelemetry-api>=1.37.0",
17+
"opentelemetry-exporter-otlp-proto-http>=1.39.0",
18+
"opentelemetry-sdk>=1.39.0",
19+
"opentelemetry-api>=1.39.0",
2020
]
2121

2222
[project.urls]

torchft/optim.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -59,5 +59,5 @@ def param_groups(self) -> List[Dict[str, Any]]:
5959
return self.optim.param_groups
6060

6161
@property
62-
def state(self) -> Mapping[torch.Tensor, Any]: # pyre-fixme[3]
62+
def state(self) -> Mapping[torch.Tensor, object]:
6363
return self.optim.state

torchft/otel.py

Lines changed: 27 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -8,37 +8,53 @@
88
import logging
99
import os
1010
import time
11-
from typing import List, Sequence
11+
from typing import Any, List, Sequence, TYPE_CHECKING
1212

1313
from opentelemetry._logs import set_logger_provider
1414

1515
from opentelemetry.exporter.otlp.proto.http._log_exporter import OTLPLogExporter
1616
from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler
17-
from opentelemetry.sdk._logs._internal import LogData
18-
from opentelemetry.sdk._logs.export import (
19-
BatchLogRecordProcessor,
20-
ConsoleLogExporter,
21-
LogExporter,
22-
)
17+
from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
2318
from opentelemetry.sdk.resources import Resource
2419

20+
# These types are available in opentelemetry-sdk but Pyre's type stubs
21+
# don't include them. We import them at runtime and provide type aliases for
22+
# static type checking.
23+
if TYPE_CHECKING:
24+
# pyre-fixme[33]: Aliasing to Any is prohibited. opentelemetry-sdk lacks type stubs.
25+
ReadableLogRecord = Any
26+
# pyre-fixme[33]: Aliasing to Any is prohibited. opentelemetry-sdk lacks type stubs.
27+
LogRecordExporter = Any
28+
# pyre-fixme[33]: Aliasing to Any is prohibited. opentelemetry-sdk lacks type stubs.
29+
LogRecordExportResult = Any
30+
# pyre-fixme[33]: Aliasing to Any is prohibited. opentelemetry-sdk lacks type stubs.
31+
ConsoleLogRecordExporter = Any
32+
else:
33+
from opentelemetry.sdk._logs import ReadableLogRecord
34+
from opentelemetry.sdk._logs.export import (
35+
ConsoleLogRecordExporter,
36+
LogRecordExporter,
37+
LogRecordExportResult,
38+
)
39+
2540
_LOGGER_PROVIDER: dict[str, LoggerProvider] = {}
2641
# Path to the file containing OTEL resource attributes
2742
TORCHFT_OTEL_RESOURCE_ATTRIBUTES_JSON = "TORCHFT_OTEL_RESOURCE_ATTRIBUTES_JSON"
2843

2944

30-
class TeeLogExporter(LogExporter):
45+
class TeeLogExporter(LogRecordExporter):
3146
"""Exporter that writes to multiple exporters."""
3247

3348
def __init__(
3449
self,
35-
exporters: List[LogExporter],
50+
exporters: List[LogRecordExporter],
3651
) -> None:
3752
self._exporters = exporters
3853

39-
def export(self, batch: Sequence[LogData]) -> None:
54+
def export(self, batch: Sequence[ReadableLogRecord]) -> LogRecordExportResult:
4055
for e in self._exporters:
4156
e.export(batch)
57+
return LogRecordExportResult.SUCCESS
4258

4359
def shutdown(self) -> None:
4460
for e in self._exporters:
@@ -49,8 +65,6 @@ def setup_logger(name: str) -> None:
4965
if os.environ.get("TORCHFT_USE_OTEL", "false") == "false":
5066
return
5167

52-
global _LOGGER_PROVIDER
53-
5468
if name in _LOGGER_PROVIDER:
5569
return
5670

@@ -70,7 +84,7 @@ def setup_logger(name: str) -> None:
7084

7185
exporter = TeeLogExporter(
7286
exporters=[
73-
ConsoleLogExporter(),
87+
ConsoleLogRecordExporter(),
7488
OTLPLogExporter(
7589
timeout=5,
7690
),

torchft/process_group_test.py

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -76,7 +76,7 @@ def _test_pg(
7676
]
7777
tensor_list = [torch.empty_like(input_tensor)]
7878

79-
def check_tensors(arg: Any) -> None: # pyre-ignore[2]
79+
def check_tensors(arg: object) -> None:
8080
"""Recursively check tensors for expected shape and dtype."""
8181
if isinstance(arg, torch.Tensor):
8282
assert arg.dtype == dtype, f"Output dtype mismatch: {arg.dtype} != {dtype}"
@@ -738,7 +738,10 @@ def test_functional_collectives(self) -> None:
738738

739739
self.assertEqual(pg.group_name, str(dist.get_pg_count() - 1))
740740

741-
self.assertIs(_resolve_process_group(pg.group_name), pg)
741+
self.assertIs(
742+
_resolve_process_group(pg.group_name), # pyre-ignore[6]: GroupName vs str
743+
pg,
744+
)
742745

743746
try:
744747
t = torch.zeros(10)

0 commit comments

Comments
 (0)