Skip to content

Commit 7b2987e

Browse files
authored
feat/env var user agent (#452)
* use env var for user agent with default
* expose github connector
* update changelog
* fix github
* fix version
* fix github
* drop user agent as field
1 parent 1fbc2e9 commit 7b2987e

File tree

6 files changed

+29
-25
lines changed

6 files changed

+29
-25
lines changed

CHANGELOG.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,13 @@
1+
## 0.6.4
2+
3+
### Features
4+
5+
* **Support setting the user agent through the `UNSTRUCTURED_USER_AGENT` environment variable**
6+
7+
### Fixes
8+
9+
* **Expose GitHub connector**
10+
111
## 0.6.3
212

313
### Features

unstructured_ingest/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.6.3" # pragma: no cover
1+
__version__ = "0.6.4" # pragma: no cover

unstructured_ingest/v2/processes/connectors/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@
3131
from .delta_table import delta_table_destination_entry
3232
from .discord import CONNECTOR_TYPE as DISCORD_CONNECTOR_TYPE
3333
from .discord import discord_source_entry
34+
from .github import CONNECTOR_TYPE as GITHUB_CONNECTOR_TYPE
35+
from .github import github_source_entry
3436
from .gitlab import CONNECTOR_TYPE as GITLAB_CONNECTOR_TYPE
3537
from .gitlab import gitlab_source_entry
3638
from .google_drive import CONNECTOR_TYPE as GOOGLE_DRIVE_CONNECTOR_TYPE
@@ -124,3 +126,4 @@
124126
add_source_entry(source_type=JIRA_CONNECTOR_TYPE, entry=jira_source_entry)
125127

126128
add_source_entry(source_type=ZENDESK_CONNECTOR_TYPE, entry=zendesk_source_entry)
129+
add_source_entry(source_type=GITHUB_CONNECTOR_TYPE, entry=github_source_entry)

unstructured_ingest/v2/processes/connectors/databricks/volumes.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,6 @@ class DatabricksVolumesConnectionConfig(ConnectionConfig, ABC):
6868
"Databricks workspace endpoint or the "
6969
"Databricks accounts endpoint.",
7070
)
71-
user_agent: str = "unstructuredio_oss"
7271

7372
def wrap_error(self, e: Exception) -> Exception:
7473
from databricks.sdk.errors.base import DatabricksError
@@ -101,7 +100,9 @@ def get_client(self) -> "WorkspaceClient":
101100
config = Config(
102101
host=self.host,
103102
**self.access_config.get_secret_value().model_dump(),
104-
).with_user_agent_extra("PyDatabricksSdk", self.user_agent)
103+
).with_user_agent_extra(
104+
"PyDatabricksSdk", os.getenv("UNSTRUCTURED_USER_AGENT", "unstructuredio_oss")
105+
)
105106

106107
return WorkspaceClient(config=config)
107108

unstructured_ingest/v2/processes/connectors/github.py

Lines changed: 8 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,10 @@
1-
import fnmatch
21
from dataclasses import dataclass, field
32
from pathlib import Path
43
from time import time
54
from typing import TYPE_CHECKING, Any, Generator, Optional
65
from urllib.parse import urlparse
76
from uuid import NAMESPACE_DNS, uuid5
87

9-
import requests
108
from pydantic import Field, Secret, field_validator
119

1210
from unstructured_ingest.utils.dep_check import requires_dependencies
@@ -34,6 +32,7 @@
3432
from github import ContentFile, GitTreeElement, Repository
3533
from github import Github as GithubClient
3634
from github.GithubException import GithubException
35+
from requests import HTTPError
3736

3837
CONNECTOR_TYPE = "github"
3938

@@ -77,7 +76,7 @@ def wrap_github_exception(self, e: "GithubException") -> Exception:
7776
logger.debug(f"unhandled github error: {e}")
7877
return e
7978

80-
def wrap_http_error(self, e: requests.HTTPError) -> Exception:
79+
def wrap_http_error(self, e: "HTTPError") -> Exception:
8180
status_code = e.response.status_code
8281
if status_code == 401:
8382
return UserAuthError(f"Unauthorized access to Github: {e.response.text}")
@@ -88,12 +87,14 @@ def wrap_http_error(self, e: requests.HTTPError) -> Exception:
8887
logger.debug(f"unhandled http error: {e}")
8988
return e
9089

90+
@requires_dependencies(["requests"], extras="github")
9191
def wrap_error(self, e: Exception) -> Exception:
9292
from github.GithubException import GithubException
93+
from requests import HTTPError
9394

9495
if isinstance(e, GithubException):
9596
return self.wrap_github_exception(e=e)
96-
if isinstance(e, requests.HTTPError):
97+
if isinstance(e, HTTPError):
9798
return self.wrap_http_error(e=e)
9899
logger.debug(f"unhandled error: {e}")
99100
return e
@@ -106,11 +107,6 @@ class GithubIndexerConfig(IndexerConfig):
106107
recursive: bool = Field(
107108
description="Recursively index all files in the repository", default=True
108109
)
109-
file_glob: Optional[list[str]] = Field(
110-
default=None,
111-
description="file globs to limit which types of " "files are accepted",
112-
examples=["*.pdf", "*.html"],
113-
)
114110

115111

116112
@dataclass
@@ -137,19 +133,8 @@ def list_files(self) -> list["GitTreeElement"]:
137133
file_elements = [
138134
element for element in git_tree.tree if element.size is not None and element.size > 0
139135
]
140-
if self.index_config.file_glob:
141-
file_elements = self.filter_files(files=file_elements)
142136
return file_elements
143137

144-
def filter_files(self, files: list["GitTreeElement"]) -> list["GitTreeElement"]:
145-
filtered_files = []
146-
for file in files:
147-
path = file.path
148-
for pattern in self.index_config.file_glob:
149-
if fnmatch.filter([path], pattern):
150-
filtered_files.append(file)
151-
return filtered_files
152-
153138
def convert_element(self, element: "GitTreeElement") -> FileData:
154139
full_path = (
155140
f"{self.connection_config.get_full_url()}/blob/{self.get_branch()}/{element.path}"
@@ -204,7 +189,10 @@ def get_file(self, file_data: FileData) -> "ContentFile":
204189
raise UserError(f"File not found: {path}")
205190
return content_file
206191

192+
@requires_dependencies(["requests"], extras="github")
207193
def get_contents(self, content_file: "ContentFile") -> bytes:
194+
import requests
195+
208196
if content_file.decoded_content:
209197
return content_file.decoded_content
210198
download_url = content_file.download_url

unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import json
2+
import os
23
from contextlib import contextmanager
34
from dataclasses import dataclass
45
from typing import TYPE_CHECKING, Any, Generator, Optional
@@ -42,7 +43,6 @@ class DatabricksDeltaTablesConnectionConfig(SQLConnectionConfig):
4243
access_config: Secret[DatabricksDeltaTablesAccessConfig]
4344
server_hostname: str = Field(description="server hostname connection config value")
4445
http_path: str = Field(description="http path connection config value")
45-
user_agent: str = "unstructuredio_oss"
4646

4747
@requires_dependencies(["databricks"], extras="databricks-delta-tables")
4848
def get_credentials_provider(self) -> "oauth_service_principal":
@@ -86,7 +86,9 @@ def get_connection(self, **connect_kwargs) -> Generator["DeltaTableConnection",
8686
from databricks.sql import connect
8787

8888
connect_kwargs = connect_kwargs or {}
89-
connect_kwargs["_user_agent_entry"] = self.user_agent
89+
connect_kwargs["_user_agent_entry"] = os.getenv(
90+
"UNSTRUCTURED_USER_AGENT", "unstructuredio_oss"
91+
)
9092
connect_kwargs["server_hostname"] = connect_kwargs.get(
9193
"server_hostname", self.server_hostname
9294
)

0 commit comments

Comments
 (0)