1- import fnmatch
21from dataclasses import dataclass , field
32from pathlib import Path
43from time import time
54from typing import TYPE_CHECKING , Any , Generator , Optional
65from urllib .parse import urlparse
76from uuid import NAMESPACE_DNS , uuid5
87
9- import requests
108from pydantic import Field , Secret , field_validator
119
1210from unstructured_ingest .utils .dep_check import requires_dependencies
3432 from github import ContentFile , GitTreeElement , Repository
3533 from github import Github as GithubClient
3634 from github .GithubException import GithubException
35+ from requests import HTTPError
3736
3837CONNECTOR_TYPE = "github"
3938
@@ -77,7 +76,7 @@ def wrap_github_exception(self, e: "GithubException") -> Exception:
7776 logger .debug (f"unhandled github error: { e } " )
7877 return e
7978
80- def wrap_http_error (self , e : requests . HTTPError ) -> Exception :
79+ def wrap_http_error (self , e : " HTTPError" ) -> Exception :
8180 status_code = e .response .status_code
8281 if status_code == 401 :
8382 return UserAuthError (f"Unauthorized access to Github: { e .response .text } " )
@@ -88,12 +87,14 @@ def wrap_http_error(self, e: requests.HTTPError) -> Exception:
8887 logger .debug (f"unhandled http error: { e } " )
8988 return e
9089
@requires_dependencies(["requests"], extras="github")
def wrap_error(self, e: Exception) -> Exception:
    """Route an exception to the matching specialised wrapper.

    Args:
        e: The exception to inspect.

    Returns:
        The result of ``wrap_github_exception`` when ``e`` is a
        ``GithubException``, the result of ``wrap_http_error`` when ``e``
        is a requests ``HTTPError``, and ``e`` itself (after a debug log
        entry) for any other exception type.
    """
    # Imported inside the function rather than at module level;
    # presumably because both packages are optional extras -- confirm.
    from github.GithubException import GithubException
    from requests import HTTPError

    # Checked in order: GithubException first, then HTTPError.
    dispatch = (
        (GithubException, self.wrap_github_exception),
        (HTTPError, self.wrap_http_error),
    )
    for error_type, wrapper in dispatch:
        if isinstance(e, error_type):
            return wrapper(e=e)

    # No specialised wrapper applies: log it and hand it back untouched.
    logger.debug(f"unhandled error: {e}")
    return e
@@ -106,11 +107,6 @@ class GithubIndexerConfig(IndexerConfig):
106107 recursive : bool = Field (
107108 description = "Recursively index all files in the repository" , default = True
108109 )
109- file_glob : Optional [list [str ]] = Field (
110- default = None ,
111- description = "file globs to limit which types of " "files are accepted" ,
112- examples = ["*.pdf" , "*.html" ],
113- )
114110
115111
116112@dataclass
@@ -137,19 +133,8 @@ def list_files(self) -> list["GitTreeElement"]:
137133 file_elements = [
138134 element for element in git_tree .tree if element .size is not None and element .size > 0
139135 ]
140- if self .index_config .file_glob :
141- file_elements = self .filter_files (files = file_elements )
142136 return file_elements
143137
144- def filter_files (self , files : list ["GitTreeElement" ]) -> list ["GitTreeElement" ]:
145- filtered_files = []
146- for file in files :
147- path = file .path
148- for pattern in self .index_config .file_glob :
149- if fnmatch .filter ([path ], pattern ):
150- filtered_files .append (file )
151- return filtered_files
152-
153138 def convert_element (self , element : "GitTreeElement" ) -> FileData :
154139 full_path = (
155140 f"{ self .connection_config .get_full_url ()} /blob/{ self .get_branch ()} /{ element .path } "
@@ -204,7 +189,10 @@ def get_file(self, file_data: FileData) -> "ContentFile":
204189 raise UserError (f"File not found: { path } " )
205190 return content_file
206191
192+ @requires_dependencies (["requests" ], extras = "github" )
207193 def get_contents (self , content_file : "ContentFile" ) -> bytes :
194+ import requests
195+
208196 if content_file .decoded_content :
209197 return content_file .decoded_content
210198 download_url = content_file .download_url
0 commit comments