Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .env.sample
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ CLIENT_ID='[Your Azure Client ID]'
CLIENT_SECRET='[Your Azure Client Secret]'

# SharePoint Site Configuration
SITE_DOMAIN='[Your SharePoint Site Domain]'
SITE_HOSTNAME='[Your SharePoint Site Hostname]'
SITE_NAME='[Your SharePoint Site Name]'

# Azure AI Search Service Configuration
Expand All @@ -16,4 +16,4 @@ SEARCH_ADMIN_API_KEY='[Your Azure AI Search Admin API Key]'
#Optional settings for Vectorization of Content
OPEN_API_KEY='[Your Azure OpenAI API Key]'
OPEN_API_BASE='[Your Azure OpenAI Base URL -- https://XXX.openai.azure.com/]'
OPEN_API_MODEL='[Your Azure OpenAI Model Name]'
OPEN_API_DEPLOYMENT_NAME='[Your Azure OpenAI Model Name]'
14 changes: 7 additions & 7 deletions 01-indexing-content.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@
"CLIENT_SECRET='[Your Azure Client Secret]'\n",
"\n",
"# SharePoint Site Configuration\n",
"SITE_DOMAIN='[Your SharePoint Site Domain]'\n",
"SITE_HOSTNAME='[Your SharePoint Site Domain]'\n",
"SITE_NAME='[Your SharePoint Site Name]'\n",
"\n",
"# Azure AI Search Service Configuration\n",
Expand All @@ -171,7 +171,7 @@
"Replace the placeholders (e.g., [Your Azure Tenant ID]) with your actual values.\n",
"\n",
"+ `TENANT_ID`, `CLIENT_ID`, and `CLIENT_SECRET` from your register Application. [Detailed steps here](README.md)\n",
"- `SITE_DOMAIN` and `SITE_NAME` specify the SharePoint site from which data will be extracted.\n",
"- `SITE_HOSTNAME` and `SITE_NAME` specify the SharePoint site from which data will be extracted.\n",
"+ `SEARCH_SERVICE_ENDPOINT`, `SEARCH_INDEX_NAME`, and `SEARCH_ADMIN_API_KEY` are used to configure the Azure AI Search service.\n",
"\n",
"> 📌 **Note**\n",
Expand Down Expand Up @@ -376,7 +376,7 @@
"\n",
"# Get the Site ID for the specified SharePoint site\n",
"site_id = client_scrapping.get_site_id(\n",
" site_domain=os.environ[\"SITE_DOMAIN\"], site_name=os.environ[\"SITE_NAME\"]\n",
" site_hostname=os.environ[\"SITE_HOSTNAME\"], site_name=os.environ[\"SITE_NAME\"]\n",
")\n",
"\n",
"# Get the Drive ID associated with the Site ID\n",
Expand Down Expand Up @@ -432,7 +432,7 @@
"source": [
"# Download and process all `.docx` and `.pdf` files from a specific Site ID.\n",
"files_from_root_folder = client_scrapping.retrieve_sharepoint_files_content(\n",
" site_domain=os.environ[\"SITE_DOMAIN\"],\n",
" site_hostname=os.environ[\"SITE_HOSTNAME\"],\n",
" site_name=os.environ[\"SITE_NAME\"],\n",
" file_formats=[\"docx\", \"pdf\"],\n",
")"
Expand All @@ -459,7 +459,7 @@
"source": [
"# Download and process only `.docx` files from a specific SharePoint Site modified or uploaded in last 60 min.\n",
"files_from_root_folder_last_60_min = client_scrapping.retrieve_sharepoint_files_content(\n",
" site_domain=os.environ[\"SITE_DOMAIN\"],\n",
" site_hostname=os.environ[\"SITE_HOSTNAME\"],\n",
" site_name=os.environ[\"SITE_NAME\"],\n",
" file_formats=[\"docx\"],\n",
" minutes_ago=60,\n",
Expand Down Expand Up @@ -487,7 +487,7 @@
"source": [
"# Download and process files from a specific folder within a SharePoint site.\n",
"selected_files_content = client_scrapping.retrieve_sharepoint_files_content(\n",
" site_domain=os.environ[\"SITE_DOMAIN\"],\n",
" site_hostname=os.environ[\"SITE_HOSTNAME\"],\n",
" site_name=os.environ[\"SITE_NAME\"],\n",
" folder_path=\"/test/test2/test3/\",\n",
")"
Expand All @@ -513,7 +513,7 @@
"source": [
"# Download and process a specific file within a SharePoint site.\n",
"selected_file_content = client_scrapping.retrieve_sharepoint_files_content(\n",
" site_domain=os.environ[\"SITE_DOMAIN\"],\n",
" site_hostname=os.environ[\"SITE_HOSTNAME\"],\n",
" site_name=os.environ[\"SITE_NAME\"],\n",
" folder_path=\"/test/test2/test3/\",\n",
" file_names=[\"test3.docx\"],\n",
Expand Down
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ client_extractor.msgraph_auth()
```python

# Use the `retrieve_sharepoint_files_content` method to fetch file content and metadata from SharePoint
files = client_extractor.retrieve_sharepoint_files_content(site_domain=SITE_DOMAIN, site_name=SITE_NAME, folder_path="/test/test2/test3/", minutes_ago=60)
files = client_extractor.retrieve_sharepoint_files_content(site_hostname=SITE_HOSTNAME, site_name=SITE_NAME, folder_path="/test/test2/test3/", minutes_ago=60)

# The method returns a list of dictionaries, where each dictionary represents a file.
# Here's an example of the output `files`:
Expand Down Expand Up @@ -114,7 +114,7 @@ CLIENT_ID='[Your Azure Client ID]'
CLIENT_SECRET='[Your Azure Client Secret]'

# SharePoint Site Configuration
SITE_DOMAIN='[Your SharePoint Site Domain]'
SITE_HOSTNAME='[Your SharePoint Site Hostname]'
SITE_NAME='[Your SharePoint Site Name]'

# Azure AI Search Service Configuration
Expand All @@ -126,7 +126,7 @@ SEARCH_ADMIN_API_KEY='[Your Azure Search Admin API Key]'
Replace the placeholders (e.g., [Your Azure Tenant ID]) with your actual values.

+ `TENANT_ID`, `CLIENT_ID`, and `CLIENT_SECRET` are used for authentication with Azure Active Directory.
- `SITE_DOMAIN` and `SITE_NAME` specify the SharePoint site from which data will be extracted.
- `SITE_HOSTNAME` and `SITE_NAME` specify the SharePoint site from which data will be extracted.
+ `SEARCH_SERVICE_ENDPOINT`, `SEARCH_INDEX_NAME`, and `SEARCH_ADMIN_API_KEY` are used to configure the Azure AI Search service.

> 📌 **Note**
Expand Down
39 changes: 31 additions & 8 deletions gbb_ai/sharepoint_data_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,14 +172,37 @@ def _make_ms_graph_request(
logger.error(f"Error in _make_ms_graph_request: {err}")
raise

def get_org_sites(self, access_token: Optional[str] = None) -> Optional[str]:
    """
    Retrieve the ID of the first organization-level site from Microsoft Graph.

    Note: despite querying the collection endpoint, this returns a single
    site ID (the first entry that has an ``id``), not the full list.

    :param access_token: Optional bearer token; defaults to ``self.access_token``.
    :return: The first organization site ID found, or None on failure or
             when no site with an ID exists.
    """
    # Constant endpoint — no interpolation needed.
    endpoint = "https://graph.microsoft.com/v1.0/sites/"
    access_token = access_token or self.access_token

    try:
        logger.info("Getting a list of organization sites...")
        result = self._make_ms_graph_request(endpoint, access_token)
        # "value" may be absent or null in the Graph response; guard so we
        # don't raise TypeError iterating over None.
        for site in result.get("value") or []:
            site_id = site.get("id")
            if site_id:
                logger.info(f"Org site Id retrieved: {site_id}")
                return site_id
        logger.warning("No organization site with an ID was found.")
        return None
    except Exception as err:
        logger.error(f"Error retrieving org sites: {err}")
        return None

def get_site_id(
self, site_domain: str, site_name: str, access_token: Optional[str] = None
self, site_hostname: str, site_name: str, access_token: Optional[str] = None
) -> Optional[str]:
"""
Get the Site ID from Microsoft Graph API.
"""
endpoint = (
f"https://graph.microsoft.com/v1.0/sites/{site_domain}:/sites/{site_name}:/"
f"https://graph.microsoft.com/v1.0/sites/{site_hostname}:/sites/{site_name}:/"
)
access_token = access_token or self.access_token

Expand Down Expand Up @@ -504,7 +527,7 @@ def format_date(date_str):

def retrieve_sharepoint_files_content(
self,
site_domain: str,
site_hostname: str,
site_name: str,
folder_path: Optional[str] = None,
file_names: Optional[Union[str, List[str]]] = None,
Expand All @@ -514,7 +537,7 @@ def retrieve_sharepoint_files_content(
"""
Retrieve contents of files from a specified SharePoint location, optionally filtering by last modification time and file formats.

:param site_domain: The domain of the site in Microsoft Graph.
:param site_hostname: The hostname of the site in Microsoft Graph.
:param site_name: The name of the site in Microsoft Graph.
:param folder_path: Path to the folder within the drive, can include subfolders like 'test1/test2'.
:param file_names: Optional; the name or names of specific files to retrieve. If provided, only these files' content will be fetched.
Expand All @@ -525,7 +548,7 @@ def retrieve_sharepoint_files_content(
if self._are_required_variables_missing():
return None

site_id, drive_id = self._get_site_and_drive_ids(site_domain, site_name)
site_id, drive_id = self._get_site_and_drive_ids(site_hostname, site_name)
if not site_id or not drive_id:
return None

Expand Down Expand Up @@ -566,16 +589,16 @@ def _are_required_variables_missing(self) -> bool:
return False

def _get_site_and_drive_ids(
self, site_domain: str, site_name: str
self, site_hostname: str, site_name: str
) -> (Optional[str], Optional[str]):
"""
Retrieves the site ID and drive ID for a given site domain and site name.

:param site_domain: The domain of the site.
:param site_hostname: The hostname of the site.
:param site_name: The name of the site.
:return: A tuple containing the site ID and drive ID, or (None, None) if either ID could not be retrieved.
"""
site_id = self.get_site_id(site_domain, site_name)
site_id = self.get_site_id(site_hostname, site_name)
if not site_id:
logger.error("Failed to retrieve site_id")
return None, None
Expand Down
6 changes: 2 additions & 4 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,10 @@ requests>=2,<3
msal>=0.6.1,<2
python-docx
python-dotenv
#azure_search_documents==11.4.0b11
azure-search-documents==11.4.0b8
azure-search-documents==11.4.0b11
azure-ai-formrecognizer
openai==0.27.10
langchain
tiktoken
PyPDF2
openai==1.5.0
openai==1.7.0
tenacity
2 changes: 1 addition & 1 deletion vectors-01-create-index.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
"version": "3.12.0"
}
},
"nbformat": 4,
Expand Down
13 changes: 7 additions & 6 deletions vectors-02-execute-indexing.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
"\n",
"# Define the target directory (change yours)\n",
"target_directory = (\n",
" r\"C:\\temp\\docker\\sharepoint-indexer\\sharepoint-indexing-azure-cognitive-search\"\n",
" r\"C:\\temp\\docker\\sharepoint-indexer\\sharepoint-indexing-azure-ai-search\"\n",
")\n",
"\n",
"# Check if the directory exists\n",
Expand Down Expand Up @@ -70,7 +70,7 @@
"openai.api_type = \"azure\" \n",
"openai.api_version = \"2023-05-15\"\n",
"\n",
"model = os.environ[\"OPEN_API_MODEL\"]\n",
"model = os.environ[\"OPEN_API_DEPLOYMENT_NAME\"]\n",
"\n",
"client = AzureOpenAI(\n",
" api_version=openai.api_version,\n",
Expand Down Expand Up @@ -127,8 +127,9 @@
"\n",
"# Get the Site ID for the specified SharePoint site\n",
"site_id = client_scrapping.get_site_id(\n",
" site_domain=os.environ[\"SITE_DOMAIN\"], site_name=os.environ[\"SITE_NAME\"]\n",
" site_hostname=os.environ[\"SITE_HOSTNAME\"], site_name=os.environ[\"SITE_NAME\"]\n",
")\n",
"print( site_id )\n",
"\n",
"# Get the Drive ID associated with the Site ID\n",
"drive_id = client_scrapping.get_drive_id(site_id)"
Expand Down Expand Up @@ -213,13 +214,13 @@
"\n",
" if folder == '/':\n",
" selected_files_content = client_scrapping.retrieve_sharepoint_files_content(\n",
" site_domain=os.environ[\"SITE_DOMAIN\"],\n",
" site_hostname=os.environ[\"SITE_HOSTNAME\"],\n",
" site_name=os.environ[\"SITE_NAME\"],\n",
" file_formats=[\"docx\", \"pdf\"],\n",
" )\n",
" else:\n",
" selected_files_content = client_scrapping.retrieve_sharepoint_files_content(\n",
" site_domain=os.environ[\"SITE_DOMAIN\"],\n",
" site_hostname=os.environ[\"SITE_HOSTNAME\"],\n",
" site_name=os.environ[\"SITE_NAME\"],\n",
" folder_path=folder,\n",
" file_formats=[\"docx\", \"pdf\"],\n",
Expand Down Expand Up @@ -290,7 +291,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
"version": "3.12.0"
}
},
"nbformat": 4,
Expand Down
4 changes: 2 additions & 2 deletions vectors-03-search-index.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@
"source": [
"# Use the Azure Cognitive Search SDK to perform a search on the index\n",
"# The search_text parameter is set to \"LLMs are the best\"\n",
"results = search_client.search(search_text=\"proposal\")\n",
"results = search_client.search(search_text=\"class\")\n",
"\n",
"# Iterate through the search results\n",
"for result in results:\n",
Expand Down Expand Up @@ -97,7 +97,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
"version": "3.12.0"
}
},
"nbformat": 4,
Expand Down