diff --git a/gbb_ai/sharepoint_data_extractor.py b/gbb_ai/sharepoint_data_extractor.py index 44f21a3..2e92236 100644 --- a/gbb_ai/sharepoint_data_extractor.py +++ b/gbb_ai/sharepoint_data_extractor.py @@ -729,3 +729,57 @@ def _format_metadata( "read_access_entity": users_by_role, } return formatted_metadata + + def get_all_site_pages(self, site_id: str) -> List[Dict[str, Any]]: + """ + Retrieves all the site pages from a given SharePoint site. + + :param site_id: The site ID in Microsoft Graph. + :return: A list of dictionaries containing information about each page. + """ + url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/pages" + try: + pages = self._make_ms_graph_request(url) + return pages.get("value", []) + except Exception as err: + logger.error(f"Error retrieving site pages: {err}") + return [] + + def _get_page_content(self, site_id: str, page_id: str) -> Optional[Dict[str, Any]]: + """ + Retrieves the content of a specific site page using the page ID. + + :param site_id: The site ID in Microsoft Graph. + :param page_id: The ID of the page to retrieve content from. + :return: A dictionary containing the page content, including canvas layout. + """ + url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/pages/{page_id}/microsoft.graph.sitePage?$expand=canvasLayout" + try: + page_content = self._make_ms_graph_request(url) + return page_content + except Exception as err: + logger.error(f"Error retrieving page content: {err}") + return None + + def retrieve_and_process_site_pages(self, site_id: str) -> List[Dict[str, Any]]: + """ + Retrieves all site pages and processes each page's content. + + :param site_id: The site ID in Microsoft Graph. + :return: A list of processed pages with their content. 
+ """ + all_pages = self.get_all_site_pages(site_id) + processed_pages = [] + + for page in all_pages: + page_id = page.get("id") + if page_id: + page_content = self._get_page_content(site_id, page_id) + if page_content: + # Here you can process the page content, e.g., chunking, etc. + processed_pages.append({ + "page_id": page_id, + "content": page_content + }) + + return processed_pages \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 030ece6..d27ed94 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,12 +2,12 @@ requests>=2,<3 msal>=0.6.1,<2 python-docx python-dotenv -#azure_search_documents==11.4.0b11 -azure-search-documents==11.4.0b8 +azure_search_documents==11.4.0b11 +#azure-search-documents==11.4.0b8 azure-ai-formrecognizer -openai==0.27.10 langchain tiktoken PyPDF2 openai==1.5.0 tenacity +bs4=0.0.2 \ No newline at end of file diff --git a/vectors-01-create-index.ipynb b/vectors-01-create-index.ipynb index d110e7c..3b47c87 100644 --- a/vectors-01-create-index.ipynb +++ b/vectors-01-create-index.ipynb @@ -18,22 +18,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import os\n", "import requests\n", "from azure.core.credentials import AzureKeyCredential \n", "from azure.search.documents import SearchClient \n", "from azure.search.documents.indexes import SearchIndexClient \n", - "from azure.search.documents.models import (\n", - " RawVectorQuery,\n", - ")\n", "from azure.search.documents.indexes.models import ( \n", " CorsOptions,\n", - " ExhaustiveKnnParameters, \n", - " ExhaustiveKnnVectorSearchAlgorithmConfiguration,\n", " HnswParameters, \n", " HnswVectorSearchAlgorithmConfiguration,\n", " SimpleField,\n", @@ -41,9 +47,9 @@ " ComplexField,\n", " SearchFieldDataType, \n", " SearchIndex, 
\n", - " VectorSearch, \n", - " VectorSearchAlgorithmKind, \n", - " VectorSearchProfile, \n", + " VectorSearch,\n", + " VectorSearchAlgorithmKind,\n", + " VectorSearchProfile,\n", ")\n", " \n", "\n", @@ -65,7 +71,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -87,9 +93,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index sharepoint-site deleted\n" + ] + } + ], "source": [ "# Delete the index if it exists\n", "try:\n", @@ -101,9 +115,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index sharepoint-site created\n" + ] + } + ], "source": [ "# Create the index\n", "fields = [\n", @@ -182,13 +204,33 @@ " collection=True,\n", " fields=[SimpleField(name=\"list_item\", type=SearchFieldDataType.String, searchable=True, filterable=True,)],\n", " searchable=True),\n", - "\n", + " # Security field as collection of strings, filterable, not retrievable\n", + " SimpleField(\n", + " name=\"security\",\n", + " type=SearchFieldDataType.Collection(SearchFieldDataType.String),\n", + " filterable=True,\n", + " retrievable=False, # Ensures the field is not returned in search results\n", + " ),\n", + " # Allowed users field\n", + " SimpleField(\n", + " name=\"allowedUsers\",\n", + " type=SearchFieldDataType.Collection(SearchFieldDataType.String),\n", + " filterable=True,\n", + " retrievable=False, # Ensures this field is not returned in search results\n", + " ),\n", + " # Allowed groups field\n", + " SimpleField(\n", + " name=\"allowedGroups\",\n", + " type=SearchFieldDataType.Collection(SearchFieldDataType.String),\n", + " filterable=True,\n", + " retrievable=False, # Ensures this field is not returned in 
search results\n", + " ),\n", "]\n", + "\n", "cors_options = CorsOptions(allowed_origins=[\"*\"], max_age_in_seconds=60)\n", "scoring_profiles = []\n", "suggester = [{\"name\": \"sg\", \"source_fields\": [\"name\"]}]\n", "\n", - "\n", "# Configure the vector search configuration \n", "vector_search = VectorSearch( \n", " algorithms=[ \n", @@ -203,13 +245,13 @@ " ), \n", " )\n", " ], \n", - " profiles=[ \n", + " profiles=[ \n", " VectorSearchProfile( \n", " name=\"myHnswProfile\", \n", " algorithm=\"myHnsw\", \n", " ), \n", " ], \n", - ") \n", + ")\n", "\n", "index = SearchIndex(\n", " name=os.environ[\"SEARCH_INDEX_NAME\"],\n", @@ -226,13 +268,6 @@ "except Exception as ex:\n", " print(ex)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -251,7 +286,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.9.19" } }, "nbformat": 4, diff --git a/vectors-02-execute-indexing.ipynb b/vectors-02-execute-indexing.ipynb index 1906f99..55fe280 100644 --- a/vectors-02-execute-indexing.ipynb +++ b/vectors-02-execute-indexing.ipynb @@ -10,9 +10,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Directory changed to /Users/marcjimz/Documents/Development/sharepoint-indexing-azure-cognitive-search\n" + ] + } + ], "source": [ "import os\n", "import json\n", @@ -32,7 +40,7 @@ "\n", "# Define the target directory (change yours)\n", "target_directory = (\n", - " r\"C:\\temp\\docker\\sharepoint-indexer\\sharepoint-indexing-azure-cognitive-search\"\n", + " os.getcwd()\n", ")\n", "\n", "# Check if the directory exists\n", @@ -46,7 +54,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -94,7 +102,7 @@ }, { "cell_type": 
"code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -115,9 +123,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-09-03 22:11:03,595 - micro - MainProcess - INFO Successfully loaded environment variables: TENANT_ID, CLIENT_ID, CLIENT_SECRET (sharepoint_data_extractor.py:load_environment_variables_from_env_file:86)\n", + "2024-09-03 22:11:03,981 - micro - MainProcess - INFO New access token retrieved. (sharepoint_data_extractor.py:msgraph_auth:118)\n", + "2024-09-03 22:11:03,982 - micro - MainProcess - INFO Getting the Site ID... (sharepoint_data_extractor.py:get_site_id:187)\n", + "2024-09-03 22:11:04,318 - micro - MainProcess - INFO Site ID retrieved: 30z44s.sharepoint.com,4303930e-50c4-467a-ac6e-2128d74f3554,6828085b-3888-432a-baa6-225475f35b6b (sharepoint_data_extractor.py:get_site_id:191)\n", + "2024-09-03 22:11:04,738 - micro - MainProcess - INFO Successfully retrieved drive ID: b!DpMDQ8RQekasbiEo1081VFsIKGiIOCpDuqYiVHXzW2vabjOBwPZiQ4_E_CuTBjAI (sharepoint_data_extractor.py:get_drive_id:208)\n" + ] + } + ], "source": [ "# Load environment variables from the .env file\n", "client_scrapping.load_environment_variables_from_env_file()\n", @@ -136,7 +156,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -178,18 +198,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ - "# generate_embeddings('test')" + "#generate_embeddings('test')" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Getting all folders in SharePoint site...\n" + ] + } + ], "source": [ "# Use the access token 
to get the folders \n", "print ('Getting all folders in SharePoint site...')\n", @@ -199,9 +227,63 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'https://graph.microsoft.com/v1.0/sites/30z44s.sharepoint.com,4303930e-50c4-467a-ac6e-2128d74f3554,6828085b-3888-432a-baa6-225475f35b6b/drive/root/children'" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "root_url" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-09-03 22:11:05,217 - micro - MainProcess - INFO Getting the Site ID... (sharepoint_data_extractor.py:get_site_id:187)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing folder /...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-09-03 22:11:05,527 - micro - MainProcess - INFO Site ID retrieved: 30z44s.sharepoint.com,4303930e-50c4-467a-ac6e-2128d74f3554,6828085b-3888-432a-baa6-225475f35b6b (sharepoint_data_extractor.py:get_site_id:191)\n", + "2024-09-03 22:11:05,922 - micro - MainProcess - INFO Successfully retrieved drive ID: b!DpMDQ8RQekasbiEo1081VFsIKGiIOCpDuqYiVHXzW2vabjOBwPZiQ4_E_CuTBjAI (sharepoint_data_extractor.py:get_drive_id:208)\n", + "2024-09-03 22:11:05,923 - micro - MainProcess - INFO Making request to Microsoft Graph API (sharepoint_data_extractor.py:get_files_in_site:247)\n", + "2024-09-03 22:11:06,136 - micro - MainProcess - INFO Received response from Microsoft Graph API (sharepoint_data_extractor.py:get_files_in_site:250)\n", + "2024-09-03 22:11:06,137 - micro - MainProcess - ERROR No files found in the site's drive (sharepoint_data_extractor.py:retrieve_sharepoint_files_content:536)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "No documents found in 
from typing import Any, Dict, List
from bs4 import BeautifulSoup


def extract_text_from_webparts(webparts: List[Dict[str, Any]]) -> str:
    """
    Extract and concatenate the text of a list of webparts, stripping HTML.

    :param webparts: Webpart dicts that may carry an ``innerHtml`` field.
    :return: The concatenated plain text with all HTML tags removed.
    """
    fragments = []
    for webpart in webparts:
        html_content = webpart.get("innerHtml")
        # Guard against missing, None, or empty innerHtml before parsing.
        if html_content:
            # Use BeautifulSoup to strip HTML tags; get_text flattens markup.
            soup = BeautifulSoup(html_content, "html.parser")
            fragments.append(soup.get_text(separator=" ", strip=True))
    return " ".join(fragments).strip()


def extract_text_from_canvas_layout(canvas_layout: Dict[str, Any]) -> str:
    """
    Extract all text from a SharePoint page canvasLayout.

    Walks horizontalSections -> columns -> webparts, and also the optional
    verticalSection, concatenating the tag-stripped text of every webpart.

    :param canvas_layout: The canvasLayout object from a SharePoint page.
    :return: All extracted text with HTML removed, single-space separated.
    """
    fragments = []
    for section in canvas_layout.get("horizontalSections", []):
        for column in section.get("columns", []):
            fragments.append(extract_text_from_webparts(column.get("webparts", [])))
    # NOTE(review): Graph pages may also keep content in a single
    # verticalSection; the original walk skipped it, which is consistent with
    # the recorded "No text extracted from canvas layout" for some pages.
    vertical_section = canvas_layout.get("verticalSection")
    if vertical_section:
        fragments.append(extract_text_from_webparts(vertical_section.get("webparts", [])))
    return " ".join(fragments).strip()
9883ffad-80a7-4038-899a-542aa6f77e39: No text extracted from canvas layout.\n", + "[Page 3/4] Processing Page ID 6a8b2d03-d26c-44fc-972a-c52cb449aeb1...\n", + "[Page 3/4] Extracted text length: 4050 characters.\n", + "[Page 3/4] Split into 1 chunks.\n", + "[Page 3/4][Chunk 1/1] Processing chunk of size 4050 characters.\n", + "[Page 4/4] Processing Page ID 2955533b-8c34-44db-b23b-e90fda2f80e5...\n", + "[Page 4/4] Page ID 2955533b-8c34-44db-b23b-e90fda2f80e5: No text extracted from canvas layout.\n", + "Total Documents ready for upload: 2\n", + "Uploading batch of 2 documents...\n", + "Error during multiple documents upload: 'IndexingResult' object is not subscriptable\n", + "Total Documents Uploaded: 2.\n" + ] + } + ], + "source": [ + "total_docs_uploaded = 0\n", + "\n", + "# Retrieve and process site pages\n", + "processed_pages_content = client_scrapping.retrieve_and_process_site_pages(site_id)\n", + "\n", + "if not processed_pages_content:\n", + " print(\"No pages found in the site.\")\n", + "else:\n", + " chunked_content_docs = []\n", + " \n", + " print(f\"Processing {len(processed_pages_content)} pages from the site...\")\n", + "\n", + " # Iterate through the processed pages content\n", + " for page_num, page in enumerate(processed_pages_content, start=1):\n", + " page_id = page.get(\"page_id\")\n", + " canvas_layout = page.get(\"content\", {}).get(\"canvasLayout\", {})\n", + "\n", + " if not canvas_layout:\n", + " print(f\"[Page {page_num}/{len(processed_pages_content)}] Page ID {page_id}: No content found.\")\n", + " continue\n", + " \n", + " print(f\"[Page {page_num}/{len(processed_pages_content)}] Processing Page ID {page_id}...\")\n", + "\n", + " # Extract all text from the canvasLayout, removing HTML\n", + " page_text_content = extract_text_from_canvas_layout(canvas_layout)\n", + "\n", + " if not page_text_content:\n", + " print(f\"[Page {page_num}/{len(processed_pages_content)}] Page ID {page_id}: No text extracted from canvas layout.\")\n", + " 
continue\n", + " \n", + " print(f\"[Page {page_num}/{len(processed_pages_content)}] Extracted text length: {len(page_text_content)} characters.\")\n", + " \n", + " # Now chunk the text content - you can bring your own text splitter here and chunk accordingly!\n", + " chunked_content = text_splitter.split_text(page_text_content)\n", + " print(f\"[Page {page_num}/{len(processed_pages_content)}] Split into {len(chunked_content)} chunks.\")\n", + " \n", + " chunk_counter = 0\n", + "\n", + " # Iterate through the chunks and create the chunked content docs\n", + " for chunk in chunked_content:\n", + " print(f\"[Page {page_num}/{len(processed_pages_content)}][Chunk {chunk_counter + 1}/{len(chunked_content)}] Processing chunk of size {len(chunk)} characters.\")\n", + "\n", + " json_data = {\n", + " \"id\": page_id + \"-\" + str(chunk_counter), # Create a unique chunk ID\n", + " \"content\": chunk, # Chunked content\n", + " \"contentVector\": generate_embeddings(chunk), # Embeddings for the chunk\n", + " \"doc_id\": page_id, # Original page ID\n", + " \"chunk_id\": chunk_counter # Chunk counter\n", + " }\n", + " chunked_content_docs.append(json_data)\n", + " chunk_counter += 1\n", + "\n", + " # Calculate total documents to upload\n", + " total_docs = len(chunked_content_docs)\n", + " total_docs_uploaded += total_docs\n", + " print(f\"Total Documents ready for upload: {total_docs}\")\n", + "\n", + " # Upload the documents in chunks\n", + " for documents_chunk in divide_chunks(chunked_content_docs, n):\n", + " try:\n", + " print(f\"Uploading batch of {len(documents_chunk)} documents...\")\n", + " result = search_client.upload_documents(documents=documents_chunk)\n", + " # Print the result for each document\n", + " for res in result:\n", + " print(f\"Upload of document {res.key} succeeded: {res.succeeded}\")\n", + " except Exception as ex:\n", + " print(f\"Error during multiple documents upload: {ex}\")\n", + "\n", + "print(f\"Total Documents Uploaded:
{total_docs_uploaded}.\")" + ] } ], "metadata": { @@ -290,7 +545,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.9.19" } }, "nbformat": 4,