|
| 1 | +import os |
| 2 | +from azure.core.credentials import AzureKeyCredential |
| 3 | +from azure.search.documents.indexes import SearchIndexClient, SearchIndexerClient |
| 4 | +from azure.search.documents.indexes.models import ( |
| 5 | + SearchIndex, |
| 6 | + SimpleField, |
| 7 | + SearchableField, |
| 8 | + SearchFieldDataType, |
| 9 | + SearchIndexerDataSourceConnection, |
| 10 | + SearchIndexer, |
| 11 | + SearchIndexerDataContainer, |
| 12 | + BlobIndexerDataToExtract, |
| 13 | + BlobIndexerParsingMode, |
| 14 | + FieldMapping, |
| 15 | + FieldMappingFunction |
| 16 | +) |
| 17 | + |
# --- CONFIGURATION ---
# Secrets are read from the environment rather than being embedded in source.
# The admin key and storage account key that were previously committed in this
# file have been exposed in version control and should be rotated.


def _require_env(name):
    """Return the value of environment variable *name*.

    Raises:
        RuntimeError: if the variable is unset or empty, so the script stops
            before any request is sent with a missing credential.
    """
    value = os.environ.get(name)
    if not value:
        raise RuntimeError(f"Missing required environment variable: {name}")
    return value


# The endpoint is not a secret, so the previous value is kept as the default.
search_service_endpoint = os.environ.get(
    "AZURE_SEARCH_SERVICE_ENDPOINT", "https://macae-search.search.windows.net"
)
admin_key = _require_env("AZURE_SEARCH_ADMIN_KEY")

storage_connection_string = _require_env("AZURE_STORAGE_CONNECTION_STRING")
blob_container_name = "rfp-documents"

data_source_name = "clm-rfp-blob-datasource"
index_name = "clm-rfp-index"
indexer_name = "clm-rfp-indexer"

# --- Initialize Clients ---
# Both clients target the same service with the same admin key.
index_client = SearchIndexClient(
    endpoint=search_service_endpoint,
    credential=AzureKeyCredential(admin_key),
)
indexer_client = SearchIndexerClient(
    endpoint=search_service_endpoint,
    credential=AzureKeyCredential(admin_key),
)
| 32 | + |
# --- Define Index ---
# Schema: "id" is the document key, "section" and "text" are searchable
# fields, and "source_csv" is a filterable plain field.
key_field = SimpleField(name="id", type=SearchFieldDataType.String, key=True)
section_field = SearchableField(
    name="section",
    type=SearchFieldDataType.String,
    filterable=True,
    sortable=True,
)
text_field = SearchableField(
    name="text",
    type=SearchFieldDataType.String,
    analyzer_name="en.microsoft",
)
source_csv_field = SimpleField(
    name="source_csv",
    type=SearchFieldDataType.String,
    filterable=True,
)
fields = [key_field, section_field, text_field, source_csv_field]

index = SearchIndex(name=index_name, fields=fields)

# Creation is best-effort: an "already exists" error is reported as a warning,
# any other error as a failure, and the script continues either way.
try:
    index_client.create_index(index)
    index_status = f"✅ Created index: {index_name}"
except Exception as index_error:
    index_status = (
        f"⚠️ Index already exists: {index_name}"
        if "already exists" in str(index_error)
        else f"❌ Failed to create index: {index_error}"
    )
print(index_status)
| 51 | + |
# --- Define Data Source (Blob Storage) ---
# Describes the blob container the indexer reads from.
blob_container = SearchIndexerDataContainer(name=blob_container_name)
data_source = SearchIndexerDataSourceConnection(
    name=data_source_name,
    type="azureblob",
    connection_string=storage_connection_string,
    container=blob_container,
    description="CLM RFP CSVs from Azure Blob Storage",
)

# Creation is best-effort: an "already exists" error is reported as a warning,
# any other error as a failure, and the script continues either way.
try:
    indexer_client.create_data_source_connection(data_source)
    data_source_status = f"✅ Created data source: {data_source_name}"
except Exception as data_source_error:
    data_source_status = (
        f"⚠️ Data source already exists: {data_source_name}"
        if "already exists" in str(data_source_error)
        else f"❌ Failed to create data source: {data_source_error}"
    )
print(data_source_status)
| 69 | + |
# --- Define Indexer (CSV parsing) ---
# Only configuration keys documented for the blob indexer's delimitedText
# parsing mode are sent. The delimiter key is "delimitedTextDelimiter"; the
# previous "delimiter", "contentTypeDetection",
# "detectEncodingFromByteOrderMarks" and "encoding" keys are not documented
# indexer configuration properties, and "documentRoot" is documented for the
# JSON parsing modes only.
# NOTE(review): confirm the key set against the service API version in use.
indexer = SearchIndexer(
    name=indexer_name,
    description="Indexer for CLM RFP CSV data from Blob Storage",
    data_source_name=data_source_name,
    target_index_name=index_name,
    parameters={
        "configuration": {
            "parsingMode": "delimitedText",
            "delimitedTextDelimiter": ",",
            "firstLineContainsHeaders": True,
            "dataToExtract": "contentAndMetadata",
            "failOnUnsupportedContentType": False,
            "indexedFileNameExtensions": ".csv",
        }
    },
    field_mappings=[
        # CSV header columns mapped onto the index fields of the same name.
        FieldMapping(source_field_name="id", target_field_name="id"),
        FieldMapping(source_field_name="section", target_field_name="section"),
        FieldMapping(source_field_name="text", target_field_name="text"),
        # Record which blob each row came from.
        FieldMapping(
            source_field_name="metadata_storage_name",
            target_field_name="source_csv",
        ),
    ],
)

# Creation is best-effort: an "already exists" error is reported as a warning,
# any other error as a failure, and the script continues either way.
try:
    indexer_client.create_indexer(indexer)
    print(f"✅ Created indexer: {indexer_name}")
except Exception as e:
    if "already exists" in str(e):
        print(f"⚠️ Indexer already exists: {indexer_name}")
    else:
        print(f"❌ Failed to create indexer: {e}")
| 108 | + |
# --- Run Indexer ---
# Request an on-demand run; a failure is reported but not re-raised.
try:
    indexer_client.run_indexer(indexer_name)
    run_status = "🚀 Indexer started successfully."
except Exception as run_error:
    run_status = f"❌ Failed to run indexer: {run_error}"
print(run_status)
0 commit comments