@@ -260,10 +260,10 @@ def data_index_acs_pipeline(
     input_data: Input,
     embeddings_model: str,
     acs_config: str,
-    acs_connection_id: Optional[str],
-    aoai_connection_id: Optional[str],
+    acs_connection_id: str,
+    aoai_connection_id: str,
     embeddings_container: Input,
-    chunk_size: Optional[int] = 768,
+    chunk_size: int = 768,
     chunk_overlap: Optional[int] = 0,
     input_glob: Optional[str] = "**/*",
     citation_url: Optional[str] = None,
@@ -279,28 +279,26 @@ def data_index_acs_pipeline(
     :param acs_config: The configuration for the Azure Cognitive Search index.
     :type acs_config: str
     :param acs_connection_id: The connection ID for the Azure Cognitive Search index.
-    :type acs_connection_id: Optional[str]
+    :type acs_connection_id: str
     :param chunk_size: The size of the chunks to break the input data into.
-    :type chunk_size: Optional[int]
+    :type chunk_size: int
     :param chunk_overlap: The number of tokens to overlap between chunks.
     :type chunk_overlap: Optional[int]
     :param input_glob: The glob pattern to use when searching for input data.
-    :type input_glob: Optional[str]s
+    :type input_glob: Optional[str]
     :param citation_url: The URL to use when generating citations for the input data.
     :type citation_url: str
     :param citation_replacement_regex: The regex to use when generating citations for the input data.
     :type citation_replacement_regex: str
     :param aoai_connection_id: The connection ID for the Azure Open AI service.
-    :type aoai_connection_id: Optional[str]
+    :type aoai_connection_id: str
     :param embeddings_container: The container to use when caching embeddings.
     :type embeddings_container: Input
     :return: The URI of the generated Azure Cognitive Search index.
     :rtype: str.
     """
     if input_glob is None:
         input_glob = "**/*"
-    if chunk_size is None:
-        chunk_size = 768
     if chunk_overlap is None:
         chunk_overlap = 0

@@ -361,7 +359,7 @@ def data_index_acs_pipeline(
     component = data_index_acs_pipeline(
         input_data=input_data,
         input_glob=data_index.source.input_glob,
-        chunk_size=data_index.source.chunk_size,
+        chunk_size=data_index.source.chunk_size,  # type: ignore[arg-type]
         chunk_overlap=data_index.source.chunk_overlap,
         citation_url=data_index.source.citation_url,
         citation_replacement_regex=json.dumps(data_index.source.citation_url_replacement_regex._to_dict())
@@ -422,11 +420,11 @@ def data_index_faiss_pipeline(
     input_data: Input,
     embeddings_model: str,
     embeddings_container: Input,
-    chunk_size: Optional[int] = 1024,
-    data_source_glob: Optional[str] = None,
-    data_source_url: Optional[str] = None,
-    document_path_replacement_regex: Optional[str] = None,
-    aoai_connection_id: Optional[str] = None,
+    chunk_size: int = 1024,
+    data_source_glob: str = None,  # type: ignore[assignment]
+    data_source_url: str = None,  # type: ignore[assignment]
+    document_path_replacement_regex: str = None,  # type: ignore[assignment]
+    aoai_connection_id: str = None,  # type: ignore[assignment]
 ):
     """
     Generate embeddings for a `input_data` source and create a Faiss index from them.
@@ -508,10 +506,10 @@ def data_index_faiss_pipeline(
     component = data_index_faiss_pipeline(
         input_data=input_data,
         embeddings_model=build_model_protocol(data_index.embedding.model),
-        chunk_size=data_index.source.chunk_size,
-        data_source_glob=data_index.source.input_glob,
-        data_source_url=data_index.source.citation_url,
-        document_path_replacement_regex=json.dumps(data_index.source.citation_url_replacement_regex._to_dict())
+        chunk_size=data_index.source.chunk_size,  # type: ignore[arg-type]
+        data_source_glob=data_index.source.input_glob,  # type: ignore[arg-type]
+        data_source_url=data_index.source.citation_url,  # type: ignore[arg-type]
+        document_path_replacement_regex=json.dumps(data_index.source.citation_url_replacement_regex._to_dict())  # type: ignore[arg-type]
         if data_index.source.citation_url_replacement_regex
         else None,
         aoai_connection_id=_resolve_connection_id(ml_client, data_index.embedding.connection),
@@ -563,13 +561,13 @@ def data_index_acs_pipeline(
     input_data: Input,
     embeddings_model: str,
     acs_config: str,
-    acs_connection_id: Optional[str],
+    acs_connection_id: str,
     embeddings_container: Input,
-    chunk_size: Optional[int] = 1024,
-    data_source_glob: Optional[str] = None,
-    data_source_url: Optional[str] = None,
-    document_path_replacement_regex: Optional[str] = None,
-    aoai_connection_id: Optional[str] = None,
+    chunk_size: int = 1024,
+    data_source_glob: str = None,  # type: ignore[assignment]
+    data_source_url: str = None,  # type: ignore[assignment]
+    document_path_replacement_regex: str = None,  # type: ignore[assignment]
+    aoai_connection_id: str = None,  # type: ignore[assignment]
 ):
     """
     Generate embeddings for a `input_data` source and push them into an Azure Cognitive Search index.
@@ -666,10 +664,10 @@ def data_index_acs_pipeline(
         embeddings_model=build_model_protocol(data_index.embedding.model),
         acs_config=json.dumps(acs_config),
         acs_connection_id=_resolve_connection_id(ml_client, data_index.index.connection),
-        chunk_size=data_index.source.chunk_size,
-        data_source_glob=data_index.source.input_glob,
-        data_source_url=data_index.source.citation_url,
-        document_path_replacement_regex=json.dumps(data_index.source.citation_url_replacement_regex._to_dict())
+        chunk_size=data_index.source.chunk_size,  # type: ignore[arg-type]
+        data_source_glob=data_index.source.input_glob,  # type: ignore[arg-type]
+        data_source_url=data_index.source.citation_url,  # type: ignore[arg-type]
+        document_path_replacement_regex=json.dumps(data_index.source.citation_url_replacement_regex._to_dict())  # type: ignore[arg-type]
         if data_index.source.citation_url_replacement_regex
         else None,
         aoai_connection_id=_resolve_connection_id(ml_client, data_index.embedding.connection),
@@ -754,9 +752,9 @@ def get_component_obj(ml_client, component_uri):
     return component_obj


-def _resolve_connection_id(ml_client, connection: Optional[Union[str, WorkspaceConnection]] = None) -> Optional[str]:
+def _resolve_connection_id(ml_client, connection: Optional[Union[str, WorkspaceConnection]] = None) -> str:
     if connection is None:
-        return None
+        return ""

     if isinstance(connection, str):
         short_form = re.match(r"azureml:(?P<connection_name>[^/]*)", connection)
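
Below the diff, a minimal standalone sketch of the behavioural change to `_resolve_connection_id`: with the new `-> str` annotation, a missing connection resolves to an empty string rather than `None`, which is what lets the call sites above hand its result to parameters now typed as plain `str`. The `WorkspaceConnection` stand-in and the helper name here are illustrative assumptions, not the library's implementation.

# Standalone sketch (not the library's code): mirrors the change above, where a missing
# connection now resolves to "" instead of None so the return type is always str.
from typing import Optional, Union


class WorkspaceConnection:  # minimal stand-in for the real entity class (assumption)
    def __init__(self, connection_id: str):
        self.id = connection_id


def resolve_connection_id_sketch(connection: Optional[Union[str, WorkspaceConnection]] = None) -> str:
    if connection is None:
        return ""  # previously returned None
    if isinstance(connection, WorkspaceConnection):
        return connection.id
    return connection  # assume a pre-resolved connection id was passed as a string


assert resolve_connection_id_sketch(None) == ""
assert resolve_connection_id_sketch(WorkspaceConnection("my-conn-id")) == "my-conn-id"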