Skip to content

sample_rag_langchain.ipynb : "message": "The file is corrupted or format is unsupported. Refer to documentation for the list of supported formats." #45

@aantonellims

Description

@aantonellims

Hello,

I'm facing an issue:
I first used the sample_figure_understanding.ipynb notebook and got markdown files generated.
Then I used the sample_rag_langchain.ipynb notebook to split and index my md file into Azure Search, but I am facing the following issue:


HttpResponseError Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_25944\1031862820.py in ?()
22
23 # Initiate Azure AI Document Intelligence to load the document
24 loader = AzureAIDocumentIntelligenceLoader(file_path=file_path, api_key = doc_intelligence_key, api_endpoint = doc_intelligence_endpoint, api_model="prebuilt-layout")
25
---> 26 docs = loader.load()
27
28 # Assuming each file contains a single document for simplicity
29 docs_string = docs[0].page_content

c:\Python311\Lib\site-packages\langchain_core\document_loaders\base.py in ?(self)
28 def load(self) -> List[Document]:
29 """Load data into Document objects."""
---> 30 return list(self.lazy_load())

c:\Python311\Lib\site-packages\langchain_community\document_loaders\doc_intelligence.py in ?(self)
92 ) -> Iterator[Document]:
93 """Lazy load given path as pages."""
94 if self.file_path is not None:
95 blob = Blob.from_path(self.file_path) # type: ignore[attr-defined]
---> 96 yield from self.parser.parse(blob)
97 else:
98 yield from self.parser.parse_url(self.url_path) # type: ignore[arg-type]

c:\Python311\Lib\site-packages\langchain_core\document_loaders\base.py in ?(self, blob)
122
123 Returns:
124 List of documents
125 """
--> 126 return list(self.lazy_parse(blob))

c:\Python311\Lib\site-packages\langchain_community\document_loaders\parsers\doc_intelligence.py in ?(self, blob)
76 def lazy_parse(self, blob: Blob) -> Iterator[Document]:
77 """Lazily parse the blob."""
78
79 with blob.as_bytes_io() as file_obj:
---> 80 poller = self.client.begin_analyze_document(
81 self.api_model,
82 file_obj,
83 content_type="application/octet-stream",

c:\Python311\Lib\site-packages\azure\core\tracing\decorator.py in ?(*args, **kwargs)
74 passed_in_parent = kwargs.pop("parent_span", None)
75
76 span_impl_type = settings.tracing_implementation()
77 if span_impl_type is None:
---> 78 return func(*args, **kwargs)
79
80 # Merge span is parameter is set, but only if no explicit parent are passed
81 if merge_span and not passed_in_parent:

c:\Python311\Lib\site-packages\azure\ai\documentintelligence\_operations\_operations.py in ?(self, model_id, analyze_request, pages, locale, string_index_type, features, query_fields, output_content_format, **kwargs)
3623 polling: Union[bool, PollingMethod] = kwargs.pop("polling", True)
3624 lro_delay = kwargs.pop("polling_interval", self._config.polling_interval)
3625 cont_token: Optional[str] = kwargs.pop("continuation_token", None)
3626 if cont_token is None:
-> 3627 raw_result = self._analyze_document_initial( # type: ignore
3628 model_id=model_id,
3629 analyze_request=analyze_request,
3630 pages=pages,

c:\Python311\Lib\site-packages\azure\ai\documentintelligence\_operations\_operations.py in ?(self, model_id, analyze_request, pages, locale, string_index_type, features, query_fields, output_content_format, **kwargs)
514 if _stream:
515 response.read() # Load the body in memory and close the socket
516 map_error(status_code=response.status_code, response=response, error_map=error_map)
517 error = _deserialize(_models.ErrorResponse, response.json())
--> 518 raise HttpResponseError(response=response, model=error)
519
520 response_headers = {}
521 response_headers["Retry-After"] = self._deserialize("int", response.headers.get("Retry-After"))

HttpResponseError: (InvalidRequest) Invalid request.
Code: InvalidRequest
Message: Invalid request.
Inner error: {
"code": "InvalidContent",
"message": "The file is corrupted or format is unsupported. Refer to documentation for the list of supported formats."
}

When I look at the generated markdown file, I can see that titles (#) are represented by "===".

I tried to make the change manually, but I am still facing the same issue. Can anybody help?

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions