Azure AI Document Intelligence (formerly Form Recognizer) is Microsoft's document-AI service. It extracts printed and handwritten text, layout (lines, paragraphs, tables, figures), key-value pairs, and semantic fields from forms, invoices, receipts, IDs, and contracts. Unlike a pure OCR engine, it preserves document structure and provides prebuilt models for common document types — the Azure counterpart to AWS Textract.
import os

from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest, ContentFormat
from azure.core.credentials import AzureKeyCredential
# Build the Document Intelligence client used by every example below.
# The API key comes from the environment instead of a literal: an empty or
# placeholder key is accepted by AzureKeyCredential and only fails later as an
# authentication error on the first request, whereas os.environ[...] fails
# fast with a KeyError naming the missing variable. It also keeps the secret
# out of source control.
client = DocumentIntelligenceClient(
    endpoint="https://myco-docintel.cognitiveservices.azure.com/",
    credential=AzureKeyCredential(os.environ["DOCUMENTINTELLIGENCE_API_KEY"]),
)
# Layout analysis of a local PDF, with the result rendered as Markdown.
# Read the file up front so the handle is only held for the read itself.
with open("contract.pdf", "rb") as pdf_file:
    contract_bytes = pdf_file.read()

layout_request = AnalyzeDocumentRequest(bytes_source=contract_bytes)
poller = client.begin_analyze_document(
    "prebuilt-layout",
    layout_request,
    output_content_format=ContentFormat.MARKDOWN,
)

# Block until the long-running analysis completes.
result = poller.result()

# Show only the first 2000 characters of the output.
markdown_preview = result.content[:2000]
print(markdown_preview)  # Markdown with headings, tables, etc.
# Extract structured fields from an invoice with the prebuilt invoice model.
# The service only returns the fields it actually found, so every lookup
# below tolerates a missing field instead of raising KeyError/AttributeError.


def _string_of(field):
    """Return ``field.value_string``, or None when the field is absent."""
    return field.value_string if field is not None else None


def _amount_of(field):
    """Return ``field.value_currency.amount``, or None when the field or its
    currency value is absent."""
    currency = field.value_currency if field is not None else None
    return currency.amount if currency is not None else None


poller = client.begin_analyze_document(
    "prebuilt-invoice",
    AnalyzeDocumentRequest(url_source="https://myco.blob.core.windows.net/inv/inv-0421.pdf"),
)
analyzed_documents = poller.result().documents or []
if not analyzed_documents:
    # Fail with a clear message rather than an opaque IndexError/TypeError.
    raise ValueError("prebuilt-invoice returned no documents for the given source")
invoice = analyzed_documents[0]
invoice_fields = invoice.fields or {}

print("Vendor:", _string_of(invoice_fields.get("VendorName")))
print("Total :", _amount_of(invoice_fields.get("InvoiceTotal")))

# Compare against None explicitly: a plain dict default such as
# `.get("Items", {})` has no `.value_array`, and field objects should not be
# tested for truthiness.
items_field = invoice_fields.get("Items")
line_items = (items_field.value_array if items_field is not None else None) or []
for item in line_items:
    item_fields = item.value_object or {}
    desc = _string_of(item_fields.get("Description"))
    amt = _amount_of(item_fields.get("Amount"))
    print(f" - {desc}: {amt}")
# Ad-hoc field extraction: run the layout model with the query-fields add-on
# and ask for three named fields from a claim form.
claim_request = AnalyzeDocumentRequest(
    url_source="https://myco.blob.core.windows.net/claims/claim-88.pdf"
)
requested_fields = ["PolicyNumber", "DateOfIncident", "ClaimedAmount"]

poller = client.begin_analyze_document(
    "prebuilt-layout",
    claim_request,
    features=["queryFields"],
    query_fields=requested_fields,
)
result = poller.result()

# Print each extracted field name alongside its content.
for doc in result.documents:
    extracted = doc.fields
    for name in extracted:
        print(name, "->", extracted[name].content)
The typical pattern for document-heavy RAG: Document Intelligence → chunks of Markdown → Azure OpenAI embeddings → Azure AI Search index.
# 1) Parse PDF to Markdown via Document Intelligence Layout
md = client.begin_analyze_document(
    "prebuilt-layout",
    AnalyzeDocumentRequest(bytes_source=pdf_bytes),
    output_content_format=ContentFormat.MARKDOWN,
).result().content

# 2) Chunk by headings / token count (use langchain MarkdownHeaderTextSplitter, etc.)
# Materialize as a list: the chunks are sliced for embedding and then zipped
# with the resulting vectors, so they must be re-iterable.
chunks = list(chunk_markdown(md, max_tokens=500))

# 3) Embed chunks with Azure OpenAI
from openai import AzureOpenAI

aoai = AzureOpenAI(azure_endpoint="...", api_key="...", api_version="2024-10-21")

# Send the chunks in bounded batches instead of one request: the embeddings
# endpoint limits how much input a single call may carry, and an empty input
# list is rejected. 256 is a conservative choice -- tune it to the limits of
# the deployment in use.
EMBED_BATCH_SIZE = 256
vectors = []
for start in range(0, len(chunks), EMBED_BATCH_SIZE):
    batch = chunks[start:start + EMBED_BATCH_SIZE]
    response = aoai.embeddings.create(model="embedding-3-large-prod", input=batch)
    vectors.extend(response.data)

# 4) Upload to Azure AI Search (see the AI Search page for index schema)
# NOTE(review): ids are purely positional ("doc-0", "doc-1", ...), so running
# this for a second PDF against the same index would reuse the same keys --
# prefix the id with a per-document key if several PDFs share an index.
documents = [
    {"id": f"doc-{i}", "content": c, "content_vector": v.embedding}
    for i, (c, v) in enumerate(zip(chunks, vectors))
]
if documents:  # nothing to index when the PDF produced no chunks
    upload_results = search_client.upload_documents(documents)
    # upload_documents reports success per document; surface any failures
    # instead of silently dropping them.
    failed_keys = [r.key for r in upload_results if not r.succeeded]
    if failed_keys:
        print("Failed to index:", failed_keys)