From 9a3cca2928e25e96e92a7fb712f51cb13d20b3af Mon Sep 17 00:00:00 2001 From: nickwinder Date: Wed, 27 May 2026 19:24:15 +1200 Subject: [PATCH 1/6] feat: add client.parse() for the Data Extraction API (/extraction/parse) Adds first-class support for the Data Extraction API on NutrientClient. Covers all four processing modes (text, structure, understand, agentic) and both output shapes (spatial elements and whole-document Markdown). The response surface is a fully typed ParseResponse TypedDict with a discriminated union of element variants (paragraph, table, formula, picture, keyValueRegion, handwriting) so callers can narrow on `type`. The Data Extraction API is billed against extraction credits, which are a separate billing bucket from the processor API credits consumed by the other endpoints used by this client (Build, sign, OCR, watermarking, etc.). Docstrings, README, and changelog make that distinction explicit so callers do not conflate the two buckets. Verification: - 16 new unit tests in tests/unit/test_parse.py (request shape per mode, response parsing, error propagation for 401 / 400 / 402 / 500). - mypy strict and ruff clean on src/. Endpoint surface (httpx-multipart): POST /extraction/parse with a 'file' part and an optional 'instructions' part carrying the JSON {mode, output:{format}} body. Extends the existing send_request infra (RequestConfig + TypeGuard + overload) without churn to existing endpoint paths. 
--- CHANGELOG.md | 17 ++ README.md | 50 +++++ src/nutrient_dws/__init__.py | 34 +++ src/nutrient_dws/client.py | 100 +++++++++ src/nutrient_dws/http.py | 46 +++- src/nutrient_dws/types/parse.py | 342 ++++++++++++++++++++++++++++ tests/unit/test_parse.py | 387 ++++++++++++++++++++++++++++++++ 7 files changed, 975 insertions(+), 1 deletion(-) create mode 100644 src/nutrient_dws/types/parse.py create mode 100644 tests/unit/test_parse.py diff --git a/CHANGELOG.md b/CHANGELOG.md index e6fcf67..66987b5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,23 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- `client.parse()` — first-class support for the Data Extraction API + (`/extraction/parse`). Supports all four processing modes (`text`, + `structure`, `understand`, `agentic`) and both output shapes (spatial + elements and whole-document Markdown). Typed response model with + discriminated element variants (paragraph, table, formula, picture, + keyValueRegion, handwriting). Billed against **extraction credits**, a + separate billing bucket from the **processor API credits** used by the + other endpoints. +- New types exported from `nutrient_dws`: `ParseResponse`, + `ParseInstructions`, `ParseMode`, `ParseOutputFormat`, `ParseElement`, + `ParseOutputBody`, `ParseOutputElements`, `ParseOutputMarkdown`, + `ParagraphElement`, `TableElement`, `TableCell`, `FormulaElement`, + `PictureElement`, `KeyValueRegionElement`, `KeyValuePair`, + `HandwritingElement`. + ## [3.0.0] - 2026-01-30 ### Security diff --git a/README.md b/README.md index edb7b8c..bc6795b 100644 --- a/README.md +++ b/README.md @@ -88,6 +88,56 @@ asyncio.run(main()) For a complete list of available methods with examples, see the [Methods Documentation](docs/METHODS.md). +## Data Extraction (`/extraction/parse`) + +`client.parse()` calls the Data Extraction API to extract structured content from +a document. 
It supports four processing modes and two output shapes: + +```python +import asyncio +from nutrient_dws import NutrientClient + +async def main(): + client = NutrientClient(api_key='your_api_key') + + # Spatial elements (the default output format) — paragraphs, tables, formulas, pictures, etc. + response = await client.parse('contract.pdf', mode='understand') + for element in response['output']['elements']: + if element['type'] == 'table': + print(element['rowCount'], element['columnCount']) + + # Whole-document Markdown from a born-digital PDF + response = await client.parse( + 'report.pdf', mode='text', output_format='markdown', + ) + print(response['output']['markdown']) + +asyncio.run(main()) +``` + +### Modes and credit cost + +| Mode | Extraction credits / page | When to use | +|--------------|---------------------------|------------------------------------------------------------------| +| `text` | 1 | Fast Markdown from born-digital documents. No OCR or AI. | +| `structure` | 1.5 | OCR-based spatial extraction with bounding boxes. Default mode. | +| `understand` | 9 | AI-augmented layout analysis, tables, formulas, classification. | +| `agentic` | 18 | VLM-augmented; deepest visual understanding. | + +### Billing — extraction credits vs processor credits + +The Data Extraction API is billed against **extraction credits**, which are a +separate billing bucket from the **processor API credits** consumed by +`/build`, `/sign`, OCR, and the other Processor API endpoints used by this +client (`convert`, `watermark_text`, `merge`, etc.). 
The response surfaces the +extraction-credit accounting under `response['usage']['data_extraction_credits']`: + +```python +usage = response['usage']['data_extraction_credits'] +print(f"Cost: {usage['cost']} extraction credits, " + f"remaining: {usage['remainingCredits']}") +``` + ## Workflow System The client also provides a fluent builder pattern with staged interfaces to create document processing workflows: diff --git a/src/nutrient_dws/__init__.py b/src/nutrient_dws/__init__.py index f14eb6c..1e23613 100644 --- a/src/nutrient_dws/__init__.py +++ b/src/nutrient_dws/__init__.py @@ -19,16 +19,50 @@ process_file_input, validate_file_input, ) +from nutrient_dws.types.parse import ( + FormulaElement, + HandwritingElement, + KeyValuePair, + KeyValueRegionElement, + ParagraphElement, + ParseElement, + ParseInstructions, + ParseMode, + ParseOutputBody, + ParseOutputElements, + ParseOutputFormat, + ParseOutputMarkdown, + ParseResponse, + PictureElement, + TableCell, + TableElement, +) from nutrient_dws.utils import get_library_version, get_user_agent __all__ = [ "APIError", "AuthenticationError", "FileInput", + "FormulaElement", + "HandwritingElement", + "KeyValuePair", + "KeyValueRegionElement", "LocalFileInput", "NetworkError", "NutrientClient", "NutrientError", + "ParagraphElement", + "ParseElement", + "ParseInstructions", + "ParseMode", + "ParseOutputBody", + "ParseOutputElements", + "ParseOutputFormat", + "ParseOutputMarkdown", + "ParseResponse", + "PictureElement", + "TableCell", + "TableElement", "UrlFileInput", "ValidationError", "get_library_version", diff --git a/src/nutrient_dws/client.py b/src/nutrient_dws/client.py index 670878b..4c5a5da 100644 --- a/src/nutrient_dws/client.py +++ b/src/nutrient_dws/client.py @@ -18,6 +18,7 @@ from nutrient_dws.errors import NutrientError, ValidationError from nutrient_dws.http import ( NutrientClientOptions, + ParseRequestData, RedactRequestData, RequestConfig, SignRequestData, @@ -54,6 +55,13 @@ CreateAuthTokenResponse, ) 
from nutrient_dws.types.misc import OcrLanguage, PageRange, Pages +from nutrient_dws.types.parse import ( + ParseInstructions, + ParseMode, + ParseOutput, + ParseOutputFormat, + ParseResponse, +) from nutrient_dws.types.redact_data import RedactOptions from nutrient_dws.types.sign_request import CreateDigitalSignature @@ -753,6 +761,98 @@ async def extract_key_value_pairs( return cast("JsonContentOutput", self._process_typed_workflow_result(result)) + async def parse( + self, + file: LocalFileInput, + mode: ParseMode = "structure", + output_format: ParseOutputFormat = "spatial", + ) -> ParseResponse: + """Parse a document using the Data Extraction API (`/extraction/parse`). + + The Data Extraction API is billed against **extraction credits**, which + are a separate billing bucket from the **processor API credits** + consumed by `/build`, `/sign`, OCR, and other Processor API endpoints. + + Per-page extraction-credit costs by mode: + + - `text`: 1 extraction credit / page — fast Markdown extraction from + born-digital documents (no OCR or AI). + - `structure`: 1.5 extraction credits / page — OCR-based spatial + extraction with bounding boxes (default). + - `understand`: 9 extraction credits / page — AI-augmented layout + analysis, table detection, and semantic classification. + - `agentic`: 18 extraction credits / page — VLM-augmented extraction + building on `understand` mode. + + Output format selects the shape under `response.output`: + + - `spatial` (default): `output.elements` — typed elements (paragraph, + table, formula, picture, keyValueRegion, handwriting) with bounds, + confidence, and reading order. + - `markdown`: `output.markdown` — a whole-document Markdown string, + well suited for RAG / search indexing pipelines. + + **Security note**: this method only accepts local files (paths, bytes, + file objects) because the underlying API surface for this endpoint is + multipart-only. 
For remote inputs, fetch them client-side with + appropriate URL validation first. + + Args: + file: The document to parse (local files only — paths, bytes, or + file-like objects). + mode: Processing mode. See per-mode credit costs above. Defaults + to `"structure"`. + output_format: Output shape — `"spatial"` for typed elements or + `"markdown"` for a Markdown document. Defaults to + `"spatial"`. + + Returns: + The full parse response envelope, including `output`, `metrics`, + `usage` (the extraction-credit accounting), and `configuration`. + + Example: + ```python + # Spatial elements with full layout analysis (9 extraction credits / page) + response = await client.parse('contract.pdf', mode='understand') + for element in response['output']['elements']: + if element['type'] == 'table': + print(element['rowCount'], element['columnCount']) + + # Whole-document Markdown from a born-digital PDF (1 extraction credit / page) + response = await client.parse( + 'report.pdf', mode='text', output_format='markdown' + ) + print(response['output']['markdown']) + + # Inspect billing + usage = response['usage']['data_extraction_credits'] + print(f"Cost: {usage['cost']} extraction credits " + f"(remaining: {usage['remainingCredits']})") + ``` + """ + # Multipart-only endpoint; only local file inputs are supported. 
+ normalized_file = await process_file_input(file) + + instructions: ParseInstructions = { + "mode": mode, + "output": cast("ParseOutput", {"format": output_format}), + } + + request_data: ParseRequestData = { + "file": normalized_file, + "instructions": instructions, + } + + config = RequestConfig( + method="POST", + endpoint="/extraction/parse", + data=request_data, + headers=None, + ) + + response: Any = await send_request(config, self.options) + return cast("ParseResponse", response["data"]) + async def set_page_labels( self, pdf: FileInput, diff --git a/src/nutrient_dws/http.py b/src/nutrient_dws/http.py index ee92942..ab55550 100644 --- a/src/nutrient_dws/http.py +++ b/src/nutrient_dws/http.py @@ -23,6 +23,7 @@ CreateAuthTokenParameters, CreateAuthTokenResponse, ) +from nutrient_dws.types.parse import ParseInstructions, ParseResponse from nutrient_dws.types.redact_data import RedactData from nutrient_dws.types.sign_request import CreateDigitalSignature from nutrient_dws.utils import get_user_agent @@ -37,6 +38,11 @@ class AnalyzeBuildRequestData(TypedDict): instructions: BuildInstructions +class ParseRequestData(TypedDict): + file: NormalizedFileData + instructions: NotRequired[ParseInstructions] + + class SignRequestOptions(TypedDict): image: NotRequired[LocalFileInput] graphicImage: NotRequired[LocalFileInput] @@ -64,7 +70,13 @@ class DeleteTokenRequestData(TypedDict): Endpoint = TypeVar( "Endpoint", bound=Literal[ - "/account/info", "/build", "/analyze_build", "/sign", "/ai/redact", "/tokens" + "/account/info", + "/build", + "/analyze_build", + "/sign", + "/ai/redact", + "/tokens", + "/extraction/parse", ], ) @@ -77,6 +89,7 @@ class DeleteTokenRequestData(TypedDict): | SignRequestData | RedactRequestData | DeleteTokenRequestData + | ParseRequestData | None, ) Output = TypeVar( @@ -87,6 +100,7 @@ class DeleteTokenRequestData(TypedDict): | BuildResponseJsonContents | AnalyzeBuildResponse | AccountInfo + | ParseResponse | None, ) @@ -151,6 +165,14 @@ def 
is_delete_tokens_request_config( return request["method"] == "DELETE" and request["endpoint"] == "/tokens" +def is_post_extraction_parse_request_config( + request: RequestConfig[Method, Endpoint, Input], +) -> TypeGuard[ + RequestConfig[Literal["POST"], Literal["/extraction/parse"], ParseRequestData] +]: + return request["method"] == "POST" and request["endpoint"] == "/extraction/parse" + + # API response class ApiResponse(TypedDict, Generic[Output]): """Response from API call.""" @@ -301,6 +323,19 @@ def prepare_request_body( return request_config + if is_post_extraction_parse_request_config(config): + # multipart/form-data: 'file' part + optional 'instructions' part (JSON string) + files = {} + append_file_to_form_data(files, "file", config["data"]["file"]) + request_config["files"] = files + + if "instructions" in config["data"]: + request_config["data"] = { + "instructions": json.dumps(config["data"]["instructions"]) + } + + return request_config + # Fallback, passing data as JSON if "data" in config: request_config["json"] = config["data"] @@ -557,6 +592,15 @@ async def send_request( ) -> ApiResponse[None]: ... +@overload +async def send_request( + config: RequestConfig[ + Literal["POST"], Literal["/extraction/parse"], ParseRequestData + ], + client_options: NutrientClientOptions, +) -> ApiResponse[ParseResponse]: ... + + async def send_request( config: RequestConfig[Method, Endpoint, Input], client_options: NutrientClientOptions, diff --git a/src/nutrient_dws/types/parse.py b/src/nutrient_dws/types/parse.py new file mode 100644 index 0000000..2f0b95d --- /dev/null +++ b/src/nutrient_dws/types/parse.py @@ -0,0 +1,342 @@ +"""Type definitions for the data-extraction `/extraction/parse` endpoint. + +These TypedDicts describe the multipart request payload (`ParseInstructions`) +and the JSON response envelope (`ParseResponse`) returned by `POST +/extraction/parse`. 
+ +Billing note: `/extraction/parse` is billed against **extraction credits**, +which are a separate bucket from the **processor API credits** consumed by +endpoints such as `/build`, `/sign`, and OCR. The credit costs documented +below (1, 1.5, 9, 18 per page) are extraction credits. +""" + +from typing import Literal + +from typing_extensions import TypedDict + +# ---- Request types -------------------------------------------------------- + +ParseMode = Literal["text", "structure", "understand", "agentic"] +"""Processing mode for `/extraction/parse`. + +| Mode | Extraction credits / page | Notes | +|-------------|---------------------------|------------------------------------------------------------| +| `text` | 1 | Fast Markdown extraction from born-digital documents. | +| `structure` | 1.5 | OCR + spatial elements with bounding boxes. | +| `understand`| 9 | AI-augmented: tables, formulas, semantic classification. | +| `agentic` | 18 | VLM-augmented; deepest visual understanding. | +""" + +ParseOutputFormat = Literal["markdown", "spatial"] +"""Output format requested from `/extraction/parse`. + +- `markdown`: a whole-document Markdown string at `response.output.markdown`. +- `spatial`: a list of typed elements with bounds at `response.output.elements`. +""" + + +class ParseOutput(TypedDict, total=False): + """The `output` sub-object of `ParseInstructions`.""" + + format: ParseOutputFormat + + +class ParseInstructions(TypedDict, total=False): + """Request body sent as the `instructions` multipart field. + + Both fields are optional on the wire; the server defaults are applied if + omitted. The client surfaces these as keyword arguments on + `NutrientClient.parse()` so the typical caller does not construct this + dict directly. 
+ """ + + mode: ParseMode + output: ParseOutput + + +# ---- Response types ------------------------------------------------------- + + +class ParseBounds(TypedDict): + """Axis-aligned bounding box, in the page's coordinate space.""" + + x: float + y: float + width: float + height: float + + +class ParsePageRef(TypedDict, total=False): + """Reference to the page an element was extracted from. + + `pageIndex` and dimensions are always populated; `pageNumber` carries the + page's visible label (e.g. `"1"`, `"iv"`) and may be absent if the + document does not declare one. + """ + + pageIndex: int + pageNumber: str + width: float + height: float + + +class ParseWord(TypedDict, total=False): + """A single OCR'd word from a `paragraph` or `handwriting` element.""" + + text: str + bounds: ParseBounds + confidence: float + + +# Element variants (discriminated on `type`) ------------------------------- + +ParagraphRole = Literal[ + "Text", + "Title", + "SectionHeader", + "Header", + "Footer", + "Caption", + "Footnote", + "ListItem", + "PageNumber", + "Code", + "CheckboxSelected", + "CheckboxUnselected", +] + + +class _ElementBase(TypedDict, total=False): + """Fields shared by every element variant. + + Kept private; the public element types below extend the relevant subset + explicitly so that `type` is a literal on each variant (enabling + discriminated-union narrowing). 
+ """ + + id: str + bounds: ParseBounds + confidence: float + readingOrder: int + page: ParsePageRef + + +class ParagraphElement(_ElementBase, total=False): + """A paragraph (or other text-bearing block) of extracted text.""" + + type: Literal["paragraph"] + text: str + role: ParagraphRole + words: list[ParseWord] | None + + +class HandwritingElement(_ElementBase, total=False): + """Handwritten text extracted by `understand` / `agentic` modes.""" + + type: Literal["handwriting"] + text: str + words: list[ParseWord] | None + + +class FormulaElement(_ElementBase, total=False): + """A mathematical formula, expressed in LaTeX.""" + + type: Literal["formula"] + latex: str + + +PictureClassification = Literal[ + "chart", + "diagram", + "logo", + "photo", + "screenshot", + "signature", + "other", +] + + +class PictureElement(_ElementBase, total=False): + """An image, chart, diagram, or other non-text region.""" + + type: Literal["picture"] + classification: PictureClassification + classificationConfidence: float + altDescription: str + captionIds: list[str] + footnoteIds: list[str] + + +class TableCell(TypedDict, total=False): + """One cell of a `TableElement`.""" + + id: str + bounds: ParseBounds + confidence: float + row: int + column: int + rowSpan: int + colSpan: int + text: str + + +class TableElement(_ElementBase, total=False): + """A tabular region with cell-level extraction.""" + + type: Literal["table"] + rowCount: int + columnCount: int + cells: list[TableCell] | None + captionIds: list[str] + footnoteIds: list[str] + + +KeyValueEntityType = Literal["QUESTION", "ANSWER", "HEADER", "OTHER"] + + +class KeyValueEntity(TypedDict, total=False): + """One side (key or value) of a `KeyValuePair`.""" + + id: str + bounds: ParseBounds + confidence: float + entityType: KeyValueEntityType + value: str + + +class KeyValuePair(TypedDict, total=False): + """A linked key/value pair within a `KeyValueRegionElement`.""" + + id: str + key: KeyValueEntity + value: KeyValueEntity + 
relationshipConfidence: float + + +class KeyValueRegionElement(_ElementBase, total=False): + """A form-field region whose contents pair keys to values.""" + + type: Literal["keyValueRegion"] + pairs: list[KeyValuePair] + + +ParseElement = ( + ParagraphElement + | HandwritingElement + | FormulaElement + | PictureElement + | TableElement + | KeyValueRegionElement +) +"""Discriminated union of every element variant the parse endpoint returns. + +Narrow by reading the `type` literal, e.g.: + +```python +for element in response["output"]["elements"]: + if element["type"] == "table": + # element is now narrowed to TableElement + print(element["rowCount"]) +``` +""" + + +# Output shapes ------------------------------------------------------------ + + +class ParseOutputMarkdown(TypedDict): + """`response.output` when `output.format == "markdown"`.""" + + markdown: str + + +class ParseOutputElements(TypedDict): + """`response.output` when `output.format == "spatial"`.""" + + elements: list[ParseElement] + + +ParseOutputBody = ParseOutputMarkdown | ParseOutputElements +"""Union of the two `response.output` shapes: markdown vs elements. + +The union carries no tag field. Callers know which format they requested, so +narrow the response by checking which key (`markdown` or `elements`) is present. +""" + + +# Envelope sub-objects ----------------------------------------------------- + + +class ParseMetrics(TypedDict, total=False): + """Per-request processing metrics.""" + + processingTimeMs: int + pagesProcessed: int + + +class ParseConfiguration(TypedDict, total=False): + """Server-reported configuration the request was executed under. + + `mode` echoes the requested `ParseMode`. `outputFormat` echoes the + requested `ParseOutputFormat` ("markdown" or "spatial"). + """ + + mode: ParseMode + outputFormat: ParseOutputFormat + + +class ParseExtractionCredits(TypedDict, total=False): + """Credit accounting for this request. 
+ + `cost` is in **extraction credits** (NOT processor API credits) and + `remainingCredits` is the remaining balance in the same bucket. + """ + + cost: float + remainingCredits: float + + +class ParseUsage(TypedDict, total=False): + """Wraps the extraction-credit accounting under its wire key. + + The server uses the snake_case key `data_extraction_credits` here even + though every other field in the response is camelCase; the TypedDict + mirrors the wire format verbatim. + """ + + data_extraction_credits: ParseExtractionCredits + + +class ParseFailingPath(TypedDict, total=False): + """One entry of `errorDetails.failingPaths` on a 4xx error response.""" + + path: str + details: str + + +class ParseErrorDetails(TypedDict, total=False): + """Structured server-side error details, when present.""" + + source: str + code: str + failingPaths: list[ParseFailingPath] + + +class ParseResponse(TypedDict, total=False): + """Top-level response envelope from `POST /extraction/parse`. + + On a successful 200 response, `status == 200`, `output` is populated, and + `metrics` / `usage` / `configuration` are present. Error responses (4xx + / 5xx) reuse the same envelope but populate `errorMessage` + + `errorDetails` instead of `output`; the client raises before returning + them, so the response object the user sees is always a success. + """ + + status: int + requestId: str + output: ParseOutputBody + metrics: ParseMetrics + usage: ParseUsage + configuration: ParseConfiguration + errorMessage: str + errorDetails: ParseErrorDetails diff --git a/tests/unit/test_parse.py b/tests/unit/test_parse.py new file mode 100644 index 0000000..cf3f218 --- /dev/null +++ b/tests/unit/test_parse.py @@ -0,0 +1,387 @@ +"""Unit tests for `NutrientClient.parse()`. + +These tests stub `send_request` so they exercise the request-shape and +response-handling logic of `parse()` without making a real HTTP call. The +live smoke check is `examples/src/smoke_parse.py`. 
+""" + +import json +from typing import TYPE_CHECKING, Any, cast +from unittest.mock import AsyncMock, patch + +import pytest + +from nutrient_dws import ( + APIError, + AuthenticationError, + NutrientClient, + ValidationError, +) +from nutrient_dws.errors import NutrientError +from nutrient_dws.http import prepare_request_body + +if TYPE_CHECKING: + from nutrient_dws import ParseResponse, TableElement + from nutrient_dws.types.parse import ParseOutputElements, ParseOutputMarkdown + + +def _make_response(payload: dict[str, Any]) -> dict[str, Any]: + return { + "data": payload, + "status": 200, + "statusText": "OK", + "headers": {}, + } + + +@pytest.fixture +def parse_client() -> NutrientClient: + return NutrientClient(api_key="pdf_test_unit", base_url="https://api.test.example") + + +class TestParseRequestShape: + """Verify the request the client constructs against `/extraction/parse`.""" + + @pytest.mark.asyncio + async def test_default_mode_and_output( + self, parse_client: NutrientClient, tmp_path + ) -> None: + pdf = tmp_path / "sample.pdf" + pdf.write_bytes(b"%PDF-1.7\n%minimal") + + with patch("nutrient_dws.client.send_request", new_callable=AsyncMock) as send: + send.return_value = _make_response( + { + "status": 200, + "requestId": "req_default", + "output": {"elements": []}, + } + ) + + await parse_client.parse(pdf) + + sent_config = send.call_args[0][0] + + assert sent_config["method"] == "POST" + assert sent_config["endpoint"] == "/extraction/parse" + instructions = sent_config["data"]["instructions"] + assert instructions == {"mode": "structure", "output": {"format": "spatial"}} + # file field is a (bytes, filename) tuple + file_part = sent_config["data"]["file"] + assert isinstance(file_part[0], bytes) + assert file_part[1] == "sample.pdf" + + @pytest.mark.asyncio + @pytest.mark.parametrize( + ("mode", "output_format"), + [ + ("text", "markdown"), + ("structure", "spatial"), + ("understand", "spatial"), + ("agentic", "spatial"), + ("understand", 
"markdown"), + ], + ) + async def test_mode_and_output_combinations( + self, + parse_client: NutrientClient, + tmp_path, + mode: str, + output_format: str, + ) -> None: + pdf = tmp_path / "doc.pdf" + pdf.write_bytes(b"%PDF-1.7\nmini") + + with patch("nutrient_dws.client.send_request", new_callable=AsyncMock) as send: + send.return_value = _make_response( + {"status": 200, "requestId": "r", "output": {"elements": []}} + ) + + await parse_client.parse(pdf, mode=mode, output_format=output_format) + + sent_config = send.call_args[0][0] + + assert sent_config["data"]["instructions"] == { + "mode": mode, + "output": {"format": output_format}, + } + + @pytest.mark.asyncio + async def test_accepts_bytes_input(self, parse_client: NutrientClient) -> None: + with patch("nutrient_dws.client.send_request", new_callable=AsyncMock) as send: + send.return_value = _make_response( + {"status": 200, "requestId": "r", "output": {"markdown": "# Hi"}} + ) + + await parse_client.parse( + b"%PDF-1.7\nbytes", mode="text", output_format="markdown" + ) + + sent_config = send.call_args[0][0] + + file_part = sent_config["data"]["file"] + assert file_part[0] == b"%PDF-1.7\nbytes" + # Anonymous-bytes inputs land with the conventional "document" filename + assert file_part[1] == "document" + + def test_prepare_request_body_serializes_instructions(self) -> None: + """`prepare_request_body` must emit `instructions` as a JSON string in + the multipart form, alongside the file in `files`. 
+ """ + request_config: dict[str, Any] = {} + config = { + "method": "POST", + "endpoint": "/extraction/parse", + "data": { + "file": (b"%PDF-1.7", "doc.pdf"), + "instructions": { + "mode": "agentic", + "output": {"format": "spatial"}, + }, + }, + "headers": None, + } + + prepared = prepare_request_body(request_config, config) # type: ignore[arg-type] + + assert "files" in prepared + assert "file" in prepared["files"] + # Multipart `data` carries the JSON-stringified instructions + assert json.loads(prepared["data"]["instructions"]) == { + "mode": "agentic", + "output": {"format": "spatial"}, + } + + def test_prepare_request_body_omits_instructions_when_absent(self) -> None: + request_config: dict[str, Any] = {} + config = { + "method": "POST", + "endpoint": "/extraction/parse", + "data": {"file": (b"%PDF-1.7", "doc.pdf")}, + "headers": None, + } + + prepared = prepare_request_body(request_config, config) # type: ignore[arg-type] + + assert "files" in prepared + # When instructions are omitted, no multipart `data` field is sent + assert "data" not in prepared + + +class TestParseResponseHandling: + """Verify the client returns the raw response envelope to the caller.""" + + @pytest.mark.asyncio + async def test_returns_markdown_envelope( + self, parse_client: NutrientClient + ) -> None: + payload: ParseResponse = { + "status": 200, + "requestId": "req_md", + "output": {"markdown": "# Title\n\nBody."}, + "metrics": {"processingTimeMs": 312, "pagesProcessed": 1}, + "usage": { + "data_extraction_credits": {"cost": 1, "remainingCredits": 850}, + }, + "configuration": {"mode": "text", "outputFormat": "markdown"}, + } + with patch("nutrient_dws.client.send_request", new_callable=AsyncMock) as send: + send.return_value = _make_response(dict(payload)) + + response = await parse_client.parse( + b"%PDF-1.7\nmini", mode="text", output_format="markdown" + ) + + assert response["status"] == 200 + assert response["requestId"] == "req_md" + markdown_out = 
cast("ParseOutputMarkdown", response["output"]) + assert markdown_out["markdown"].startswith("# Title") + assert response["usage"]["data_extraction_credits"]["cost"] == 1 + + @pytest.mark.asyncio + async def test_returns_spatial_envelope_with_discriminated_elements( + self, parse_client: NutrientClient + ) -> None: + payload = { + "status": 200, + "requestId": "req_sp", + "output": { + "elements": [ + { + "id": "e1", + "type": "paragraph", + "text": "Hello", + "role": "Text", + "confidence": 0.95, + "readingOrder": 0, + "bounds": {"x": 0, "y": 0, "width": 50, "height": 10}, + "page": { + "pageIndex": 0, + "pageNumber": "1", + "width": 612, + "height": 792, + }, + }, + { + "id": "e2", + "type": "table", + "rowCount": 2, + "columnCount": 2, + "cells": [ + { + "id": "c1", + "row": 0, + "column": 0, + "rowSpan": 1, + "colSpan": 1, + "text": "h1", + } + ], + "confidence": 0.9, + "readingOrder": 1, + "bounds": {"x": 0, "y": 20, "width": 100, "height": 50}, + "page": { + "pageIndex": 0, + "pageNumber": "1", + "width": 612, + "height": 792, + }, + }, + ], + }, + "metrics": {"processingTimeMs": 4200, "pagesProcessed": 1}, + "usage": { + "data_extraction_credits": {"cost": 9, "remainingCredits": 991}, + }, + "configuration": {"mode": "understand", "outputFormat": "spatial"}, + } + with patch("nutrient_dws.client.send_request", new_callable=AsyncMock) as send: + send.return_value = _make_response(payload) + + response = await parse_client.parse( + b"%PDF-1.7\nmini", mode="understand" + ) + + spatial_out = cast("ParseOutputElements", response["output"]) + elements = spatial_out["elements"] + assert len(elements) == 2 + # Discriminated narrowing on the `type` literal + first, second = elements[0], elements[1] + assert first["type"] == "paragraph" + assert second["type"] == "table" + # second is narrowed to TableElement by the discriminator check above + table: TableElement = second # type: ignore[assignment] + assert table["rowCount"] == 2 + + @pytest.mark.asyncio + async def 
test_full_extraction_credit_accounting_surface( + self, parse_client: NutrientClient + ) -> None: + """The client must surface the wire's snake_case `data_extraction_credits` + key verbatim — it's the operator's primary signal that the request was + billed against extraction credits, not processor credits. + """ + payload = { + "status": 200, + "requestId": "r", + "output": {"markdown": "x"}, + "usage": { + "data_extraction_credits": {"cost": 18, "remainingCredits": 100}, + }, + } + with patch("nutrient_dws.client.send_request", new_callable=AsyncMock) as send: + send.return_value = _make_response(payload) + + response = await parse_client.parse(b"%PDF-1.7", mode="agentic") + + usage = response["usage"]["data_extraction_credits"] + assert usage["cost"] == 18 + assert usage["remainingCredits"] == 100 + + +class TestParseErrorPaths: + """`send_request` raises the same `NutrientError` hierarchy as every other + endpoint; we just verify the errors propagate out of `parse()` unchanged. + """ + + @pytest.mark.asyncio + async def test_authentication_error_propagates( + self, parse_client: NutrientClient + ) -> None: + with patch("nutrient_dws.client.send_request", new_callable=AsyncMock) as send: + send.side_effect = AuthenticationError( + "Missing, invalid, or expired API token", + {"requestId": "req_e_401"}, + 401, + ) + + with pytest.raises(AuthenticationError) as exc_info: + await parse_client.parse(b"%PDF-1.7", mode="text") + + assert exc_info.value.status_code == 401 + assert (exc_info.value.details or {}).get("requestId") == "req_e_401" + + @pytest.mark.asyncio + async def test_validation_error_propagates( + self, parse_client: NutrientClient + ) -> None: + with patch("nutrient_dws.client.send_request", new_callable=AsyncMock) as send: + send.side_effect = ValidationError( + "The request is malformed", + { + "requestId": "req_e_400", + "errorDetails": { + "source": "request", + "code": "invalid_request", + "failingPaths": [ + {"path": "$.mode", "details": "invalid 
mode: 'turbo'"} + ], + }, + }, + 400, + ) + + with pytest.raises(ValidationError) as exc_info: + await parse_client.parse( + b"%PDF-1.7", mode="text" # mode is fine; server-side fail + ) + + details = exc_info.value.details or {} + failing = details.get("errorDetails", {}).get("failingPaths", []) + assert failing and failing[0]["path"] == "$.mode" + + @pytest.mark.asyncio + async def test_payment_required_propagates_as_api_error( + self, parse_client: NutrientClient + ) -> None: + with patch("nutrient_dws.client.send_request", new_callable=AsyncMock) as send: + send.side_effect = APIError( + "Insufficient credits. This request requires 18 credits, 0 remaining.", + 402, + {"requestId": "req_e_402"}, + ) + + with pytest.raises(APIError) as exc_info: + await parse_client.parse(b"%PDF-1.7", mode="agentic") + + assert exc_info.value.status_code == 402 + assert "Insufficient credits" in str(exc_info.value) + + @pytest.mark.asyncio + async def test_server_error_propagates( + self, parse_client: NutrientClient + ) -> None: + with patch("nutrient_dws.client.send_request", new_callable=AsyncMock) as send: + send.side_effect = APIError( + "Processing failed. Please retry or contact support with the requestId.", + 500, + { + "requestId": "req_e_500", + "errorDetails": {"source": "maestro", "code": "maestro_error"}, + }, + ) + + with pytest.raises(NutrientError) as exc_info: + await parse_client.parse(b"%PDF-1.7", mode="structure") + + assert exc_info.value.status_code == 500 From 90040d66cd87c72ebb0a9b7509f9fc0936c6e032 Mon Sep 17 00:00:00 2001 From: nickwinder Date: Wed, 27 May 2026 16:25:29 +1200 Subject: [PATCH 2/6] refactor(types): extract ExtractionCredits to dedicated module The extraction-credits accounting shape (cost + remainingCredits) will surface on every future endpoint billed against the extraction-credits bucket, not just /extraction/parse. 
Factor it out of types/parse.py into its own module so other endpoints can import it without pulling in the whole parse type tree. Also clarify ParseBounds: document that (x, y) is the top-left corner and that bounds share a coordinate space with the page dimensions in ParsePageRef. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/nutrient_dws/__init__.py | 2 ++ src/nutrient_dws/types/extraction_credits.py | 21 ++++++++++++++ src/nutrient_dws/types/parse.py | 29 ++++++++++---------- tests/unit/test_parse.py | 3 +- 4 files changed, 39 insertions(+), 16 deletions(-) create mode 100644 src/nutrient_dws/types/extraction_credits.py diff --git a/src/nutrient_dws/__init__.py b/src/nutrient_dws/__init__.py index 1e23613..6b259f4 100644 --- a/src/nutrient_dws/__init__.py +++ b/src/nutrient_dws/__init__.py @@ -19,6 +19,7 @@ process_file_input, validate_file_input, ) +from nutrient_dws.types.extraction_credits import ExtractionCredits from nutrient_dws.types.parse import ( FormulaElement, HandwritingElement, @@ -42,6 +43,7 @@ __all__ = [ "APIError", "AuthenticationError", + "ExtractionCredits", "FileInput", "FormulaElement", "HandwritingElement", diff --git a/src/nutrient_dws/types/extraction_credits.py b/src/nutrient_dws/types/extraction_credits.py new file mode 100644 index 0000000..0c53f6d --- /dev/null +++ b/src/nutrient_dws/types/extraction_credits.py @@ -0,0 +1,21 @@ +"""Shared types for the DWS **extraction credits** billing bucket. + +Extraction credits are billed separately from the processor API credits +consumed by `/build`, `/sign`, OCR, etc. The types in this module are +intentionally endpoint-agnostic so they can be reused by any future +endpoint that surfaces extraction-credit accounting in its response. +""" + +from typing import TypedDict + + +class ExtractionCredits(TypedDict, total=False): + """Credit accounting for one request against the extraction-credits bucket. 
+ + `cost` is the number of **extraction credits** debited by the request + (NOT processor API credits). `remainingCredits` is the post-debit + balance in the same bucket. + """ + + cost: float + remainingCredits: float diff --git a/src/nutrient_dws/types/parse.py b/src/nutrient_dws/types/parse.py index 2f0b95d..bf5df61 100644 --- a/src/nutrient_dws/types/parse.py +++ b/src/nutrient_dws/types/parse.py @@ -14,6 +14,8 @@ from typing_extensions import TypedDict +from nutrient_dws.types.extraction_credits import ExtractionCredits + # ---- Request types -------------------------------------------------------- ParseMode = Literal["text", "structure", "understand", "agentic"] @@ -58,7 +60,14 @@ class ParseInstructions(TypedDict, total=False): class ParseBounds(TypedDict): - """Axis-aligned bounding box, in the page's coordinate space.""" + """Axis-aligned bounding box on the page. + + `(x, y)` is the **top-left corner** of the box. The page's coordinate + origin is the top-left, with `x` increasing to the right and `y` + increasing downward. Units are pixels in the same canvas described by + `ParsePageRef.width` and `ParsePageRef.height` (i.e. element bounds and + the page dimensions share one coordinate space). + """ x: float y: float @@ -285,26 +294,18 @@ class ParseConfiguration(TypedDict, total=False): outputFormat: ParseOutputFormat -class ParseExtractionCredits(TypedDict, total=False): - """Credit accounting for this request. - - `cost` is in **extraction credits** (NOT processor API credits) and - `remainingCredits` is the remaining balance in the same bucket. - """ - - cost: float - remainingCredits: float - - class ParseUsage(TypedDict, total=False): """Wraps the extraction-credit accounting under its wire key. The server uses the snake_case key `data_extraction_credits` here even though every other field in the response is camelCase; the TypedDict - mirrors the wire format verbatim. + mirrors the wire format verbatim. 
The inner `ExtractionCredits` type + lives in `nutrient_dws.types.extraction_credits` because the same + credit-accounting shape will surface on future endpoints that bill + against the extraction-credits bucket. """ - data_extraction_credits: ParseExtractionCredits + data_extraction_credits: ExtractionCredits class ParseFailingPath(TypedDict, total=False): diff --git a/tests/unit/test_parse.py b/tests/unit/test_parse.py index cf3f218..d669722 100644 --- a/tests/unit/test_parse.py +++ b/tests/unit/test_parse.py @@ -1,8 +1,7 @@ """Unit tests for `NutrientClient.parse()`. These tests stub `send_request` so they exercise the request-shape and -response-handling logic of `parse()` without making a real HTTP call. The -live smoke check is `examples/src/smoke_parse.py`. +response-handling logic of `parse()` without making a real HTTP call. """ import json From c8d6e77391bf9f0ca7761e52343733691991cb36 Mon Sep 17 00:00:00 2001 From: nickwinder Date: Wed, 27 May 2026 16:25:54 +1200 Subject: [PATCH 3/6] refactor(client): align parse() style with the rest of the file Three small style nits surfaced in code review against the patterns set by sign() and the other raw-send_request methods (get_account_info, create_token, delete_token): - Drop the redundant inner cast("ParseOutput", {"format": output_format}). ParseOutput is a single-key TypedDict with total=False; the literal already satisfies it structurally via the surrounding ParseInstructions annotation. No other call site in client.py casts an inner literal this way. - Replace the RequestConfig(...) constructor call with an inline dict literal at the send_request boundary, matching sign / create_token / delete_token / get_account_info. RequestConfig is a generic TypedDict; the constructor form is the outlier. - Broaden the file parameter docstring to call out that the endpoint accepts PDFs, Office documents, and images. 
Unlike sign(), parsing is not PDF-only, and the previous docstring implicitly invited readers to transplant sign()'s PDF-only mental model. No behavior change across (mode, output_format) combinations. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/nutrient_dws/client.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/nutrient_dws/client.py b/src/nutrient_dws/client.py index 4c5a5da..e85ccd8 100644 --- a/src/nutrient_dws/client.py +++ b/src/nutrient_dws/client.py @@ -58,7 +58,6 @@ from nutrient_dws.types.parse import ( ParseInstructions, ParseMode, - ParseOutput, ParseOutputFormat, ParseResponse, ) @@ -799,7 +798,10 @@ async def parse( Args: file: The document to parse (local files only — paths, bytes, or - file-like objects). + file-like objects). The endpoint accepts a range of document + formats (PDF, Office documents, images); see the public + guide for the authoritative list. Unlike `sign()`, parsing + is not restricted to PDFs. mode: Processing mode. See per-mode credit costs above. Defaults to `"structure"`. 
output_format: Output shape — `"spatial"` for typed elements or @@ -835,7 +837,7 @@ async def parse( instructions: ParseInstructions = { "mode": mode, - "output": cast("ParseOutput", {"format": output_format}), + "output": {"format": output_format}, } request_data: ParseRequestData = { @@ -843,14 +845,15 @@ async def parse( "instructions": instructions, } - config = RequestConfig( - method="POST", - endpoint="/extraction/parse", - data=request_data, - headers=None, + response: Any = await send_request( + { + "method": "POST", + "endpoint": "/extraction/parse", + "data": request_data, + "headers": None, + }, + self.options, ) - - response: Any = await send_request(config, self.options) return cast("ParseResponse", response["data"]) async def set_page_labels( From ba5fb0d0d3978e05df2b8687aef34c86af7f4d0e Mon Sep 17 00:00:00 2001 From: nickwinder Date: Wed, 27 May 2026 19:39:44 +1200 Subject: [PATCH 4/6] docs: explain what client.parse() is good for MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The README's Data Extraction section previously described WHAT parse() does (modes, output formats, billing) without explaining WHY a user would reach for it over the existing extract_* helpers. Rework so the positioning leads: - New "designed for" bullets up top — RAG ingestion, search indexing, content migration, form/invoice extraction, layout-aware document understanding. - New output-format selector table mapping each format to its primary use case (markdown → RAG/search; spatial → form/layout). - Modes table reworded so each row says when to pick it, not just what it technically does (text = born-digital only; structure = OCR for scanned input; understand = AI-augmented for complex layouts; agentic = + VLM for image-heavy content). - Two worked recipes: RAG ingestion (PDF → markdown → embed) and form extraction (PDF → spatial elements → structured dict). 
Also adds a parse() entry to docs/METHODS.md (it was missing entirely) and a "Designed for" preamble to the parse() docstring so the method's positioning is visible in IDE hover popups. Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 79 +++++++++++++++++++++++++++++++++----- docs/METHODS.md | 46 ++++++++++++++++++++++ src/nutrient_dws/client.py | 16 ++++++++ 3 files changed, 132 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index bc6795b..0bb21c3 100644 --- a/README.md +++ b/README.md @@ -90,8 +90,31 @@ For a complete list of available methods with examples, see the [Methods Documen ## Data Extraction (`/extraction/parse`) -`client.parse()` calls the Data Extraction API to extract structured content from -a document. It supports four processing modes and two output shapes: +`client.parse()` exposes Nutrient's Data Extraction API. It's designed for +**content-extraction workflows** where you need to feed document content into a +downstream pipeline rather than render or transform the document itself: + +- **RAG (retrieval-augmented generation) pipelines** — pull a clean Markdown + representation of a document for chunking, embedding, and indexing in a + vector store. +- **Search indexing and content migration** — convert documents into Markdown + for full-text search or for migration into a new content management system. +- **Form and invoice extraction** — pull structured fields (key/value pairs, + tables, semantic regions) out of business documents with bounding boxes and + confidence scores attached to every element. +- **Layout-aware document understanding** — get a typed, page-anchored element + list (paragraphs with semantic roles, tables with cell spans, formulas in + LaTeX, pictures, handwriting) suitable for building document-comprehension + tooling, including agentic workflows. 
+ +### Choosing an output format + +| Format | Best for | Shape | +|-------------------|----------------------------------------------------------------------------|----------------------------------------------------------------------| +| `markdown` | RAG, search indexing, content migration — anywhere structured text beats spatial data | One whole-document Markdown string at `response['output']['markdown']` | +| `spatial` (default) | Form/invoice extraction, layout reconstruction, flows that need per-element confidence | Flat list of typed elements at `response['output']['elements']` | + +### Quick start ```python import asyncio @@ -115,14 +138,52 @@ async def main(): asyncio.run(main()) ``` -### Modes and credit cost +### Modes — when to use which -| Mode | Extraction credits / page | When to use | -|--------------|---------------------------|------------------------------------------------------------------| -| `text` | 1 | Fast Markdown from born-digital documents. No OCR or AI. | -| `structure` | 1.5 | OCR-based spatial extraction with bounding boxes. | -| `understand` | 9 | AI-augmented layout analysis, tables, formulas, classification. | -| `agentic` | 18 | VLM-augmented; deepest visual understanding. | +| Mode | Credits / page | When to use | +|--------------|----------------|----------------------------------------------------------------------------------------------| +| `text` | 1 | Born-digital documents only. No OCR, no AI. Fastest and cheapest path to Markdown. | +| `structure` | 1.5 | OCR-based segmentation with bounding boxes. Handles scanned documents, images, and any input requiring OCR. | +| `understand` | 9 | Full pipeline with AI augmentation on top of OCR. Most accurate for documents with tables, multi-column layouts, formulas, and form fields. | +| `agentic` | 18 | Builds on `understand` and adds a vision-language model. Best for image descriptions, complex visual layouts, and deeper semantic understanding. 
| + +### Recipes + +**RAG ingestion** — PDF → Markdown → chunks → embeddings → vector store: + +```python +response = await client.parse('whitepaper.pdf', mode='text', output_format='markdown') +markdown = response['output']['markdown'] +# Then: chunk on headings, embed, push to your vector store of choice. +``` + +For born-digital PDFs, `mode='text'` is the cheapest path (1 credit/page). +For scanned PDFs or images, switch to `mode='structure'` so OCR runs. + +**Form/invoice extraction** — PDF → spatial elements → structured dict: + +```python +response = await client.parse('invoice.pdf', mode='understand') +elements = response['output']['elements'] + +# Pull key/value pairs from form regions +fields = {} +for element in elements: + if element['type'] == 'keyValueRegion': + for pair in element['pairs']: + fields[pair['key']['value']] = pair['value']['value'] + +# Walk tables — each cell carries row/col indices and span counts +for element in elements: + if element['type'] == 'table': + print(f"Table: {element['rowCount']}×{element['columnCount']}") + for cell in element['cells']: + print(f" [{cell['row']}][{cell['column']}] {cell['text']}") +``` + +For complex layouts that mix dense images with text, step up to +`mode='agentic'` so the VLM can produce image descriptions and semantic +classifications (18 credits/page). ### Billing — extraction credits vs processor credits diff --git a/docs/METHODS.md b/docs/METHODS.md index 5873a9b..8586f55 100644 --- a/docs/METHODS.md +++ b/docs/METHODS.md @@ -449,6 +449,52 @@ if kvps and len(kvps) > 0: print(f'Total Amount: {dictionary.get("Total")}') ``` +##### parse(file, mode?, output_format?) +Calls the Data Extraction API (`/extraction/parse`) to extract structured +content from a document. 
Designed for **RAG ingestion**, **search indexing**, +**content migration**, and **form/invoice extraction** workflows where the +goal is to feed document content into a downstream pipeline rather than +render or transform the document itself. + +Billed against **extraction credits** — a separate billing bucket from the +processor API credits consumed by every other method on this client. See the +[README's Data Extraction section](../README.md#data-extraction-extractionparse) +for the full positioning, the per-mode comparison, and worked recipes. + +**Parameters**: +- `file: LocalFileInput` - The document to parse. The endpoint accepts PDFs, + Office documents, and images. Only local inputs (paths, bytes, file-like + objects) are supported — URLs are not, because the underlying API surface is + multipart-only. +- `mode: ParseMode` - `"text"` (1 credit/page, born-digital only, no OCR/AI), + `"structure"` (1.5 credits/page, OCR + spatial layout — default), + `"understand"` (9 credits/page, AI-augmented), or `"agentic"` (18 credits/page, + adds a vision-language model). +- `output_format: ParseOutputFormat` - `"spatial"` (default — typed elements + with bounds and confidence at `response['output']['elements']`) or + `"markdown"` (whole-document Markdown string at `response['output']['markdown']`). + +**Returns**: `ParseResponse` - The full response envelope, including `output`, +`metrics`, `configuration`, and `usage['data_extraction_credits']` (cost and +remaining balance in the extraction-credits bucket). + +```python +# RAG ingestion — born-digital PDF to Markdown, cheap and fast. +response = await client.parse('whitepaper.pdf', mode='text', output_format='markdown') +markdown = response['output']['markdown'] + +# Form extraction — typed spatial elements with bounds and confidence. 
+response = await client.parse('invoice.pdf', mode='understand') +for element in response['output']['elements']: + if element['type'] == 'keyValueRegion': + for pair in element['pairs']: + print(pair['key']['value'], '→', pair['value']['value']) + +# Inspect billing — cost is in extraction credits, not processor credits. +usage = response['usage']['data_extraction_credits'] +print(f"Cost: {usage['cost']} extraction credits, remaining: {usage['remainingCredits']}") +``` + ##### flatten(file, annotation_ids?) Flattens annotations in a PDF document. diff --git a/src/nutrient_dws/client.py b/src/nutrient_dws/client.py index e85ccd8..b0cbd29 100644 --- a/src/nutrient_dws/client.py +++ b/src/nutrient_dws/client.py @@ -768,6 +768,22 @@ async def parse( ) -> ParseResponse: """Parse a document using the Data Extraction API (`/extraction/parse`). + Designed for content-extraction workflows where document content feeds + a downstream pipeline rather than being rendered or transformed: + + - **RAG / search indexing / content migration** — use + `output_format="markdown"` for a whole-document Markdown string + suitable for chunking, embedding, and indexing. + - **Form / invoice extraction** — use `output_format="spatial"` + (default) for a typed element list (paragraphs, tables, + keyValueRegions, etc.) with bounds and confidence per element. + - **Layout-aware document understanding** — combine `mode="understand"` + or `mode="agentic"` with spatial output for layout reconstruction + and semantic classification. + + See the README's Data Extraction section for worked recipes (RAG + ingestion, form extraction) and per-mode positioning. + The Data Extraction API is billed against **extraction credits**, which are a separate billing bucket from the **processor API credits** consumed by `/build`, `/sign`, OCR, and other Processor API endpoints. 
From 265c9c103d36afe06296c477c963d4cea05e2ddd Mon Sep 17 00:00:00 2001 From: nickwinder Date: Wed, 27 May 2026 21:21:50 +1200 Subject: [PATCH 5/6] feat(client): route parse() via DWS Extract key and reject text+spatial MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DWS Extract is a separate product from DWS Processor with its own API key and credit pool. Calling /extraction/parse with the Processor key returns 403. Add an optional extract_api_key constructor parameter (str or async callable) that parse() prefers over api_key when set; non-parse methods keep using api_key. Falling back to api_key keeps a single-key setup working once tenants get global DWS keys. Also reject mode='text' + output_format='spatial' before the request goes out — the text mode only produces markdown, so the combination would 502 on the server side. Surface it as a ValidationError with guidance. Addresses PR #47 review feedback from HungKNguyen. --- README.md | 18 ++++++- src/nutrient_dws/client.py | 66 ++++++++++++++++++++++-- src/nutrient_dws/http.py | 3 ++ tests/conftest.py | 9 +++- tests/unit/test_client.py | 7 ++- tests/unit/test_parse.py | 100 ++++++++++++++++++++++++++++++++++++- 6 files changed, 193 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 0bb21c3..3bdf9f2 100644 --- a/README.md +++ b/README.md @@ -94,6 +94,15 @@ For a complete list of available methods with examples, see the [Methods Documen **content-extraction workflows** where you need to feed document content into a downstream pipeline rather than render or transform the document itself: +> **Heads up — separate API key.** DWS Extract is a different product from +> DWS Processor and has its own API key. Pass it as +> `NutrientClient(api_key=..., extract_api_key=...)`; the Extract key is +> used only for `parse()`, while every other method continues to use the +> Processor key. Using the Processor key against `/extraction/parse` +> returns `403`. 
If `extract_api_key` is omitted, `parse()` falls back to +> the main `api_key` — that path works once your tenant moves to global +> DWS API keys. + - **RAG (retrieval-augmented generation) pipelines** — pull a clean Markdown representation of a document for chunking, embedding, and indexing in a vector store. @@ -114,6 +123,10 @@ downstream pipeline rather than render or transform the document itself: | `markdown` | RAG, search indexing, content migration — anywhere structured text beats spatial data | One whole-document Markdown string at `response['output']['markdown']` | | `spatial` (default) | Form/invoice extraction, layout reconstruction, flows that need per-element confidence | Flat list of typed elements at `response['output']['elements']` | +Spatial output requires an OCR-capable mode (`structure`, `understand`, or +`agentic`); `mode='text'` is markdown-only and the client rejects the +`text` + `spatial` combination before the request goes out. + ### Quick start ```python @@ -121,7 +134,10 @@ import asyncio from nutrient_dws import NutrientClient async def main(): - client = NutrientClient(api_key='your_api_key') + client = NutrientClient( + api_key='your_processor_key', + extract_api_key='your_extract_key', + ) # Spatial elements (default) — paragraphs, tables, formulas, pictures, etc. 
response = await client.parse('contract.pdf', mode='understand') diff --git a/src/nutrient_dws/client.py b/src/nutrient_dws/client.py index b0cbd29..5d6fcd5 100644 --- a/src/nutrient_dws/client.py +++ b/src/nutrient_dws/client.py @@ -117,6 +117,16 @@ async def get_token(): client = NutrientClient(api_key=get_token) ``` + + Data Extraction requires a separate DWS Extract API key — supply it + alongside the Processor key: + + ```python + client = NutrientClient( + api_key='your_processor_key', + extract_api_key='your_extract_key', + ) + ``` """ def __init__( @@ -124,19 +134,30 @@ def __init__( api_key: str | Callable[[], str | Awaitable[str]], base_url: str | None = None, timeout: int | None = None, + extract_api_key: str | Callable[[], str | Awaitable[str]] | None = None, ) -> None: """Create a new NutrientClient instance. Args: - api_key: API key or API key getter + api_key: API key or API key getter for the DWS Processor product + (used by every method except `parse()`). base_url: DWS Base url timeout: DWS request timeout + extract_api_key: Optional API key or getter for the DWS Extract + product. Required by `parse()` because DWS Extract is a + separate product with its own credit pool and API key — using + the Processor key will return 403. If omitted, `parse()` + falls back to `api_key`, which works once DWS rolls out + global API keys. 
Raises: ValidationError: If options are invalid """ options = NutrientClientOptions( - apiKey=api_key, baseUrl=base_url, timeout=timeout + apiKey=api_key, + baseUrl=base_url, + timeout=timeout, + extractApiKey=extract_api_key, ) self._validate_options(options) self.options = options @@ -166,6 +187,14 @@ def _validate_options(self, options: NutrientClientOptions) -> None: if base_url is not None and not isinstance(base_url, str): raise ValidationError("Base URL must be a string") + extract_api_key = options.get("extractApiKey") + if extract_api_key is not None and not ( + isinstance(extract_api_key, str) or callable(extract_api_key) + ): + raise ValidationError( + "Extract API key must be a string or a function that returns a string" + ) + async def get_account_info(self) -> AccountInfo: """Get account information for the current API key. @@ -784,6 +813,11 @@ async def parse( See the README's Data Extraction section for worked recipes (RAG ingestion, form extraction) and per-mode positioning. + DWS Extract is a separate product from DWS Processor and uses its own + API key. Pass it via `NutrientClient(extract_api_key=...)`. If omitted + the method falls back to the main `api_key`, which only succeeds when + the key is a global DWS key. + The Data Extraction API is billed against **extraction credits**, which are a separate billing bucket from the **processor API credits** consumed by `/build`, `/sign`, OCR, and other Processor API endpoints. @@ -803,7 +837,9 @@ async def parse( - `spatial` (default): `output.elements` — typed elements (paragraph, table, formula, picture, keyValueRegion, handwriting) with bounds, - confidence, and reading order. + confidence, and reading order. Requires an OCR-capable mode + (`structure`, `understand`, or `agentic`); `text` mode does not + produce spatial output. - `markdown`: `output.markdown` — a whole-document Markdown string, well suited for RAG / search indexing pipelines. @@ -822,12 +858,17 @@ async def parse( to `"structure"`. 
output_format: Output shape — `"spatial"` for typed elements or `"markdown"` for a Markdown document. Defaults to - `"spatial"`. + `"spatial"`. `mode="text"` is incompatible with + `output_format="spatial"`. Returns: The full parse response envelope, including `output`, `metrics`, `usage` (the extraction-credit accounting), and `configuration`. + Raises: + ValidationError: If `mode="text"` is combined with + `output_format="spatial"`. + Example: ```python # Spatial elements with full layout analysis (9 extraction credits / page) @@ -848,6 +889,13 @@ async def parse( f"(remaining: {usage['remainingCredits']})") ``` """ + if mode == "text" and output_format == "spatial": + raise ValidationError( + "mode='text' is not supported with output_format='spatial'. " + "Use output_format='markdown', or choose mode='structure' / " + "'understand' / 'agentic' for spatial elements." + ) + # Multipart-only endpoint; only local file inputs are supported. normalized_file = await process_file_input(file) @@ -861,6 +909,14 @@ async def parse( "instructions": instructions, } + # DWS Extract uses a separate API key. Route the request via a + # per-call options copy so the rest of the client (which talks to + # the Processor API) keeps using the main key. 
+ parse_options = self.options.copy() + extract_key = parse_options.get("extractApiKey") + if extract_key is not None: + parse_options["apiKey"] = extract_key + response: Any = await send_request( { "method": "POST", @@ -868,7 +924,7 @@ async def parse( "data": request_data, "headers": None, }, - self.options, + parse_options, ) return cast("ParseResponse", response["data"]) diff --git a/src/nutrient_dws/http.py b/src/nutrient_dws/http.py index ab55550..8760c46 100644 --- a/src/nutrient_dws/http.py +++ b/src/nutrient_dws/http.py @@ -190,6 +190,9 @@ class NutrientClientOptions(TypedDict): apiKey: str | Callable[[], str | Awaitable[str]] baseUrl: str | None timeout: int | None + # DWS Extract is a separate product with its own API key; parse() prefers + # this when set, otherwise falls back to apiKey. + extractApiKey: NotRequired[str | Callable[[], str | Awaitable[str]] | None] async def resolve_api_key(api_key: str | Callable[[], str | Awaitable[str]]) -> str: diff --git a/tests/conftest.py b/tests/conftest.py index 9c8f10f..0d9f571 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,9 +1,11 @@ from unittest.mock import AsyncMock import pytest + from nutrient_dws import NutrientClient from tests.helpers import TestDocumentGenerator + @pytest.fixture def mock_workflow_instance(): """Create a mock workflow instance for testing.""" @@ -40,7 +42,12 @@ def mock_workflow_instance(): @pytest.fixture def valid_client_options(): """Valid client options for testing.""" - return {"apiKey": "test-api-key", "baseUrl": "https://api.test.com/v1", "timeout": None} + return { + "apiKey": "test-api-key", + "baseUrl": "https://api.test.com/v1", + "timeout": None, + "extractApiKey": None, + } @pytest.fixture def unit_client(): diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py index 995c16a..74be20a 100644 --- a/tests/unit/test_client.py +++ b/tests/unit/test_client.py @@ -65,7 +65,12 @@ def test_create_workflow_instance( 
@patch("nutrient_dws.client.StagedWorkflowBuilder") def test_pass_client_options_to_workflow(self, mock_staged_workflow_builder): - custom_options = {"apiKey": "custom-key", "baseUrl": "https://custom.api.com", "timeout": None} + custom_options = { + "apiKey": "custom-key", + "baseUrl": "https://custom.api.com", + "timeout": None, + "extractApiKey": None, + } client = NutrientClient(api_key=custom_options["apiKey"], base_url=custom_options["baseUrl"]) client.workflow() diff --git a/tests/unit/test_parse.py b/tests/unit/test_parse.py index d669722..a3c58dd 100644 --- a/tests/unit/test_parse.py +++ b/tests/unit/test_parse.py @@ -167,6 +167,97 @@ def test_prepare_request_body_omits_instructions_when_absent(self) -> None: assert "data" not in prepared +class TestParseClientSideValidation: + """Combinations rejected before any network round-trip.""" + + @pytest.mark.asyncio + async def test_text_mode_with_spatial_output_raises( + self, parse_client: NutrientClient + ) -> None: + with patch("nutrient_dws.client.send_request", new_callable=AsyncMock) as send: + with pytest.raises(ValidationError, match="mode='text'"): + await parse_client.parse( + b"%PDF-1.7", mode="text", output_format="spatial" + ) + send.assert_not_called() + + +class TestParseApiKeyRouting: + """`/extraction/parse` is served by DWS Extract, which uses a separate + API key from DWS Processor. When `extract_api_key` is set we route via + that key; otherwise we fall back to the main `api_key`. 
+ """ + + @pytest.mark.asyncio + async def test_parse_uses_extract_api_key_when_set(self) -> None: + client = NutrientClient( + api_key="processor-key", + extract_api_key="extract-key", + ) + with patch("nutrient_dws.client.send_request", new_callable=AsyncMock) as send: + send.return_value = _make_response( + {"status": 200, "requestId": "r", "output": {"elements": []}} + ) + + await client.parse(b"%PDF-1.7", mode="structure") + + sent_options = send.call_args[0][1] + + assert sent_options["apiKey"] == "extract-key" + # The client's own options are untouched — other methods still see the + # processor key. + assert client.options["apiKey"] == "processor-key" + + @pytest.mark.asyncio + async def test_parse_falls_back_to_main_api_key_when_extract_key_unset( + self, parse_client: NutrientClient + ) -> None: + with patch("nutrient_dws.client.send_request", new_callable=AsyncMock) as send: + send.return_value = _make_response( + {"status": 200, "requestId": "r", "output": {"elements": []}} + ) + + await parse_client.parse(b"%PDF-1.7", mode="structure") + + sent_options = send.call_args[0][1] + + assert sent_options["apiKey"] == "pdf_test_unit" + + @pytest.mark.asyncio + async def test_non_parse_methods_keep_processor_key(self) -> None: + """A sibling endpoint (`/account/info`) must not see the Extract + key — only `parse()` swaps. 
+ """ + client = NutrientClient( + api_key="processor-key", + extract_api_key="extract-key", + ) + with patch("nutrient_dws.client.send_request", new_callable=AsyncMock) as send: + send.return_value = _make_response({"subscriptionType": "live"}) + + await client.get_account_info() + + sent_options = send.call_args[0][1] + + assert sent_options["apiKey"] == "processor-key" + + def test_invalid_extract_api_key_type_raises(self) -> None: + with pytest.raises( + ValidationError, + match="Extract API key must be a string or a function that returns a string", + ): + NutrientClient(api_key="processor-key", extract_api_key=123) # type: ignore[arg-type] + + def test_async_extract_api_key_callable_accepted(self) -> None: + async def get_extract_key() -> str: + return "async-extract-key" + + client = NutrientClient( + api_key="processor-key", extract_api_key=get_extract_key + ) + assert callable(client.options["extractApiKey"]) + + class TestParseResponseHandling: """Verify the client returns the raw response envelope to the caller.""" @@ -315,7 +406,9 @@ async def test_authentication_error_propagates( ) with pytest.raises(AuthenticationError) as exc_info: - await parse_client.parse(b"%PDF-1.7", mode="text") + await parse_client.parse( + b"%PDF-1.7", mode="text", output_format="markdown" + ) assert exc_info.value.status_code == 401 assert (exc_info.value.details or {}).get("requestId") == "req_e_401" @@ -342,7 +435,10 @@ async def test_validation_error_propagates( with pytest.raises(ValidationError) as exc_info: await parse_client.parse( - b"%PDF-1.7", mode="text" # mode is fine; server-side fail + # client-side validation passes; failure is the mocked server response + b"%PDF-1.7", + mode="text", + output_format="markdown", ) details = exc_info.value.details or {} From 64a31599f17735d48c729afdee3cf87b3c01560f Mon Sep 17 00:00:00 2001 From: nickwinder Date: Wed, 27 May 2026 21:34:04 +1200 Subject: [PATCH 6/6] fix(types): align ParsePageRef TypedDict shape with its docstring 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The docstring promises pageIndex/width/height are always populated and only pageNumber may be absent, but the class was declared `total=False`, which contradicts that and forces type-strict callers to guard every subscript access on guaranteed-present fields. Switch to the default (`total=True`) shape with pageNumber explicitly `NotRequired`, matching the precedent set by ParseBounds in the same module. No runtime impact — the wire already populates these fields. --- src/nutrient_dws/types/parse.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/nutrient_dws/types/parse.py b/src/nutrient_dws/types/parse.py index bf5df61..80dc98d 100644 --- a/src/nutrient_dws/types/parse.py +++ b/src/nutrient_dws/types/parse.py @@ -12,7 +12,7 @@ from typing import Literal -from typing_extensions import TypedDict +from typing_extensions import NotRequired, TypedDict from nutrient_dws.types.extraction_credits import ExtractionCredits @@ -75,7 +75,7 @@ class ParseBounds(TypedDict): height: float -class ParsePageRef(TypedDict, total=False): +class ParsePageRef(TypedDict): """Reference to the page an element was extracted from. `pageIndex` and dimensions are always populated; `pageNumber` carries the @@ -84,9 +84,9 @@ class ParsePageRef(TypedDict, total=False): """ pageIndex: int - pageNumber: str width: float height: float + pageNumber: NotRequired[str] class ParseWord(TypedDict, total=False):