import os
import json
from unstructured_client import UnstructuredClient
from unstructured_client.models.operations import PartitionRequest
from unstructured_client.models.shared import (
PartitionParameters,
Files,
Strategy
)
from unstructured_client.models.errors import (
UnstructuredClientError,
HTTPValidationError
)
from unstructured_client.models.errors.servererror import ServerError
from unstructured_client.models.errors.responsevalidationerror import ResponseValidationError
import httpx
# Example: partition a local file with the Unstructured API using the VLM
# strategy, print the first returned element, save all elements as JSON, and
# demonstrate how to handle each category of error the SDK or HTTP layer raises.
try:
    client = UnstructuredClient(
        # This example intentionally leaves out the API key so that the call
        # fails and the error handling below is exercised.
        # api_key_auth=os.getenv("UNSTRUCTURED_API_KEY")
    )

    filename = "PATH_TO_INPUT_FILE"

    # Keep the input file open only while the request is built and sent, so
    # the handle is always closed, even if the request raises.
    with open(filename, "rb") as input_file:
        request = PartitionRequest(
            partition_parameters=PartitionParameters(
                files=Files(
                    content=input_file,
                    file_name=filename,
                ),
                strategy=Strategy.VLM,
                vlm_model="gpt-4o",
                vlm_model_provider="openai",
                languages=["eng"],
                # If True, splits the PDF file into smaller chunks of pages.
                split_pdf_page=True,
                # If True, the partitioning continues even if some pages fail.
                # split_pdf_allow_failed=True,
                # Set the number of concurrent requests to the maximum value: 15.
                split_pdf_concurrency_level=15,
            ),
        )

        response = client.general.partition(
            request=request
        )

    # Fall back to an empty list if the response carries no elements.
    element_dicts = list(response.elements or [])

    # Print the processed data's first element only.
    if element_dicts:
        print(element_dicts[0])
    else:
        print("No elements were returned.")

    # Write the processed data to a local file. Serialize before opening the
    # output file so a serialization failure does not truncate existing output.
    json_elements = json.dumps(element_dicts, indent=2)

    with open("PATH_TO_OUTPUT_FILE", "w", encoding="utf-8") as file:
        file.write(json_elements)

except HTTPValidationError as e:
    print("Validation error (HTTP 422):", e)
except ServerError as e:
    print("Server error (HTTP 5XX):", e)
except ResponseValidationError as e:
    print("Response validation/type mismatch:", e)
except UnstructuredClientError as e:
    # This catches any other UnstructuredClientError not already caught above.
    # This and all of the other error classes in this example expose the
    # following members:
    print("Other Unstructured client error:")
    print(f"Message: {e.message}")
    print(f"Status code: {e.status_code}")
    print(f"Body: {e.body}")
    print(f"Raw response: {e.raw_response}")
    print("Headers:")
    # Each raw header is a (key, value) pair of bytes; decode both for display.
    for raw_key, raw_value in e.headers.raw:
        key = raw_key.decode('utf-8')
        value = raw_value.decode('utf-8')
        print(f" {key}: {value}")
except httpx.ConnectError as e:
    print("HTTP connection error:", e)
except httpx.TimeoutException as e:
    print("HTTP timeout error:", e)
except httpx.RequestError as e:
    # This catches any other HTTPX network error not already caught above.
    print("Other HTTPX request error:", e)
except Exception as e:
    # Optional: this catches any other unforeseen errors.
    print("Unexpected error:", e)