To help models understand PDF content, we put both the extracted text and an image of each page into the model's context. The model can then use the text and the images together to generate a response. This is useful, for example, when diagrams contain key information that isn't in the text.
Example
import os
import requests
from openai import OpenAI

# Upload the PDF to OpenAI so it can be referenced by file ID
openai_client = OpenAI()

pdf_url = "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf"  # A dummy PDF file; use any PDF you want
response = requests.get(pdf_url)
response.raise_for_status()

# Pass a (filename, bytes, content type) tuple so the API recognizes the PDF
file = openai_client.files.create(
    file=("dummy.pdf", response.content, "application/pdf"),
    purpose="user_data",
)

# Create a client that points at the Keywords AI proxy
client = OpenAI(
    base_url="https://api.keywordsai.co/api",
    api_key=os.getenv("KEYWORDSAI_API_KEY_TEST"),
)

model = "gpt-4.1"

# Combine a text prompt with a reference to the uploaded file
file_content = [
    {"type": "text", "text": "What's this file about?"},
    {
        "type": "file",
        "file": {
            "file_id": file.id,
        },
    },
]

response = client.chat.completions.create(
    model=model,
    messages=[
        {
            "role": "user",
            "content": file_content,
        }
    ],
)

print(response.choices[0].message.content)
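As an alternative to uploading the file first, the Chat Completions file content part also accepts base64-encoded PDF data inline via the file_data field, using a data URL with the application/pdf MIME type. Below is a minimal sketch of that variant, assuming the same dummy PDF, prompt, and proxy configuration as above; the filename shown is illustrative.

import base64
import os
import requests
from openai import OpenAI

# Client pointed at the Keywords AI proxy, as in the example above
client = OpenAI(
    base_url="https://api.keywordsai.co/api",
    api_key=os.getenv("KEYWORDSAI_API_KEY_TEST"),
)

pdf_url = "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf"
pdf_bytes = requests.get(pdf_url).content

# Encode the PDF as a base64 data URL so it can be sent inline, without a prior upload
encoded_pdf = base64.b64encode(pdf_bytes).decode("utf-8")

response = client.chat.completions.create(
    model="gpt-4.1",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What's this file about?"},
                {
                    "type": "file",
                    "file": {
                        "filename": "dummy.pdf",  # illustrative name
                        "file_data": f"data:application/pdf;base64,{encoded_pdf}",
                    },
                },
            ],
        }
    ],
)

print(response.choices[0].message.content)

Inline file_data avoids managing uploaded files, at the cost of re-sending the PDF bytes with every request; the file-ID approach above is usually the better fit when the same document is referenced repeatedly.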