We've self-hosted our model on vLLM; its API is OpenAI-compatible.
Here's our client for reference. We send images as base64-encoded data URLs, and we're not using any client wrapper, but we can easily switch to one if it makes things easier:
import base64

import filetype
import httpx

# VLM_MODEL, VLLM_URL, VLLM_HEALTHCHECK, VLLM_READY_TIMEOUT, ALLOWED_IMAGE_TYPES
# and wait_for_ready come from our own config/helper modules (omitted here).


class VLMClient:
    def __init__(self, vlm_model: str = VLM_MODEL, vllm_url: str = VLLM_URL):
        self._vlm_model = vlm_model
        self._vllm_client = httpx.AsyncClient(base_url=vllm_url)
        if VLLM_HEALTHCHECK:
            # Block until the vLLM server reports healthy (or the timeout expires).
            wait_for_ready(
                server_url=vllm_url,
                wait_seconds=VLLM_READY_TIMEOUT,
                health_endpoint="health",
            )

    @property
    def vlm_model(self) -> str:
        return self._vlm_model
    async def __call__(
        self,
        prompt: str,
        image_bytes: bytes | None = None,
        image_filetype: filetype.Type | None = None,
        max_tokens: int = 10,
    ) -> str:
        # Assemble the message content
        message_content: list[dict[str, str | dict]] = [
            {
                "type": "text",
                "text": prompt,
            }
        ]
        if image_bytes is not None:
            if image_filetype is None:
                image_filetype = filetype.guess(image_bytes)
            if image_filetype is None:
                raise ValueError("Could not determine image filetype")
            if image_filetype not in ALLOWED_IMAGE_TYPES:
                raise ValueError(
                    f"Image type {image_filetype} is not supported. Allowed types: {ALLOWED_IMAGE_TYPES}"
                )
            image_b64 = base64.b64encode(image_bytes).decode("utf-8")
            message_content.append(
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:{image_filetype.mime};base64,{image_b64}",
                    },
                }
            )
        # Put together the request payload
        payload = {
            "model": self.vlm_model,
            "messages": [{"role": "user", "content": message_content}],
            "max_tokens": max_tokens,
            # "logprobs": True,
            # "top_logprobs": 1,
        }
        response = await self._vllm_client.post("/v1/chat/completions", json=payload)
        # Fail loudly on HTTP errors instead of hitting a confusing KeyError below.
        response.raise_for_status()
        data = response.json()
        response_text: str = (
            data["choices"][0].get("message", {}).get("content", "").strip()
        )
        return response_text
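For context, a typical call looks something like this (a quick sketch; png_bytes is just a placeholder for whatever image bytes we load):

client = VLMClient()
answer = await client("Describe this image.", image_bytes=png_bytes)

And if a wrapper would make things easier on your end, the same request could go through the official openai SDK pointed at the vLLM server. This is only a sketch of what we'd switch to, assuming openai>=1.x and that the server doesn't require a real API key:

from openai import AsyncOpenAI

# Point the OpenAI client at the vLLM server's OpenAI-compatible endpoint.
oai_client = AsyncOpenAI(base_url=f"{VLLM_URL}/v1", api_key="EMPTY")

completion = await oai_client.chat.completions.create(
    model=VLM_MODEL,
    messages=[{"role": "user", "content": message_content}],  # same content list as in VLMClient
    max_tokens=10,
)
answer = completion.choices[0].message.content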