The Nexus LLM router provides an OpenAI-compatible API that works with any OpenAI client library or HTTP client.

GET /llm/v1/models

Returns all configured models:

{ "object": "list", "data": [ { "id": "openai/gpt-4", "object": "model", "owned_by": "openai" }, { "id": "anthropic/claude-3-5-sonnet-20241022", "object": "model", "owned_by": "anthropic" } ] }
POST /llm/v1/chat/completions

Accepts the standard OpenAI chat completions request format. Model names use a provider prefix, for example `openai/gpt-4`.

```bash
curl -X POST http://localhost:8000/llm/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "openai/gpt-4",
    "messages": [
      {"role": "user", "content": "Hello!"}
    ]
  }'
```
```bash
curl -X POST http://localhost:8000/llm/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "anthropic/claude-3-5-sonnet-20241022",
    "messages": [
      {"role": "system", "content": "You are a helpful assistant."},
      {"role": "user", "content": "Explain quantum computing in simple terms."}
    ],
    "temperature": 0.7,
    "max_tokens": 500
  }'
```
```bash
curl -X POST http://localhost:8000/llm/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "openai/gpt-4",
    "messages": [
      {"role": "user", "content": "Write a short story"}
    ],
    "stream": true
  }'
```
```python
from openai import OpenAI

# Configure the client
client = OpenAI(
    base_url="http://localhost:8000/llm/v1",
    api_key="not-used"  # Required by SDK but ignored by Nexus
)

# Simple completion
response = client.chat.completions.create(
    model="openai/gpt-4",
    messages=[
        {"role": "user", "content": "Hello, how are you?"}
    ]
)
print(response.choices[0].message.content)

# Streaming
stream = client.chat.completions.create(
    model="anthropic/claude-3-5-sonnet-20241022",
    messages=[
        {"role": "user", "content": "Tell me a story"}
    ],
    stream=True
)

for chunk in stream:
    if chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="")
```
```javascript
import OpenAI from 'openai';

// Configure the client
const openai = new OpenAI({
  baseURL: 'http://localhost:8000/llm/v1',
  apiKey: 'not-used' // Required by SDK but ignored
});

// Async/await usage
async function chat() {
  const completion = await openai.chat.completions.create({
    model: 'openai/gpt-4',
    messages: [
      { role: 'user', content: 'What is the capital of France?' }
    ]
  });

  console.log(completion.choices[0].message.content);
}

// Streaming
async function streamChat() {
  const stream = await openai.chat.completions.create({
    model: 'anthropic/claude-3-5-sonnet-20241022',
    messages: [{ role: 'user', content: 'Write a haiku' }],
    stream: true
  });

  for await (const chunk of stream) {
    process.stdout.write(chunk.choices[0]?.delta?.content || '');
  }
}
```
```python
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage

# Configure for Nexus
llm = ChatOpenAI(
    base_url="http://localhost:8000/llm/v1",
    api_key="not-used",
    model="openai/gpt-4"
)

# Use with LangChain
response = llm.invoke([
    HumanMessage(content="What's the weather like?")
])
print(response.content)
```
```python
import requests

def call_nexus(model, messages, **kwargs):
    response = requests.post(
        "http://localhost:8000/llm/v1/chat/completions",
        headers={"Content-Type": "application/json"},
        json={
            "model": model,
            "messages": messages,
            **kwargs
        }
    )
    return response.json()

# Use any model through the same endpoint
result = call_nexus(
    "google/gemini-1.5-pro",
    [{"role": "user", "content": "Hello!"}],
    temperature=0.5
)
print(result["choices"][0]["message"]["content"])
```

Tool calling is now supported across multiple providers including OpenAI, Anthropic, Google, and AWS Bedrock. The API remains consistent across all providers.

```python
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/llm/v1",
    api_key="not-used"
)

# Define a function/tool
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "City and state"
                    }
                },
                "required": ["location"]
            }
        }
    }
]

# Works with OpenAI models
response = client.chat.completions.create(
    model="openai/gpt-4",
    messages=[
        {"role": "user", "content": "What's the weather in San Francisco?"}
    ],
    tools=tools,
    tool_choice="auto"
)

# Also works with Anthropic models via Bedrock
response = client.chat.completions.create(
    model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
    messages=[
        {"role": "user", "content": "What's the weather in San Francisco?"}
    ],
    tools=tools,
    tool_choice="auto"
)

# And with Google models
response = client.chat.completions.create(
    model="google/gemini-1.5-pro",
    messages=[
        {"role": "user", "content": "What's the weather in San Francisco?"}
    ],
    tools=tools,
    tool_choice="auto"
)

# Check if model wants to call a function
if response.choices[0].message.tool_calls:
    tool_call = response.choices[0].message.tool_calls[0]
    print(f"Function: {tool_call.function.name}")
    print(f"Arguments: {tool_call.function.arguments}")
```

Note on Bedrock Tool Support: While most Bedrock models support tools well, some models like Llama may have inconsistent tool calling behavior. Claude models via Bedrock provide the most reliable tool support.
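Continuing the example above, completing the tool-call loop should follow the standard OpenAI pattern of appending the assistant turn and a `tool`-role result message before requesting the final answer. The sketch below reuses `client` and `tools` from the previous block and assumes a local `get_weather` implementation, which is not part of Nexus:

```python
import json

def get_weather(location: str) -> str:
    # Hypothetical local implementation; replace with a real lookup
    return json.dumps({"location": location, "forecast": "sunny", "temp_f": 68})

messages = [{"role": "user", "content": "What's the weather in San Francisco?"}]

response = client.chat.completions.create(
    model="openai/gpt-4",
    messages=messages,
    tools=tools,
    tool_choice="auto"
)

message = response.choices[0].message
if message.tool_calls:
    # Append the assistant turn that requested the tool call
    messages.append(message)
    for tool_call in message.tool_calls:
        args = json.loads(tool_call.function.arguments)
        # Return the result using the standard OpenAI "tool" role
        messages.append({
            "role": "tool",
            "tool_call_id": tool_call.id,
            "content": get_weather(**args)
        })
    # Ask the model for a final answer based on the tool output
    final = client.chat.completions.create(model="openai/gpt-4", messages=messages)
    print(final.choices[0].message.content)
```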

When OAuth2 is enabled on the server:

```python
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/llm/v1",
    api_key="not-used",
    default_headers={
        "Authorization": "Bearer your-oauth2-token"
    }
)
```

When token forwarding is enabled:

```python
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/llm/v1",
    api_key="not-used",
    default_headers={
        "X-Provider-API-Key": "sk-your-actual-api-key"
    }
)
```

When both OAuth2 and token forwarding are enabled, send both headers:

```python
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/llm/v1",
    api_key="not-used",
    default_headers={
        "Authorization": "Bearer oauth-token",
        "X-Provider-API-Key": "sk-user-api-key"
    }
)
```
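The same headers can be sent from any HTTP client; an illustrative curl equivalent of the combined setup above (token values are placeholders):

```bash
curl -X POST http://localhost:8000/llm/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer oauth-token" \
  -H "X-Provider-API-Key: sk-user-api-key" \
  -d '{
    "model": "openai/gpt-4",
    "messages": [{"role": "user", "content": "Hello!"}]
  }'
```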
```python
from openai import OpenAI, APIStatusError, OpenAIError

client = OpenAI(
    base_url="http://localhost:8000/llm/v1",
    api_key="not-used"
)

try:
    response = client.chat.completions.create(
        model="openai/gpt-4",
        messages=[{"role": "user", "content": "Hello"}]
    )
except APIStatusError as e:
    # APIStatusError carries the HTTP status code returned by the router
    if e.status_code == 429:
        print("Rate limit exceeded, please wait")
    elif e.status_code == 404:
        print("Model not found")
    elif e.status_code == 401:
        print("Authentication failed")
    else:
        print(f"Error: {e}")
except OpenAIError as e:
    # Connection errors, timeouts, and other non-HTTP failures
    print(f"Error: {e}")
```
```javascript
try {
  const completion = await openai.chat.completions.create({
    model: 'openai/gpt-4',
    messages: [{ role: 'user', content: 'Hello' }]
  });
} catch (error) {
  if (error.status === 429) {
    console.log('Rate limit exceeded');
  } else if (error.status === 404) {
    console.log('Model not found');
  } else {
    console.error('Error:', error.message);
  }
}
```
```python
import httpx
from openai import OpenAI

# Use custom HTTP client with connection pooling
http_client = httpx.Client(
    limits=httpx.Limits(max_connections=100)
)

client = OpenAI(
    base_url="http://localhost:8000/llm/v1",
    api_key="not-used",
    http_client=http_client
)
```
```python
import asyncio
from openai import AsyncOpenAI

async def main():
    client = AsyncOpenAI(
        base_url="http://localhost:8000/llm/v1",
        api_key="not-used"
    )

    # Concurrent requests
    tasks = [
        client.chat.completions.create(
            model="openai/gpt-3.5-turbo",
            messages=[{"role": "user", "content": f"Count to {i}"}]
        )
        for i in range(1, 6)
    ]

    responses = await asyncio.gather(*tasks)
    for response in responses:
        print(response.choices[0].message.content)

asyncio.run(main())
```
```bash
# List all available models
curl http://localhost:8000/llm/v1/models | jq '.data[].id'

# Simple test request
curl -X POST http://localhost:8000/llm/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "openai/gpt-3.5-turbo",
    "messages": [{"role": "user", "content": "test"}],
    "max_tokens": 10
  }' | jq '.'
```
```bash
# Run Nexus with debug logging
nexus --log debug
```
  1. Use Model Aliases: Configure friendly names for models
  2. Handle Errors Gracefully: Implement proper error handling
  3. Set Timeouts: Configure appropriate timeouts for your use case
  4. Use Streaming: For long responses, use streaming to improve UX
  5. Cache Responses: Cache frequently requested completions
  6. Monitor Usage: Track token usage and costs per model
  7. Implement Retries: Use exponential backoff for transient errors (see the sketch after this list)
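For items 3 and 7, a minimal sketch of per-request timeouts and exponential-backoff retries on top of the OpenAI SDK (the timeout value, status codes, and retry counts here are illustrative, not Nexus requirements):

```python
import time
from openai import OpenAI, APIStatusError, APITimeoutError

# timeout and max_retries are standard OpenAI SDK client options
client = OpenAI(
    base_url="http://localhost:8000/llm/v1",
    api_key="not-used",
    timeout=30.0,    # seconds per request
    max_retries=0,   # disable SDK retries; handled manually below
)

def complete_with_backoff(model, messages, attempts=5, base_delay=1.0):
    """Retry transient failures (timeouts, 429, 5xx) with exponential backoff."""
    for attempt in range(attempts):
        try:
            return client.chat.completions.create(model=model, messages=messages)
        except APITimeoutError:
            pass  # treat timeouts as transient
        except APIStatusError as e:
            if e.status_code not in (429, 500, 502, 503, 504):
                raise  # non-transient error: surface it immediately
        time.sleep(base_delay * (2 ** attempt))
    raise RuntimeError("request failed after retries")

response = complete_with_backoff(
    "openai/gpt-3.5-turbo",
    [{"role": "user", "content": "Hello!"}],
)
print(response.choices[0].message.content)
```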