The Nexus LLM router provides an OpenAI-compatible API that works with any OpenAI client library or HTTP client.
GET /llm/v1/models
Returns all configured models:
{
  "object": "list",
  "data": [
    {
      "id": "openai/gpt-4",
      "object": "model",
      "owned_by": "openai"
    },
    {
      "id": "anthropic/claude-3-5-sonnet-20241022",
      "object": "model",
      "owned_by": "anthropic"
    }
  ]
}
POST /llm/v1/chat/completions
Accepts the standard OpenAI chat completions request format; models are referenced with a provider prefix (for example, openai/gpt-4).
curl -X POST http://localhost:8000/llm/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "openai/gpt-4",
    "messages": [
      {"role": "user", "content": "Hello!"}
    ]
  }'
curl -X POST http://localhost:8000/llm/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "anthropic/claude-3-5-sonnet-20241022",
    "messages": [
      {"role": "system", "content": "You are a helpful assistant."},
      {"role": "user", "content": "Explain quantum computing in simple terms."}
    ],
    "temperature": 0.7,
    "max_tokens": 500
  }'
curl -X POST http://localhost:8000/llm/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "openai/gpt-4",
    "messages": [
      {"role": "user", "content": "Write a short story"}
    ],
    "stream": true
  }'
from openai import OpenAI

# Configure the client
client = OpenAI(
    base_url="http://localhost:8000/llm/v1",
    api_key="not-used"  # Required by SDK but ignored by Nexus
)

# Simple completion
response = client.chat.completions.create(
    model="openai/gpt-4",
    messages=[
        {"role": "user", "content": "Hello, how are you?"}
    ]
)
print(response.choices[0].message.content)

# Streaming
stream = client.chat.completions.create(
    model="anthropic/claude-3-5-sonnet-20241022",
    messages=[
        {"role": "user", "content": "Tell me a story"}
    ],
    stream=True
)
for chunk in stream:
    if chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="")
import OpenAI from 'openai';

// Configure the client
const openai = new OpenAI({
  baseURL: 'http://localhost:8000/llm/v1',
  apiKey: 'not-used' // Required by SDK but ignored
});

// Async/await usage
async function chat() {
  const completion = await openai.chat.completions.create({
    model: 'openai/gpt-4',
    messages: [
      { role: 'user', content: 'What is the capital of France?' }
    ]
  });
  console.log(completion.choices[0].message.content);
}

// Streaming
async function streamChat() {
  const stream = await openai.chat.completions.create({
    model: 'anthropic/claude-3-5-sonnet-20241022',
    messages: [{ role: 'user', content: 'Write a haiku' }],
    stream: true
  });
  for await (const chunk of stream) {
    process.stdout.write(chunk.choices[0]?.delta?.content || '');
  }
}
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage

# Configure for Nexus
llm = ChatOpenAI(
    base_url="http://localhost:8000/llm/v1",
    api_key="not-used",
    model="openai/gpt-4"
)

# Use with LangChain
response = llm.invoke([
    HumanMessage(content="What's the weather like?")
])
print(response.content)
import requests

def call_nexus(model, messages, **kwargs):
    response = requests.post(
        "http://localhost:8000/llm/v1/chat/completions",
        headers={"Content-Type": "application/json"},
        json={
            "model": model,
            "messages": messages,
            **kwargs
        }
    )
    return response.json()

# Use any model through the same endpoint
result = call_nexus(
    "google/gemini-1.5-pro",
    [{"role": "user", "content": "Hello!"}],
    temperature=0.5
)
print(result["choices"][0]["message"]["content"])
Tool calling is now supported across multiple providers including OpenAI, Anthropic, Google, and AWS Bedrock. The API remains consistent across all providers.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/llm/v1",
    api_key="not-used"
)

# Define a function/tool
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "City and state"
                    }
                },
                "required": ["location"]
            }
        }
    }
]

# Works with OpenAI models
response = client.chat.completions.create(
    model="openai/gpt-4",
    messages=[
        {"role": "user", "content": "What's the weather in San Francisco?"}
    ],
    tools=tools,
    tool_choice="auto"
)

# Also works with Anthropic models via Bedrock
response = client.chat.completions.create(
    model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
    messages=[
        {"role": "user", "content": "What's the weather in San Francisco?"}
    ],
    tools=tools,
    tool_choice="auto"
)

# And with Google models
response = client.chat.completions.create(
    model="google/gemini-1.5-pro",
    messages=[
        {"role": "user", "content": "What's the weather in San Francisco?"}
    ],
    tools=tools,
    tool_choice="auto"
)

# Check if the model wants to call a function
if response.choices[0].message.tool_calls:
    tool_call = response.choices[0].message.tool_calls[0]
    print(f"Function: {tool_call.function.name}")
    print(f"Arguments: {tool_call.function.arguments}")
Note on Bedrock Tool Support: While most Bedrock models support tools well, some models like Llama may have inconsistent tool calling behavior. Claude models via Bedrock provide the most reliable tool support.
When OAuth2 is enabled on the server:
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/llm/v1",
    api_key="not-used",
    default_headers={
        "Authorization": "Bearer your-oauth2-token"
    }
)
When token forwarding is enabled:
client = OpenAI(
    base_url="http://localhost:8000/llm/v1",
    api_key="not-used",
    default_headers={
        "X-Provider-API-Key": "sk-your-actual-api-key"
    }
)
To send both an OAuth2 token and a user API key:
client = OpenAI(
    base_url="http://localhost:8000/llm/v1",
    api_key="not-used",
    default_headers={
        "Authorization": "Bearer oauth-token",
        "X-Provider-API-Key": "sk-user-api-key"
    }
)
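The same headers can be sent from any HTTP client, not just the OpenAI SDKs. A sketch in the style of the requests helper shown earlier; the token and key values are placeholders:

import requests

response = requests.post(
    "http://localhost:8000/llm/v1/chat/completions",
    headers={
        "Content-Type": "application/json",
        "Authorization": "Bearer oauth-token",
        "X-Provider-API-Key": "sk-user-api-key",
    },
    json={
        "model": "openai/gpt-4",
        "messages": [{"role": "user", "content": "Hello!"}],
    },
)
print(response.json()["choices"][0]["message"]["content"])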
from openai import OpenAI, APIStatusError, OpenAIError

client = OpenAI(
    base_url="http://localhost:8000/llm/v1",
    api_key="not-used"
)

try:
    response = client.chat.completions.create(
        model="openai/gpt-4",
        messages=[{"role": "user", "content": "Hello"}]
    )
except APIStatusError as e:
    if e.status_code == 429:
        print("Rate limit exceeded, please wait")
    elif e.status_code == 404:
        print("Model not found")
    elif e.status_code == 401:
        print("Authentication failed")
    else:
        print(f"Error: {e}")
except OpenAIError as e:
    print(f"Error: {e}")
try {
  const completion = await openai.chat.completions.create({
    model: 'openai/gpt-4',
    messages: [{ role: 'user', content: 'Hello' }]
  });
} catch (error) {
  if (error.status === 429) {
    console.log('Rate limit exceeded');
  } else if (error.status === 404) {
    console.log('Model not found');
  } else {
    console.error('Error:', error.message);
  }
}
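Rate limit (429) errors are usually transient, so retrying with exponential backoff often succeeds. A minimal sketch building on the Python client and exception handling above; the attempt count and delays are illustrative:

import time
from openai import APIStatusError

def chat_with_retry(messages, model="openai/gpt-4", max_attempts=5):
    for attempt in range(max_attempts):
        try:
            return client.chat.completions.create(model=model, messages=messages)
        except APIStatusError as e:
            # Retry only rate-limit errors; re-raise everything else
            if e.status_code != 429 or attempt == max_attempts - 1:
                raise
            time.sleep(2 ** attempt)  # 1s, 2s, 4s, ... between attempts

response = chat_with_retry([{"role": "user", "content": "Hello"}])
print(response.choices[0].message.content)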
import httpx
from openai import OpenAI

# Use a custom HTTP client with connection pooling
http_client = httpx.Client(
    limits=httpx.Limits(max_connections=100)
)

client = OpenAI(
    base_url="http://localhost:8000/llm/v1",
    api_key="not-used",
    http_client=http_client
)
import asyncio
from openai import AsyncOpenAI

async def main():
    client = AsyncOpenAI(
        base_url="http://localhost:8000/llm/v1",
        api_key="not-used"
    )

    # Concurrent requests
    tasks = [
        client.chat.completions.create(
            model="openai/gpt-3.5-turbo",
            messages=[{"role": "user", "content": f"Count to {i}"}]
        )
        for i in range(1, 6)
    ]
    responses = await asyncio.gather(*tasks)

    for response in responses:
        print(response.choices[0].message.content)

asyncio.run(main())
# List all available models
curl http://localhost:8000/llm/v1/models | jq '.data[].id'

# Simple test request
curl -X POST http://localhost:8000/llm/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "openai/gpt-3.5-turbo",
    "messages": [{"role": "user", "content": "test"}],
    "max_tokens": 10
  }' | jq '.'
# Run Nexus with debug logging
nexus --log debug
- Use Model Aliases: Configure friendly names for models
- Handle Errors Gracefully: Implement proper error handling
- Set Timeouts: Configure appropriate timeouts for your use case (see the client sketch after this list)
- Use Streaming: For long responses, use streaming to improve UX
- Cache Responses: Cache frequently requested completions
- Monitor Usage: Track token usage and costs per model
- Implement Retries: Use exponential backoff for transient errors
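Timeouts and basic retries can both be set on the OpenAI SDK client itself, so they apply to every request. A minimal sketch of the client options; the values are illustrative, not recommendations:

from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/llm/v1",
    api_key="not-used",
    timeout=30.0,    # seconds before a request is abandoned
    max_retries=3,   # SDK retries transient errors with exponential backoff
)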
- Configure Rate Limiting to control usage
- Enable Token Forwarding for user keys
- Set up monitoring to track performance