The Nexus LLM router provides an OpenAI-compatible API that works with any OpenAI client library or HTTP client.
GET /llm/v1/models
Returns all configured models:
{
  "object": "list",
  "data": [
    {
      "id": "openai/gpt-4",
      "object": "model",
      "owned_by": "openai"
    },
    {
      "id": "anthropic/claude-3-5-sonnet-20241022",
      "object": "model",
      "owned_by": "anthropic"
    }
  ]
}
POST /llm/v1/chat/completions
Accepts the standard OpenAI chat completions request format; models are referenced with a provider prefix (for example, openai/gpt-4).
curl -X POST http://localhost:8000/llm/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "openai/gpt-4",
    "messages": [
      {"role": "user", "content": "Hello!"}
    ]
  }'
curl -X POST http://localhost:8000/llm/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "anthropic/claude-3-5-sonnet-20241022",
    "messages": [
      {"role": "system", "content": "You are a helpful assistant."},
      {"role": "user", "content": "Explain quantum computing in simple terms."}
    ],
    "temperature": 0.7,
    "max_tokens": 500
  }'
curl -X POST http://localhost:8000/llm/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "openai/gpt-4",
    "messages": [
      {"role": "user", "content": "Write a short story"}
    ],
    "stream": true
  }'
from openai import OpenAI

# Configure the client
client = OpenAI(
    base_url="http://localhost:8000/llm/v1",
    api_key="not-used"  # Required by SDK but ignored by Nexus
)

# Simple completion
response = client.chat.completions.create(
    model="openai/gpt-4",
    messages=[
        {"role": "user", "content": "Hello, how are you?"}
    ]
)
print(response.choices[0].message.content)

# Streaming
stream = client.chat.completions.create(
    model="anthropic/claude-3-5-sonnet-20241022",
    messages=[
        {"role": "user", "content": "Tell me a story"}
    ],
    stream=True
)
for chunk in stream:
    if chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="")
import OpenAI from 'openai';

// Configure the client
const openai = new OpenAI({
  baseURL: 'http://localhost:8000/llm/v1',
  apiKey: 'not-used' // Required by SDK but ignored
});

// Async/await usage
async function chat() {
  const completion = await openai.chat.completions.create({
    model: 'openai/gpt-4',
    messages: [
      { role: 'user', content: 'What is the capital of France?' }
    ]
  });
  console.log(completion.choices[0].message.content);
}

// Streaming
async function streamChat() {
  const stream = await openai.chat.completions.create({
    model: 'anthropic/claude-3-5-sonnet-20241022',
    messages: [{ role: 'user', content: 'Write a haiku' }],
    stream: true
  });
  for await (const chunk of stream) {
    process.stdout.write(chunk.choices[0]?.delta?.content || '');
  }
}
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage

# Configure for Nexus
llm = ChatOpenAI(
    base_url="http://localhost:8000/llm/v1",
    api_key="not-used",
    model="openai/gpt-4"
)

# Use with LangChain
response = llm.invoke([
    HumanMessage(content="What's the weather like?")
])
print(response.content)
import requests

def call_nexus(model, messages, **kwargs):
    response = requests.post(
        "http://localhost:8000/llm/v1/chat/completions",
        headers={"Content-Type": "application/json"},
        json={
            "model": model,
            "messages": messages,
            **kwargs
        }
    )
    return response.json()

# Use any model through the same endpoint
result = call_nexus(
    "google/gemini-1.5-pro",
    [{"role": "user", "content": "Hello!"}],
    temperature=0.5
)
print(result["choices"][0]["message"]["content"])
Tool calling is now supported across multiple providers including OpenAI, Anthropic, Google, and AWS Bedrock. The API remains consistent across all providers.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/llm/v1",
    api_key="not-used"
)

# Define a function/tool
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "City and state"
                    }
                },
                "required": ["location"]
            }
        }
    }
]

# Works with OpenAI models
response = client.chat.completions.create(
    model="openai/gpt-4",
    messages=[
        {"role": "user", "content": "What's the weather in San Francisco?"}
    ],
    tools=tools,
    tool_choice="auto"
)

# Also works with Anthropic models via Bedrock
response = client.chat.completions.create(
    model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
    messages=[
        {"role": "user", "content": "What's the weather in San Francisco?"}
    ],
    tools=tools,
    tool_choice="auto"
)

# And with Google models
response = client.chat.completions.create(
    model="google/gemini-1.5-pro",
    messages=[
        {"role": "user", "content": "What's the weather in San Francisco?"}
    ],
    tools=tools,
    tool_choice="auto"
)

# Check if the model wants to call a function
if response.choices[0].message.tool_calls:
    tool_call = response.choices[0].message.tool_calls[0]
    print(f"Function: {tool_call.function.name}")
    print(f"Arguments: {tool_call.function.arguments}")
Note on Bedrock Tool Support: While most Bedrock models support tools well, some models like Llama may have inconsistent tool calling behavior. Claude models via Bedrock provide the most reliable tool support.
When OAuth2 is enabled on the server:
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/llm/v1",
    api_key="not-used",
    default_headers={
        "Authorization": "Bearer your-oauth2-token"
    }
)
When token forwarding is enabled:
client = OpenAI(
    base_url="http://localhost:8000/llm/v1",
    api_key="not-used",
    default_headers={
        "X-Provider-API-Key": "sk-your-actual-api-key"
    }
)
To send both an OAuth2 token and a user API key:
client = OpenAI(
    base_url="http://localhost:8000/llm/v1",
    api_key="not-used",
    default_headers={
        "Authorization": "Bearer oauth-token",
        "X-Provider-API-Key": "sk-user-api-key"
    }
)
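The same headers can be sent from any HTTP client, not just the OpenAI SDKs. A sketch in the style of the requests helper shown earlier; the token and key values are placeholders:

import requests

response = requests.post(
    "http://localhost:8000/llm/v1/chat/completions",
    headers={
        "Content-Type": "application/json",
        "Authorization": "Bearer oauth-token",
        "X-Provider-API-Key": "sk-user-api-key",
    },
    json={
        "model": "openai/gpt-4",
        "messages": [{"role": "user", "content": "Hello!"}],
    },
)
print(response.json()["choices"][0]["message"]["content"])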
from openai import OpenAI, APIStatusError, OpenAIError

client = OpenAI(
    base_url="http://localhost:8000/llm/v1",
    api_key="not-used"
)

try:
    response = client.chat.completions.create(
        model="openai/gpt-4",
        messages=[{"role": "user", "content": "Hello"}]
    )
except APIStatusError as e:
    if e.status_code == 429:
        print("Rate limit exceeded, please wait")
    elif e.status_code == 404:
        print("Model not found")
    elif e.status_code == 401:
        print("Authentication failed")
    else:
        print(f"Error: {e}")
except OpenAIError as e:
    print(f"Error: {e}")
try {
  const completion = await openai.chat.completions.create({
    model: 'openai/gpt-4',
    messages: [{ role: 'user', content: 'Hello' }]
  });
} catch (error) {
  if (error.status === 429) {
    console.log('Rate limit exceeded');
  } else if (error.status === 404) {
    console.log('Model not found');
  } else {
    console.error('Error:', error.message);
  }
}
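Rate limit (429) errors are usually transient, so retrying with exponential backoff often succeeds. A minimal sketch building on the Python client and exception handling above; the attempt count and delays are illustrative:

import time
from openai import APIStatusError

def chat_with_retry(messages, model="openai/gpt-4", max_attempts=5):
    for attempt in range(max_attempts):
        try:
            return client.chat.completions.create(model=model, messages=messages)
        except APIStatusError as e:
            # Retry only rate-limit errors; re-raise everything else
            if e.status_code != 429 or attempt == max_attempts - 1:
                raise
            time.sleep(2 ** attempt)  # 1s, 2s, 4s, ... between attempts

response = chat_with_retry([{"role": "user", "content": "Hello"}])
print(response.choices[0].message.content)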
import httpx
from openai import OpenAI

# Use a custom HTTP client with connection pooling
http_client = httpx.Client(
    limits=httpx.Limits(max_connections=100)
)

client = OpenAI(
    base_url="http://localhost:8000/llm/v1",
    api_key="not-used",
    http_client=http_client
)
import asyncio
from openai import AsyncOpenAI

async def main():
    client = AsyncOpenAI(
        base_url="http://localhost:8000/llm/v1",
        api_key="not-used"
    )

    # Concurrent requests
    tasks = [
        client.chat.completions.create(
            model="openai/gpt-3.5-turbo",
            messages=[{"role": "user", "content": f"Count to {i}"}]
        )
        for i in range(1, 6)
    ]
    responses = await asyncio.gather(*tasks)

    for response in responses:
        print(response.choices[0].message.content)

asyncio.run(main())
# List all available models
curl http://localhost:8000/llm/v1/models | jq '.data[].id'

# Simple test request
curl -X POST http://localhost:8000/llm/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "openai/gpt-3.5-turbo",
    "messages": [{"role": "user", "content": "test"}],
    "max_tokens": 10
  }' | jq '.'
# Run Nexus with debug logging
nexus --log debug
- Use Model Aliases: Configure friendly names for models
- Handle Errors Gracefully: Implement proper error handling
- Set Timeouts: Configure appropriate timeouts for your use case (see the client sketch after this list)
- Use Streaming: For long responses, use streaming to improve UX
- Cache Responses: Cache frequently requested completions
- Monitor Usage: Track token usage and costs per model
- Implement Retries: Use exponential backoff for transient errors
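Timeouts and basic retries can both be set on the OpenAI SDK client itself, so they apply to every request. A minimal sketch of the client options; the values are illustrative, not recommendations:

from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/llm/v1",
    api_key="not-used",
    timeout=30.0,    # seconds before a request is abandoned
    max_retries=3,   # SDK retries transient errors with exponential backoff
)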
- Configure Rate Limiting to control usage
- Enable Token Forwarding for user keys
- Set up monitoring to track performance