Examples
This page provides comprehensive examples of using MonoLLM for various tasks and scenarios.
Basic Examples
Simple Text Generation
import asyncio
from monollm import UnifiedLLMClient, RequestConfig

async def simple_generation():
    async with UnifiedLLMClient() as client:
        config = RequestConfig(model="qwen-plus")
        response = await client.generate(
            "What are the benefits of renewable energy?",
            config
        )
        print(response.content)

asyncio.run(simple_generation())
Streaming Response
import asyncio
from monollm import UnifiedLLMClient, RequestConfig

async def streaming_example():
    async with UnifiedLLMClient() as client:
        config = RequestConfig(
            model="qwen-plus",
            stream=True
        )

        print("Response: ", end="", flush=True)
        streaming_response = await client.generate_stream(
            "Write a short poem about artificial intelligence.",
            config
        )
        async for chunk in streaming_response:
            if chunk.content:
                print(chunk.content, end="", flush=True)
        print()  # New line

asyncio.run(streaming_example())
Advanced Examples
Reasoning with QwQ
import asyncio
from monollm import UnifiedLLMClient, RequestConfig

async def reasoning_example():
    async with UnifiedLLMClient() as client:
        config = RequestConfig(
            model="qwq-32b",
            show_thinking=True,
            max_tokens=1500
        )

        response = await client.generate(
            """
            A farmer has 17 sheep. All but 9 die. How many sheep are left?
            Think through this step by step.
            """,
            config
        )

        if response.thinking:
            print("🤔 Thinking process:")
            print(response.thinking)
            print("\n" + "=" * 50 + "\n")

        print("📝 Final answer:")
        print(response.content)

asyncio.run(reasoning_example())
Multi-turn Conversation
import asyncio
from monollm import UnifiedLLMClient, RequestConfig, Message

async def conversation_example():
    async with UnifiedLLMClient() as client:
        config = RequestConfig(model="qwen-plus")

        # Initialize conversation
        messages = [
            Message(role="system", content="You are a helpful coding assistant."),
            Message(role="user", content="How do I create a list in Python?"),
        ]

        # First exchange
        response = await client.generate(messages, config)
        print("Assistant:", response.content)

        # Continue conversation
        messages.append(Message(role="assistant", content=response.content))
        messages.append(Message(
            role="user",
            content="Can you show me how to add items to that list?"
        ))

        # Second exchange
        response = await client.generate(messages, config)
        print("Assistant:", response.content)

        # Third exchange
        messages.append(Message(role="assistant", content=response.content))
        messages.append(Message(
            role="user",
            content="What about removing items?"
        ))
        response = await client.generate(messages, config)
        print("Assistant:", response.content)

asyncio.run(conversation_example())
Provider Comparison
import asyncio
from monollm import UnifiedLLMClient, RequestConfig

async def compare_providers():
    async with UnifiedLLMClient() as client:
        prompt = "Explain machine learning in one paragraph."

        # Test different models
        models = [
            ("qwen-plus", "Qwen Plus"),
            ("qwq-32b", "QwQ 32B"),
            ("claude-3-5-sonnet-20241022", "Claude 3.5 Sonnet"),
            ("gpt-4o", "GPT-4o"),
        ]

        for model_id, model_name in models:
            try:
                config = RequestConfig(model=model_id, max_tokens=200)
                response = await client.generate(prompt, config)
                print(f"\n{model_name}:")
                print("-" * len(model_name))
                print(response.content)
                if response.usage:
                    print(f"Tokens: {response.usage.total_tokens}")
            except Exception as e:
                print(f"\n{model_name}: Error - {e}")

asyncio.run(compare_providers())
Use Case Examples
Content Generation
import asyncio
from monollm import UnifiedLLMClient, RequestConfig

async def content_generation():
    async with UnifiedLLMClient() as client:
        config = RequestConfig(
            model="qwen-plus",
            temperature=0.8,  # Higher creativity
            max_tokens=1000
        )

        # Blog post generation
        blog_prompt = """
        Write a blog post about the future of electric vehicles.
        Include:
        - Current market trends
        - Technological advances
        - Environmental impact
        - Challenges and opportunities
        Make it engaging and informative for a general audience.
        """

        response = await client.generate(blog_prompt, config)
        print("Blog Post:")
        print("=" * 50)
        print(response.content)

asyncio.run(content_generation())
Code Generation and Review
import asyncio
from monollm import UnifiedLLMClient, RequestConfig

async def code_assistance():
    async with UnifiedLLMClient() as client:
        config = RequestConfig(
            model="qwq-32b",  # Good for reasoning about code
            temperature=0.2   # Lower temperature for code
        )

        # Code generation
        code_prompt = """
        Create a Python function that:
        1. Takes a list of numbers
        2. Removes duplicates
        3. Sorts the list in descending order
        4. Returns the top 3 numbers
        Include error handling and a docstring.
        """
        response = await client.generate(code_prompt, config)
        print("Generated Code:")
        print("-" * 30)
        print(response.content)

        # Code review
        review_prompt = """
        Review this Python code and suggest improvements:

        def process_data(data):
            result = []
            for item in data:
                if item not in result:
                    result.append(item)
            result.sort(reverse=True)
            return result[:3]
        """
        response = await client.generate(review_prompt, config)
        print("\nCode Review:")
        print("-" * 30)
        print(response.content)

asyncio.run(code_assistance())
Data Analysis
import asyncio
from monollm import UnifiedLLMClient, RequestConfig

async def data_analysis():
    async with UnifiedLLMClient() as client:
        config = RequestConfig(
            model="qwq-32b",
            show_thinking=True,
            max_tokens=1500
        )

        analysis_prompt = """
        Analyze this sales data and provide insights:

        Q1 2024: $125,000 (15% increase from Q1 2023)
        Q2 2024: $140,000 (12% increase from Q1 2024)
        Q3 2024: $135,000 (3.6% decrease from Q2 2024)
        Q4 2024: $160,000 (18.5% increase from Q3 2024)

        Provide:
        1. Trend analysis
        2. Seasonal patterns
        3. Growth rate calculations
        4. Recommendations for Q1 2025
        """

        response = await client.generate(analysis_prompt, config)

        if response.thinking:
            print("Analysis Process:")
            print(response.thinking)
            print("\n" + "=" * 50 + "\n")

        print("Analysis Results:")
        print(response.content)

asyncio.run(data_analysis())
Creative Writing
import asyncio
from monollm import UnifiedLLMClient, RequestConfig

async def creative_writing():
    async with UnifiedLLMClient() as client:
        config = RequestConfig(
            model="qwen-plus",
            temperature=1.0,  # Maximum creativity
            max_tokens=2000
        )

        story_prompt = """
        Write a short science fiction story (500-800 words) about:
        - A world where AI and humans collaborate seamlessly
        - A discovery that changes everything
        - An unexpected friendship
        Make it engaging with vivid descriptions and dialogue.
        """

        response = await client.generate(story_prompt, config)
        print("Science Fiction Story:")
        print("=" * 50)
        print(response.content)

        if response.usage:
            print(f"\nWord count estimate: ~{response.usage.completion_tokens * 0.75:.0f} words")

asyncio.run(creative_writing())
Error Handling Examples
Robust Error Handling
import asyncio
from monollm import UnifiedLLMClient, RequestConfig
from monollm.core.exceptions import (
    MonoLLMError,
    ProviderError,
    ConnectionError,
    ConfigurationError
)

async def robust_generation(prompt: str, model: str):
    """Generate text with comprehensive error handling."""
    async with UnifiedLLMClient() as client:
        try:
            config = RequestConfig(
                model=model,
                max_tokens=500,
                temperature=0.7
            )
            response = await client.generate(prompt, config)
            return response.content
        except ConfigurationError as e:
            print(f"Configuration error: {e}")
            print("Please check your API keys and configuration.")
            return None
        except ConnectionError as e:
            print(f"Connection error: {e}")
            print("Please check your internet connection and proxy settings.")
            return None
        except ProviderError as e:
            print(f"Provider error: {e}")
            print("The AI provider encountered an error.")
            return None
        except MonoLLMError as e:
            print(f"MonoLLM error: {e}")
            return None
        except Exception as e:
            print(f"Unexpected error: {e}")
            return None

async def main():
    # Test with a valid model
    result = await robust_generation("Hello, world!", "qwen-plus")
    if result:
        print("Success:", result)

    # Test with an invalid model
    result = await robust_generation("Hello, world!", "invalid-model")
    if not result:
        print("Handled invalid model gracefully")

asyncio.run(main())
Retry Logic
import asyncio
from monollm import UnifiedLLMClient, RequestConfig
from monollm.core.exceptions import ProviderError

async def generate_with_retry(prompt: str, model: str, max_retries: int = 3):
    """Generate text with retry logic."""
    async with UnifiedLLMClient() as client:
        config = RequestConfig(model=model)

        for attempt in range(max_retries):
            try:
                response = await client.generate(prompt, config)
                return response.content
            except ProviderError as e:
                if attempt < max_retries - 1:
                    wait_time = 2 ** attempt  # Exponential backoff
                    print(f"Attempt {attempt + 1} failed: {e}")
                    print(f"Retrying in {wait_time} seconds...")
                    await asyncio.sleep(wait_time)
                else:
                    print(f"All {max_retries} attempts failed")
                    raise
            except Exception as e:
                # Don't retry for non-provider errors
                print(f"Non-retryable error: {e}")
                raise

async def main():
    try:
        result = await generate_with_retry(
            "What is the capital of France?",
            "qwen-plus"
        )
        print("Result:", result)
    except Exception as e:
        print("Final error:", e)

asyncio.run(main())
Performance Examples
Concurrent Requests
import asyncio
import time

from monollm import UnifiedLLMClient, RequestConfig

async def concurrent_generation():
    """Generate multiple responses concurrently."""
    async with UnifiedLLMClient() as client:
        config = RequestConfig(model="qwen-plus", max_tokens=100)

        # Define multiple prompts
        prompts = [
            "What is artificial intelligence?",
            "Explain quantum computing.",
            "What are the benefits of renewable energy?",
            "How does machine learning work?",
            "What is blockchain technology?"
        ]

        # Create tasks for concurrent execution
        tasks = [
            client.generate(prompt, config)
            for prompt in prompts
        ]

        # Execute all tasks concurrently
        start_time = time.time()
        responses = await asyncio.gather(*tasks)
        end_time = time.time()

        # Display results
        for i, (prompt, response) in enumerate(zip(prompts, responses)):
            print(f"\nQuestion {i+1}: {prompt}")
            print(f"Answer: {response.content[:100]}...")

        print(f"\nTotal time: {end_time - start_time:.2f} seconds")
        print(f"Average time per request: {(end_time - start_time) / len(prompts):.2f} seconds")

asyncio.run(concurrent_generation())
Batch Processing
import asyncio
from monollm import UnifiedLLMClient, RequestConfig

async def batch_processing():
    """Process a batch of texts efficiently."""
    async with UnifiedLLMClient() as client:
        config = RequestConfig(
            model="qwen-plus",
            temperature=0.3,
            max_tokens=50
        )

        # Sample data to process
        texts = [
            "The weather is beautiful today.",
            "I love programming in Python.",
            "Machine learning is fascinating.",
            "The sunset was absolutely stunning.",
            "Coffee helps me stay productive."
        ]

        # Process in batches to avoid overwhelming the API
        batch_size = 3
        results = []

        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]

            # Create tasks for this batch
            tasks = [
                client.generate(
                    f"Analyze the sentiment of this text: '{text}'",
                    config
                )
                for text in batch
            ]

            # Process batch
            batch_responses = await asyncio.gather(*tasks)
            results.extend(batch_responses)

            # Small delay between batches
            if i + batch_size < len(texts):
                await asyncio.sleep(0.5)

        # Display results
        for text, response in zip(texts, results):
            print(f"Text: {text}")
            print(f"Analysis: {response.content}")
            print("-" * 50)

asyncio.run(batch_processing())
Integration Examples
Web Application Integration
# Example with FastAPI
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

from monollm import UnifiedLLMClient, RequestConfig

app = FastAPI()

# Global client instance
llm_client = None

class GenerationRequest(BaseModel):
    prompt: str
    model: str = "qwen-plus"
    temperature: float = 0.7
    max_tokens: int = 500

class GenerationResponse(BaseModel):
    content: str
    model: str
    provider: str
    tokens_used: int

@app.on_event("startup")
async def startup_event():
    global llm_client
    llm_client = UnifiedLLMClient()
    await llm_client.initialize()

@app.on_event("shutdown")
async def shutdown_event():
    global llm_client
    if llm_client:
        await llm_client.close()

@app.post("/generate", response_model=GenerationResponse)
async def generate_text(request: GenerationRequest):
    try:
        config = RequestConfig(
            model=request.model,
            temperature=request.temperature,
            max_tokens=request.max_tokens
        )
        response = await llm_client.generate(request.prompt, config)
        return GenerationResponse(
            content=response.content,
            model=response.model,
            provider=response.provider,
            tokens_used=response.usage.total_tokens if response.usage else 0
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

@app.get("/models")
async def list_models():
    models = llm_client.list_models()
    return models

# Run with: uvicorn main:app --reload
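Once the service is running, any HTTP client can call the /generate endpoint defined above. The snippet below is a minimal sketch using the third-party httpx library; it assumes the server is listening on http://localhost:8000 (uvicorn's default address), and the prompt and parameter values are only placeholders.

# Minimal client sketch for the FastAPI service above (assumes httpx is installed
# and the server is running locally on uvicorn's default port 8000).
import httpx

payload = {
    "prompt": "Summarize the benefits of unified LLM access.",
    "model": "qwen-plus",
    "temperature": 0.7,
    "max_tokens": 200,
}

with httpx.Client(base_url="http://localhost:8000", timeout=60.0) as http:
    result = http.post("/generate", json=payload)
    result.raise_for_status()
    data = result.json()
    print(data["content"])
    print(f"Provider: {data['provider']}, tokens used: {data['tokens_used']}")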
CLI Application
#!/usr/bin/env python3
"""
Simple CLI application using MonoLLM
"""
import asyncio
import argparse

from monollm import UnifiedLLMClient, RequestConfig

async def interactive_chat():
    """Interactive chat session."""
    async with UnifiedLLMClient() as client:
        config = RequestConfig(model="qwen-plus")

        print("MonoLLM Interactive Chat")
        print("Type 'quit' to exit")
        print("-" * 30)

        while True:
            try:
                user_input = input("\nYou: ").strip()
                if user_input.lower() in ['quit', 'exit', 'q']:
                    break
                if not user_input:
                    continue

                print("AI: ", end="", flush=True)

                # Use streaming for better UX
                config.stream = True
                streaming_response = await client.generate_stream(user_input, config)
                async for chunk in streaming_response:
                    if chunk.content:
                        print(chunk.content, end="", flush=True)
                print()  # New line
            except KeyboardInterrupt:
                print("\nGoodbye!")
                break
            except Exception as e:
                print(f"Error: {e}")

async def single_generation(prompt: str, model: str):
    """Single text generation."""
    async with UnifiedLLMClient() as client:
        config = RequestConfig(model=model)
        response = await client.generate(prompt, config)
        print(response.content)

def main():
    parser = argparse.ArgumentParser(description="MonoLLM CLI")
    parser.add_argument("--interactive", "-i", action="store_true",
                        help="Start interactive chat")
    parser.add_argument("--prompt", "-p", type=str,
                        help="Single prompt to process")
    parser.add_argument("--model", "-m", type=str, default="qwen-plus",
                        help="Model to use")
    args = parser.parse_args()

    if args.interactive:
        asyncio.run(interactive_chat())
    elif args.prompt:
        asyncio.run(single_generation(args.prompt, args.model))
    else:
        parser.print_help()

if __name__ == "__main__":
    main()
Best Practices
- Always use async context managers for proper resource management
- Handle errors gracefully with appropriate exception handling
- Use appropriate models for different tasks (reasoning vs. general)
- Set reasonable token limits to control costs
- Implement retry logic for production applications
- Use streaming for a better user experience with long responses
- Cache responses when appropriate to reduce API calls
- Monitor usage and implement rate limiting if needed (a combined caching and rate-limiting sketch follows this list)
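The last two practices, caching and rate limiting, are not covered by the examples above. The sketch below shows one minimal way to do both on the client side, reusing the same UnifiedLLMClient API as the rest of this page; the module-level cache dictionary, the cached_generate helper, and the semaphore limit of 2 are illustrative choices, not part of MonoLLM.

import asyncio
from monollm import UnifiedLLMClient, RequestConfig

# Hypothetical helper, not part of the MonoLLM API: a simple in-memory cache
# keyed on (model, prompt), plus an asyncio.Semaphore that caps concurrency.
_cache: dict[tuple[str, str], str] = {}

async def cached_generate(client, limiter, prompt: str, model: str) -> str:
    key = (model, prompt)
    if key in _cache:
        return _cache[key]      # served from the cache, no API call made
    async with limiter:         # simple client-side rate limiting
        config = RequestConfig(model=model, max_tokens=200)
        response = await client.generate(prompt, config)
    _cache[key] = response.content
    return response.content

async def main():
    limiter = asyncio.Semaphore(2)  # at most 2 requests in flight at once
    async with UnifiedLLMClient() as client:
        # The second call with the same prompt is answered from the cache.
        print(await cached_generate(client, limiter, "What is MonoLLM?", "qwen-plus"))
        print(await cached_generate(client, limiter, "What is MonoLLM?", "qwen-plus"))

asyncio.run(main())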
These examples demonstrate the flexibility and power of MonoLLM across a variety of use cases. Adapt them to your own needs.