Custom Backend Examples
This guide demonstrates how to use the different computational backends in Tensor Frame effectively, including when to switch backends, strategies for performance optimization, and mixed backend workflows.
Backend Selection Strategies
Automatic vs Manual Selection
use tensor_frame::{Tensor, BackendType, Result};
use std::time::Instant;
fn backend_selection_demo() -> Result<()> {
println!("=== Backend Selection Strategies ===\n");
// Automatic selection (recommended for most cases)
let auto_tensor = Tensor::zeros(vec![1000, 1000])?;
println!("Automatic backend selected: {:?}", auto_tensor.backend_type());
// Manual backend specification
let cpu_tensor = auto_tensor.to_backend(BackendType::Cpu)?;
println!("Forced CPU backend: {:?}", cpu_tensor.backend_type());
#[cfg(feature = "wgpu")]
{
match auto_tensor.to_backend(BackendType::Wgpu) {
Ok(wgpu_tensor) => {
println!("WGPU backend available: {:?}", wgpu_tensor.backend_type());
}
Err(e) => {
println!("WGPU backend not available: {}", e);
}
}
}
#[cfg(feature = "cuda")]
{
match auto_tensor.to_backend(BackendType::Cuda) {
Ok(cuda_tensor) => {
println!("CUDA backend available: {:?}", cuda_tensor.backend_type());
}
Err(e) => {
println!("CUDA backend not available: {}", e);
}
}
}
Ok(())
}
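To run this demo end to end, a minimal main could look like the sketch below. It assumes the demo functions from this page live in the same module and that Result is Tensor Frame's result alias (as imported above).
fn main() -> Result<()> {
    // Run the selection demo; the other examples on this page can be added here as well.
    backend_selection_demo()?;
    Ok(())
}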
Size-Based Backend Selection
fn adaptive_backend_selection() -> Result<()> {
println!("=== Adaptive Backend Selection ===\n");
let sizes = vec![
(vec![10, 10], "tiny"),
(vec![100, 100], "small"),
(vec![1000, 1000], "medium"),
(vec![3000, 3000], "large"),
];
for (shape, description) in sizes {
let elements = shape.iter().product::<usize>();
// Choose backend based on tensor size
let backend = if elements < 1000 {
BackendType::Cpu // GPU dispatch overhead outweighs any gain for tiny tensors
} else if elements < 1_000_000 {
// Medium tensors: try WGPU first, fall back to CPU
#[cfg(feature = "wgpu")]
{ BackendType::Wgpu }
#[cfg(not(feature = "wgpu"))]
{ BackendType::Cpu }
} else {
// Large tensors: prefer CUDA > WGPU > CPU
#[cfg(feature = "cuda")]
{ BackendType::Cuda }
#[cfg(all(feature = "wgpu", not(feature = "cuda")))]
{ BackendType::Wgpu }
#[cfg(all(not(feature = "wgpu"), not(feature = "cuda")))]
{ BackendType::Cpu }
};
let tensor = Tensor::zeros(shape.clone())?;
let optimized_tensor = tensor.to_backend(backend)?;
println!("{} tensor {:?}: {} elements -> {:?} backend",
description, shape, elements, optimized_tensor.backend_type());
}
Ok(())
}
Performance Benchmarking
Backend Performance Comparison
fn benchmark_backends() -> Result<()> {
println!("=== Backend Performance Comparison ===\n");
let sizes = vec![
vec![100, 100],
vec![500, 500],
vec![1000, 1000],
vec![2000, 2000],
];
for size in sizes {
println!("Benchmarking {}x{} matrix addition:", size[0], size[1]);
// Create test tensors
let a = Tensor::ones(size.clone())?;
let b = Tensor::ones(size.clone())?;
// CPU benchmark
let cpu_a = a.to_backend(BackendType::Cpu)?;
let cpu_b = b.to_backend(BackendType::Cpu)?;
let start = Instant::now();
let _cpu_result = &cpu_a + &cpu_b;
let cpu_time = start.elapsed();
println!(" CPU: {:?}", cpu_time);
// WGPU benchmark (if available)
#[cfg(feature = "wgpu")]
{
match (a.to_backend(BackendType::Wgpu), b.to_backend(BackendType::Wgpu)) {
(Ok(wgpu_a), Ok(wgpu_b)) => {
let start = Instant::now();
let wgpu_result = &wgpu_a + &wgpu_b;
// Force synchronization by converting back
let _sync = wgpu_result.to_vec()?;
let wgpu_time = start.elapsed();
let speedup = cpu_time.as_nanos() as f64 / wgpu_time.as_nanos() as f64;
println!(" WGPU: {:?} ({}x speedup)", wgpu_time, speedup);
}
_ => println!(" WGPU: Not available"),
}
}
// CUDA benchmark (if available)
#[cfg(feature = "cuda")]
{
match (a.to_backend(BackendType::Cuda), b.to_backend(BackendType::Cuda)) {
(Ok(cuda_a), Ok(cuda_b)) => {
let start = Instant::now();
let cuda_result = &cuda_a + &cuda_b;
let _sync = cuda_result.to_vec()?;
let cuda_time = start.elapsed();
let speedup = cpu_time.as_nanos() as f64 / cuda_time.as_nanos() as f64;
println!(" CUDA: {:?} ({}x speedup)", cuda_time, speedup);
}
_ => println!(" CUDA: Not available"),
}
}
println!();
}
Ok(())
}
Operation-Specific Benchmarks
fn operation_benchmarks() -> Result<()> {
println!("=== Operation-Specific Benchmarks ===\n");
let size = vec![1000, 1000];
let a = Tensor::ones(size.clone())?;
let b = Tensor::ones(size.clone())?;
// Test different operations
// Annotate as fn pointers so the differently typed closures share one element type
let operations: Vec<(&str, fn(&Tensor, &Tensor) -> Tensor)> = vec![
("Addition", |a: &Tensor, b: &Tensor| a + b),
("Multiplication", |a: &Tensor, b: &Tensor| a * b),
("Complex", |a: &Tensor, b: &Tensor| (a * 2.0) + b),
];
for (op_name, operation) in operations {
println!("Operation: {}", op_name);
// CPU timing
let cpu_a = a.to_backend(BackendType::Cpu)?;
let cpu_b = b.to_backend(BackendType::Cpu)?;
let start = Instant::now();
let _cpu_result = operation(&cpu_a, &cpu_b);
let cpu_time = start.elapsed();
println!(" CPU: {:?}", cpu_time);
// GPU timing (if available)
#[cfg(feature = "wgpu")]
{
if let (Ok(gpu_a), Ok(gpu_b)) = (
a.to_backend(BackendType::Wgpu),
b.to_backend(BackendType::Wgpu)
) {
let start = Instant::now();
let gpu_result = operation(&gpu_a, &gpu_b);
let _sync = gpu_result.to_vec()?; // Force sync
let gpu_time = start.elapsed();
let speedup = cpu_time.as_nanos() as f64 / gpu_time.as_nanos() as f64;
println!(" GPU: {:?} ({}x speedup)", gpu_time, speedup);
}
}
println!();
}
Ok(())
}
Mixed Backend Workflows
Pipeline with Backend Transitions
fn mixed_backend_pipeline() -> Result<()> {
println!("=== Mixed Backend Pipeline ===\n");
// Stage 1: Data preparation on CPU (I/O intensive)
println!("Stage 1: Data preparation on CPU");
let raw_data = vec![1.0; 1_000_000]; // Simulate data loading
let cpu_tensor = Tensor::from_vec(raw_data, vec![1000, 1000])?;
println!(" Created tensor on CPU: {:?}", cpu_tensor.backend_type());
// Stage 2: Heavy computation on GPU
#[cfg(feature = "wgpu")]
{
println!("Stage 2: Moving to GPU for computation");
let gpu_tensor = cpu_tensor.to_backend(BackendType::Wgpu)?;
println!(" Moved to GPU: {:?}", gpu_tensor.backend_type());
// Perform heavy computations on GPU
let processed = (&gpu_tensor * 2.0) + 1.0;
let normalized = &processed / processed.sum(None)?;
println!(" Completed GPU computations");
// Stage 3: Results back to CPU for output
println!("Stage 3: Moving results back to CPU");
let final_result = normalized.to_backend(BackendType::Cpu)?;
println!(" Final result on CPU: {:?}", final_result.backend_type());
// Stage 4: Extract specific values (CPU efficient)
let summary = final_result.sum(None)?;
println!(" Summary value: {}", summary.to_vec()?[0]);
}
#[cfg(not(feature = "wgpu"))]
{
println!("Stage 2-4: Processing on CPU (GPU not available)");
let processed = (&cpu_tensor * 2.0) + 1.0;
let summary = processed.sum(None)?;
println!(" Summary value: {}", summary.to_vec()?[0]);
}
Ok(())
}
Batch Processing Strategy
fn batch_processing_strategy() -> Result<()> {
println!("=== Batch Processing Strategy ===\n");
// Simulate multiple data batches
let batch_sizes = vec![100, 500, 1000, 2000];
for batch_size in batch_sizes {
println!("Processing batch size: {}", batch_size);
// Create multiple tensors (simulating data batches)
let batches: Result<Vec<_>> = (0..5)
.map(|i| {
let data = vec![i as f32; batch_size * batch_size];
Tensor::from_vec(data, vec![batch_size, batch_size])
})
.collect();
let batches = batches?;
// Choose optimal backend based on batch size
let backend = if batch_size < 500 {
BackendType::Cpu
} else {
#[cfg(feature = "wgpu")]
{ BackendType::Wgpu }
#[cfg(not(feature = "wgpu"))]
{ BackendType::Cpu }
};
let start = Instant::now();
// Convert all batches to optimal backend
let gpu_batches: Result<Vec<_>> = batches
.into_iter()
.map(|batch| batch.to_backend(backend))
.collect();
let gpu_batches = gpu_batches?;
// Process all batches
let results: Result<Vec<_>> = gpu_batches
.iter()
.map(|batch| batch.sum(None))
.collect();
let results = results?;
let processing_time = start.elapsed();
println!(" Backend: {:?}", backend);
println!(" Processing time: {:?}", processing_time);
println!(" Results count: {}", results.len());
println!();
}
Ok(())
}
Error Handling and Fallback Strategies
Robust Backend Selection
fn robust_backend_selection(tensor: Tensor) -> Result<Tensor> {
// Try backends in order of preference. The list is built with conditional pushes
// because #[cfg] attributes on vec! elements are not allowed on stable Rust.
let mut backends_to_try = Vec::new();
#[cfg(feature = "cuda")]
backends_to_try.push(BackendType::Cuda);
#[cfg(feature = "wgpu")]
backends_to_try.push(BackendType::Wgpu);
backends_to_try.push(BackendType::Cpu);
for backend in backends_to_try {
match tensor.to_backend(backend) {
Ok(converted_tensor) => {
println!("Successfully using backend: {:?}", backend);
return Ok(converted_tensor);
}
Err(e) => {
println!("Backend {:?} failed: {}", backend, e);
continue;
}
}
}
// This should never happen since CPU should always work
Err(tensor_frame::TensorError::BackendError(
"No backend available".to_string()
))
}
fn robust_operation_with_fallback() -> Result<()> {
println!("=== Robust Operation with Fallback ===\n");
let large_tensor = Tensor::ones(vec![2000, 2000])?;
// Try the GPU path first (this assumes the `wgpu` feature is enabled;
// otherwise guard the Wgpu variant with #[cfg] as in the examples above)
let result = match large_tensor.to_backend(BackendType::Wgpu) {
Ok(gpu_tensor) => {
match gpu_tensor.sum(None) {
Ok(result) => {
println!("GPU operation successful");
result
}
Err(e) => {
println!("GPU operation failed: {}, falling back to CPU", e);
large_tensor.to_backend(BackendType::Cpu)?.sum(None)?
}
}
}
Err(e) => {
println!("GPU conversion failed: {}, using CPU", e);
large_tensor.sum(None)?
}
};
println!("Final result: {}", result.to_vec()?[0]);
Ok(())
}
Memory Management Across Backends
fn memory_management_demo() -> Result<()> {
println!("=== Memory Management Across Backends ===\n");
// Monitor memory usage pattern
let tensor_size = vec![1000, 1000]; // 4MB tensor
// Start with CPU
let cpu_tensor = Tensor::ones(tensor_size.clone())?;
println!("Created tensor on CPU");
// Convert to GPU (allocates GPU memory)
#[cfg(feature = "wgpu")]
{
let gpu_tensor = cpu_tensor.to_backend(BackendType::Wgpu)?;
println!("Converted to GPU (both CPU and GPU memory used)");
// Process on GPU
let gpu_result = (&gpu_tensor * 2.0) + 1.0;
println!("Processed on GPU");
// Convert back to CPU (allocates new CPU memory)
let final_result = gpu_result.to_backend(BackendType::Cpu)?;
println!("Converted back to CPU");
// At this point: original CPU tensor, GPU tensor, and final CPU tensor exist
// Memory is automatically freed when variables go out of scope
let summary = final_result.sum(None)?;
println!("Final summary: {}", summary.to_vec()?[0]);
}
println!("Memory automatically freed when variables go out of scope");
Ok(())
}
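When GPU memory is tight, a buffer can be released before the end of its scope with an explicit drop instead of waiting for the variable to fall out of scope. The sketch below is illustrative only; early_release_demo is a hypothetical helper and the operations follow the same API used above.
fn early_release_demo() -> Result<()> {
    let cpu_tensor = Tensor::ones(vec![1000, 1000])?;
    #[cfg(feature = "wgpu")]
    {
        let gpu_tensor = cpu_tensor.to_backend(BackendType::Wgpu)?;
        // Compute on the GPU and bring the result back to the CPU
        let result = (&gpu_tensor * 2.0).to_backend(BackendType::Cpu)?;
        // Release the GPU buffer as soon as it is no longer needed,
        // rather than holding it until the end of the enclosing scope
        drop(gpu_tensor);
        println!("Sum after early release: {}", result.sum(None)?.to_vec()?[0]);
    }
    Ok(())
}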
Production Patterns
Configuration-Driven Backend Selection
use std::env;
#[derive(Debug)]
struct TensorConfig {
preferred_backend: BackendType,
fallback_backends: Vec<BackendType>,
small_tensor_threshold: usize,
}
impl TensorConfig {
fn from_env() -> Self {
let preferred = env::var("TENSOR_BACKEND")
.unwrap_or_else(|_| "auto".to_string());
let preferred_backend = match preferred.as_str() {
"cpu" => BackendType::Cpu,
#[cfg(feature = "wgpu")]
"wgpu" => BackendType::Wgpu,
#[cfg(feature = "cuda")]
"cuda" => BackendType::Cuda,
_ => {
// Auto-select best available
#[cfg(feature = "cuda")]
{ BackendType::Cuda }
#[cfg(all(feature = "wgpu", not(feature = "cuda")))]
{ BackendType::Wgpu }
#[cfg(all(not(feature = "wgpu"), not(feature = "cuda")))]
{ BackendType::Cpu }
}
};
let threshold = env::var("SMALL_TENSOR_THRESHOLD")
.unwrap_or_else(|_| "10000".to_string())
.parse()
.unwrap_or(10000);
TensorConfig {
preferred_backend,
fallback_backends: vec![BackendType::Cpu], // Always fall back to CPU
small_tensor_threshold: threshold,
}
}
fn select_backend(&self, tensor_size: usize) -> BackendType {
if tensor_size < self.small_tensor_threshold {
BackendType::Cpu // Always use CPU for small tensors
} else {
self.preferred_backend
}
}
}
fn production_backend_usage() -> Result<()> {
println!("=== Production Backend Usage ===\n");
let config = TensorConfig::from_env();
println!("Configuration: {:?}", config);
// Use configuration for tensor operations
let sizes = vec![100, 1000, 10000, 100000];
for size in sizes {
let tensor = Tensor::ones(vec![size])?;
let elements = tensor.numel();
let backend = config.select_backend(elements);
let optimized_tensor = tensor.to_backend(backend)?;
println!("Tensor size {}: using {:?} backend",
elements, optimized_tensor.backend_type());
}
Ok(())
}
Application-Level Backend Strategy
struct TensorApplication {
config: TensorConfig,
}
impl TensorApplication {
fn new() -> Self {
Self {
config: TensorConfig::from_env(),
}
}
fn process_data(&self, data: Vec<f32>, shape: Vec<usize>) -> Result<Tensor> {
// Create tensor
let tensor = Tensor::from_vec(data, shape)?;
// Select optimal backend
let backend = self.config.select_backend(tensor.numel());
let optimized_tensor = tensor.to_backend(backend)?;
// Perform operations
let processed = (&optimized_tensor * 2.0) + 1.0;
let normalized = &processed / processed.sum(None)?;
Ok(normalized)
}
fn batch_process(&self, batches: Vec<Vec<f32>>, shape: Vec<usize>) -> Result<Vec<Tensor>> {
batches
.into_iter()
.map(|batch| self.process_data(batch, shape.clone()))
.collect()
}
}
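A short usage sketch for the application struct above; application_demo and the input values are illustrative, and the calls follow the methods defined in this section.
fn application_demo() -> Result<()> {
    let app = TensorApplication::new();
    // Process a single 100x100 input
    let single = app.process_data(vec![1.0; 10_000], vec![100, 100])?;
    println!("Processed tensor on {:?} backend", single.backend_type());
    // Process several batches that share the same shape
    let batches = vec![vec![1.0; 10_000], vec![2.0; 10_000]];
    let results = app.batch_process(batches, vec![100, 100])?;
    println!("Processed {} batches", results.len());
    Ok(())
}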
Best Practices Summary
1. Size-Based Selection
- Small tensors (< 10K elements): Use CPU backend
- Medium tensors (10K - 1M elements): Consider WGPU
- Large tensors (> 1M elements): Prefer CUDA > WGPU > CPU
2. Operation-Based Selection
- I/O operations: Use CPU backend
- Element-wise operations: Use GPU backends for large tensors
- Reductions: GPU effective for very large tensors
- Large reductions: CUDA > CPU > WGPU (until WGPU reductions are implemented)
3. Memory Management
- Convert to target backend early in pipeline
- Avoid frequent backend conversions
- Use batch processing when possible
- Monitor memory usage in production
4. Error Handling
- Always provide CPU fallback
- Handle backend-specific errors gracefully
- Use configuration for backend preferences
- Test with all available backends
5. Performance Optimization
- Benchmark with your specific workload
- Consider warmup time for GPU backends (see the warmup sketch after this list)
- Profile memory transfer overhead
- Use appropriate tensor sizes for each backend
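On the warmup point above: the first GPU dispatch typically pays one-off costs (device initialization, shader or kernel compilation), so a fairer measurement runs one untimed operation before starting the clock. A minimal sketch under the same assumptions as the benchmarks earlier on this page; benchmark_with_warmup is a hypothetical helper name.
#[cfg(feature = "wgpu")]
fn benchmark_with_warmup() -> Result<()> {
    let a = Tensor::ones(vec![1000, 1000])?.to_backend(BackendType::Wgpu)?;
    let b = Tensor::ones(vec![1000, 1000])?.to_backend(BackendType::Wgpu)?;
    // Untimed warmup run: the first dispatch absorbs one-off setup costs
    let _warmup = (&a + &b).to_vec()?;
    // Timed run against an already-warm backend
    let start = Instant::now();
    let result = &a + &b;
    let _sync = result.to_vec()?; // force synchronization before stopping the clock
    println!("Warm WGPU add: {:?}", start.elapsed());
    Ok(())
}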
Next Steps
- Performance Guide - Advanced optimization techniques
- API Reference - Detailed backend API documentation
- Backend-Specific Guides - Deep dives into each backend