Advanced Level

Performance Optimization

Chapter 24: Performance Optimization 🚀

Optimize Rust code for maximum performance while maintaining safety and readability.

Zero-Cost Abstractions

Iterator Efficiency

Rust's iterators are zero-cost abstractions - they compile down to the same code as manual loops:

// High-level iterator approach: sum of x^2 over the slice.
fn sum_of_squares_iter(nums: &[i32]) -> i32 {
    nums.iter().fold(0, |acc, &x| acc + x * x)
}

// Low-level manual loop approach: same computation, written out by hand.
fn sum_of_squares_loop(nums: &[i32]) -> i32 {
    let mut total = 0;
    let mut idx = 0;
    while idx < nums.len() {
        total += nums[idx] * nums[idx];
        idx += 1;
    }
    total
}

fn main() {
    let numbers = vec![1, 2, 3, 4, 5];

    let via_iter = sum_of_squares_iter(&numbers);
    let via_loop = sum_of_squares_loop(&numbers);

    println!("Iterator result: {}", via_iter);
    println!("Loop result: {}", via_loop);

    // The optimizer lowers both versions to essentially the same machine code.
}

Memory Layout Optimization

Struct Field Reordering

// Inefficient memory layout (larger due to padding).
//
// NOTE: `#[repr(C)]` is required here. With the default `repr(Rust)` the
// compiler is free to reorder fields, so it would silently produce the
// compact layout and this example would print the same size for both
// structs — the attribute pins the declaration order so the padding is
// actually observable.
#[repr(C)]
struct Inefficient {
    a: u8,   // 1 byte + 7 bytes padding (to align `b` to 8)
    b: u64,  // 8 bytes
    c: u8,   // 1 byte + 7 bytes tail padding (size rounds up to a multiple of 8)
}

// Efficient memory layout: largest field first, small fields packed after it.
// `#[repr(C)]` keeps the declared order so the comparison with `Inefficient`
// is meaningful (the default `repr(Rust)` may reorder fields on its own).
#[repr(C)]
struct Efficient {
    b: u64,  // 8 bytes
    a: u8,   // 1 byte
    c: u8,   // 1 byte + 6 bytes tail padding
}

fn main() {
    // Compare the two layouts at runtime.
    let wasteful = std::mem::size_of::<Inefficient>();
    let compact = std::mem::size_of::<Efficient>();

    println!("Inefficient struct size: {}", wasteful);
    println!("Efficient struct size: {}", compact);
}

Using repr(packed)

// `#[repr(packed)]` removes all padding: the fields are laid out
// back-to-back, so this struct is exactly 1 + 4 + 1 = 6 bytes.
// The trade-off: `b` may be misaligned, so taking a reference to it
// is rejected by the compiler (it would be undefined behavior).
#[repr(packed)]
struct Packed {
    a: u8,
    b: u32,
    c: u8,
}

fn main() {
    let p = Packed { a: 1, b: 2, c: 3 };

    // Borrowing a packed field is a compile error because it may be misaligned:
    // println!("{}", p.b); // Error: reference to packed field

    // Copying the field out by value is the safe way to read it.
    let value = p.b;
    println!("b: {}", value);

    println!("Packed struct size: {}", std::mem::size_of::<Packed>());
}

Avoiding Unnecessary Allocations

String Building

// Inefficient - the buffer starts empty and may reallocate as it grows.
fn build_string_inefficient() -> String {
    let mut out = String::new();
    for piece in ["Hello", ", ", "world", "!"] {
        out.push_str(piece);
    }
    out
}

// Efficient - compute the final length up front and allocate exactly once.
fn build_string_efficient() -> String {
    let parts = ["Hello", ", ", "world", "!"];
    // 5 + 2 + 5 + 1 = 13 bytes ("Hello, world!")
    let total: usize = parts.iter().map(|p| p.len()).sum();
    let mut out = String::with_capacity(total);
    for p in parts {
        out.push_str(p);
    }
    out
}

fn main() {
    let inefficient = build_string_inefficient();
    let efficient = build_string_efficient();

    println!("String 1: {}", inefficient);
    println!("String 2: {}", efficient);
}

Vector Pre-allocation

// Inefficient - the vector grows (and reallocates) repeatedly as we push.
fn build_vector_inefficient() -> Vec<i32> {
    let mut out = Vec::new();
    (0..1000).for_each(|n| out.push(n));
    out
}

// Efficient - reserve all 1000 slots up front, then fill them in one pass.
fn build_vector_efficient() -> Vec<i32> {
    let mut out = Vec::with_capacity(1000);
    out.extend(0..1000);
    out
}

fn main() {
    let first = build_vector_inefficient();
    let second = build_vector_efficient();

    println!("Vector 1 length: {}", first.len());
    println!("Vector 2 length: {}", second.len());
}

Using SmallVec and Other Optimizations

SmallVec for Small Collections

// Add to Cargo.toml:
// [dependencies]
// smallvec = "1.0"

use smallvec::{SmallVec, smallvec};

fn main() {
    // The first four elements fit in the inline (stack) buffer.
    let mut values: SmallVec<[i32; 4]> = smallvec![1, 2, 3, 4];

    // A fifth element exceeds the inline capacity and spills to the heap.
    values.push(5);

    println!("{:?}", values);
}

Compiler Optimizations

Release Mode

# Debug mode (default) - no optimizations
cargo build

# Release mode - optimizations enabled
cargo build --release

# Run in debug mode
cargo run

# Run in release mode
cargo run --release

Optimization Levels

In Cargo.toml:

[profile.dev]
opt-level = 0  # No optimizations

[profile.release]
opt-level = 3  # Full optimizations

Benchmarking

Using Criterion for Benchmarking

In Cargo.toml:

[dev-dependencies]
criterion = "0.5"

[[bench]]
name = "my_benchmark"
harness = false

In benches/my_benchmark.rs:

use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};

// Naive exponential-time recursion — kept as the slow benchmark baseline.
// Uses the convention fib(0) == fib(1) == 1.
fn fibonacci(n: u64) -> u64 {
    if n < 2 {
        1
    } else {
        fibonacci(n - 1) + fibonacci(n - 2)
    }
}

// Iterative O(n) version using the same convention as `fibonacci`
// (fib(0) == fib(1) == 1): fold the pair (a, b) forward n times.
fn fibonacci_fast(n: u64) -> u64 {
    let (_, result) = (0..n).fold((0u64, 1u64), |(a, b), _| (b, a + b));
    result
}

// Registers both fibonacci implementations as named benchmarks.
// `black_box` hides the argument from the optimizer so the call
// cannot be constant-folded away.
fn criterion_benchmark(c: &mut Criterion) {
    c.bench_function("fibonacci 20", |b| b.iter(|| fibonacci(black_box(20))));
    c.bench_function("fibonacci fast 20", |b| b.iter(|| fibonacci_fast(black_box(20))));
}

// Generate the benchmark entry point (Cargo.toml sets `harness = false`
// so Criterion supplies its own `main`).
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);

Run benchmarks:

cargo bench

Unsafe for Performance

When Safe Code Isn't Fast Enough

// Safe but potentially slower (in practice the iterator elides bounds checks).
fn safe_sum(numbers: &[i32]) -> i32 {
    numbers.iter().copied().sum()
}

// Potentially faster with unsafe
//
// NOTE(review): `iter().sum()` already skips bounds checks, so this is
// unlikely to beat `safe_sum` in practice — benchmark before keeping `unsafe`.
fn unsafe_sum(numbers: &[i32]) -> i32 {
    let mut sum = 0;
    for i in 0..numbers.len() {
        // SAFETY: `i` ranges over 0..numbers.len(), so it is always in bounds.
        sum += unsafe { *numbers.get_unchecked(i) };
    }
    sum
}

fn main() {
    let values = vec![1, 2, 3, 4, 5];

    let checked = safe_sum(&values);
    let unchecked = unsafe_sum(&values);

    println!("Safe sum: {}", checked);
    println!("Unsafe sum: {}", unchecked);
}

Parallel Processing

Using Rayon for Easy Parallelization

In Cargo.toml:

[dependencies]
rayon = "1.0"

Then, in your source file:
use rayon::prelude::*;

// Squares every element on the current thread, in order.
fn sequential_map(numbers: &[i32]) -> Vec<i32> {
    let mut squares = Vec::with_capacity(numbers.len());
    for &n in numbers {
        squares.push(n * n);
    }
    squares
}

// Same computation, but rayon's `par_iter` splits the work across a thread
// pool; the collected output preserves the input order.
fn parallel_map(numbers: &[i32]) -> Vec<i32> {
    numbers.par_iter().map(|x| x * x).collect()
}

fn main() {
    let input: Vec<i32> = (0..1000).collect();

    let sequential = sequential_map(&input);
    let parallel = parallel_map(&input);

    println!("Sequential result length: {}", sequential.len());
    println!("Parallel result length: {}", parallel.len());
}

Practical Examples

Example 1: Optimized Data Processing Pipeline

use std::collections::HashMap;

// Inefficient version: the map starts empty and rehashes as it grows.
// Keeps only positive values, mapping each to its square.
fn process_data_inefficient(data: Vec<i32>) -> HashMap<i32, i32> {
    data.into_iter()
        .filter(|&item| item > 0)
        .map(|item| (item, item * item))
        .collect()
}

// Efficient version: reserve capacity for the worst case (every item kept)
// so the map never rehashes while filling.
fn process_data_efficient(data: Vec<i32>) -> HashMap<i32, i32> {
    let mut squares = HashMap::with_capacity(data.len());
    for item in data.into_iter().filter(|&v| v > 0) {
        squares.insert(item, item * item);
    }
    squares
}

// Ultra-efficient version with unsafe
//
// NOTE(review): the indexed `get_unchecked` loop is unlikely to beat the
// plain `for item in data` loop above — iterators already avoid bounds
// checks. Profile before accepting the extra `unsafe` here.
fn process_data_ultra_efficient(data: Vec<i32>) -> HashMap<i32, i32> {
    let mut result = HashMap::with_capacity(data.len());
    
    for i in 0..data.len() {
        // SAFETY: `i` ranges over 0..data.len(), so it is always in bounds.
        let item = unsafe { *data.get_unchecked(i) };
        if item > 0 {
            result.insert(item, item * item);
        }
    }
    
    result
}

fn main() {
    let data: Vec<i32> = (0..1000).collect();

    let first = process_data_inefficient(data.clone());
    let second = process_data_efficient(data.clone());
    let third = process_data_ultra_efficient(data);

    println!("Result 1 length: {}", first.len());
    println!("Result 2 length: {}", second.len());
    println!("Result 3 length: {}", third.len());
}

Example 2: Custom Memory Allocator

use std::alloc::{GlobalAlloc, Layout, System};
use std::ptr;
use std::sync::atomic::{AtomicUsize, Ordering};

// A global allocator that delegates all work to the system allocator while
// keeping a running count of live heap bytes.
struct CounterAllocator;

// Net number of currently allocated bytes: alloc adds, dealloc subtracts.
static ALLOCATED: AtomicUsize = AtomicUsize::new(0);

unsafe impl GlobalAlloc for CounterAllocator {
    unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
        let ret = System.alloc(layout);
        // Only count the bytes if the allocation actually succeeded.
        if !ret.is_null() {
            ALLOCATED.fetch_add(layout.size(), Ordering::SeqCst);
        }
        ret
    }
    
    unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
        System.dealloc(ptr, layout);
        ALLOCATED.fetch_sub(layout.size(), Ordering::SeqCst);
    }
}

// Install the counter as the process-wide allocator; every heap allocation
// in the program now goes through `CounterAllocator`.
#[global_allocator]
static A: CounterAllocator = CounterAllocator;

fn main() {
    // Grow a vector so the counter registers some allocations.
    let mut v = Vec::new();
    for i in 0..1000 {
        v.push(i);
    }
    
    println!("Allocated bytes: {}", ALLOCATED.load(Ordering::SeqCst));
    
    // BUG FIX: the original called `v.clear()`, which drops the elements but
    // keeps the Vec's capacity — no bytes are returned to the allocator, so
    // the counter would not change. Dropping the vector itself frees the
    // buffer and makes the second reading actually go down.
    drop(v);
    
    println!("Allocated bytes after drop: {}", ALLOCATED.load(Ordering::SeqCst));
}

Profiling Tools

Using perf on Linux

# Build in release mode with debug info
# (set `debug = true` under [profile.release] in Cargo.toml so perf can
# resolve symbol names)
cargo build --release

# Profile your application
perf record -g target/release/your-app

# View the results
perf report

Using Instruments on macOS

# Build in release mode
cargo build --release

# Profile with Instruments (first install the subcommand:
# cargo install cargo-instruments)
cargo instruments --release --example your-example

Common Mistakes

❌ Premature Optimization

// Don't do this - optimize only when needed.
//
// Kept as an example of premature optimization: manual 4-way loop unrolling
// with unchecked indexing. BUG FIX: the original condition was
// `i < numbers.len() - 3`, which underflows for slices shorter than 3
// elements — a panic in debug builds, and in release builds a wrapped bound
// that sends `get_unchecked` out of bounds (undefined behavior). The
// condition below is underflow-free and covers exactly the same iterations.
fn over_optimized_sum(numbers: &[i32]) -> i32 {
    let mut sum = 0;
    let mut i = 0;
    
    // Manual loop unrolling - unnecessary complexity
    while i + 4 <= numbers.len() {
        // SAFETY: i + 4 <= numbers.len(), so indices i..i+4 are in bounds.
        sum += unsafe { *numbers.get_unchecked(i) };
        sum += unsafe { *numbers.get_unchecked(i + 1) };
        sum += unsafe { *numbers.get_unchecked(i + 2) };
        sum += unsafe { *numbers.get_unchecked(i + 3) };
        i += 4;
    }
    
    // Tail loop for the remaining 0-3 elements.
    while i < numbers.len() {
        // SAFETY: i < numbers.len().
        sum += unsafe { *numbers.get_unchecked(i) };
        i += 1;
    }
    
    sum
}

✅ Readable and Efficient Code

// This is better - readable and efficient
fn optimized_sum(numbers: &[i32]) -> i32 {
    numbers.iter().fold(0, |acc, &n| acc + n)
}

❌ Ignoring Compiler Optimizations

// Debug builds are slow - always benchmark release builds
fn slow_function() -> Vec<i32> {
    (0..1_000_000).collect()
}

// In debug mode: slow
// In release mode: fast due to compiler optimizations

Key Takeaways

  • ✅ Rust's zero-cost abstractions compile to efficient code
  • ✅ Struct field ordering affects memory usage
  • ✅ Pre-allocating collections can improve performance
  • ✅ Use release mode (--release) for performance testing
  • ✅ Benchmark with Criterion to measure actual performance
  • ✅ Parallel processing with Rayon can significantly speed up CPU-bound tasks
  • ✅ Only use unsafe code when necessary and after profiling
  • ✅ Profile your code to identify actual bottlenecks
  • ✅ Prefer idiomatic Rust — the optimizer handles idiomatic constructs extremely well, so clarity rarely costs performance

🎉 Congratulations! You've completed all 24 chapters of our comprehensive Rust tutorial. Now you're ready to tackle practical projects and become a Rust expert!

🦀 Rust Programming Tutorial

Learn from Zero to Advanced

Built with Next.js and Tailwind CSS • Open Source