Advanced Level
Performance Optimization
Chapter 24: Performance Optimization 🚀
Optimize Rust code for maximum performance while maintaining safety and readability.
Zero-Cost Abstractions
Iterator Efficiency
Rust's iterators are zero-cost abstractions - they compile down to the same code as manual loops:
// High-level iterator approach
/// Sum of squares computed with a zero-cost iterator chain.
fn sum_of_squares_iter(nums: &[i32]) -> i32 {
    nums.iter().map(|&n| n * n).sum()
}
// Low-level manual loop approach
/// The same computation written as an explicit index loop.
fn sum_of_squares_loop(nums: &[i32]) -> i32 {
    let mut total = 0;
    let mut i = 0;
    while i < nums.len() {
        total += nums[i] * nums[i];
        i += 1;
    }
    total
}
fn main() {
    let data = vec![1, 2, 3, 4, 5];
    let via_iter = sum_of_squares_iter(&data);
    let via_loop = sum_of_squares_loop(&data);
    println!("Iterator result: {}", via_iter);
    println!("Loop result: {}", via_loop);
    // Both compile to identical assembly code
}
Memory Layout Optimization
Struct Field Reordering
// Inefficient memory layout (larger due to padding)
// Declaration order forces padding so that `b` stays 8-byte aligned:
// 1 (a) + 7 pad + 8 (b) + 1 (c) + 7 tail pad = 24 bytes.
struct Inefficient {
a: u8, // 1 byte + 7 bytes padding
b: u64, // 8 bytes
c: u8, // 1 byte + 7 bytes padding
}
// Efficient memory layout
// Largest-alignment field first: 8 (b) + 1 (a) + 1 (c) + 6 tail pad = 16 bytes.
// NOTE(review): by default rustc may reorder fields itself; the saving is
// guaranteed only with #[repr(C)].
struct Efficient {
b: u64, // 8 bytes
a: u8, // 1 byte
c: u8, // 1 byte + 6 bytes padding
}
fn main() {
    // Compare the two layouts' total sizes in bytes.
    let wasteful = std::mem::size_of::<Inefficient>();
    let compact = std::mem::size_of::<Efficient>();
    println!("Inefficient struct size: {}", wasteful);
    println!("Efficient struct size: {}", compact);
}
Using repr(packed)
// `repr(packed)` removes all padding: size is 1 + 4 + 1 = 6 bytes.
// Trade-off: fields may be misaligned, so taking a reference to one is a
// compile error — reads must go through a copy.
#[repr(packed)]
struct Packed {
a: u8,
b: u32,
c: u8,
}
fn main() {
    let packed = Packed { a: 1, b: 2, c: 3 };
    // Borrowing a packed field is rejected because it may be misaligned:
    // println!("{}", packed.b); // Error: reference to packed field
    // Copying the field to a local is the safe way to read it.
    let b = packed.b;
    println!("b: {}", b);
    println!("Packed struct size: {}", std::mem::size_of::<Packed>());
}
Avoiding Unnecessary Allocations
String Building
// Inefficient - multiple allocations
/// Builds "Hello, world!" starting from an empty String, so each append
/// may trigger a reallocation as the buffer grows.
fn build_string_inefficient() -> String {
    let mut out = String::new();
    for piece in ["Hello", ", ", "world", "!"] {
        out.push_str(piece);
    }
    out
}
// Efficient - pre-allocate
/// Builds "Hello, world!" with the exact capacity reserved up front,
/// so a single allocation suffices ("Hello, world!" is 13 bytes).
fn build_string_efficient() -> String {
    let mut out = String::with_capacity(13);
    for piece in ["Hello", ", ", "world", "!"] {
        out.push_str(piece);
    }
    out
}
fn main() {
    let first = build_string_inefficient();
    let second = build_string_efficient();
    println!("String 1: {}", first);
    println!("String 2: {}", second);
}
Vector Pre-allocation
// Inefficient - multiple reallocations
/// Pushes 0..1000 into a Vec that starts with zero capacity, forcing the
/// buffer to grow (and copy) several times along the way.
fn build_vector_inefficient() -> Vec<i32> {
    let mut values = Vec::new();
    let mut n = 0;
    while n < 1000 {
        values.push(n);
        n += 1;
    }
    values
}
// Efficient - pre-allocate
/// Reserves all 1000 slots up front, so exactly one allocation happens.
fn build_vector_efficient() -> Vec<i32> {
    let mut values = Vec::with_capacity(1000);
    values.extend(0..1000);
    values
}
fn main() {
    let grown = build_vector_inefficient();
    let reserved = build_vector_efficient();
    println!("Vector 1 length: {}", grown.len());
    println!("Vector 2 length: {}", reserved.len());
}
Using SmallVec and Other Optimizations
SmallVec for Small Collections
// Add to Cargo.toml:
// [dependencies]
// smallvec = "1.0"
use smallvec::{SmallVec, smallvec};
fn main() {
    // Up to 4 elements live inline on the stack...
    let mut items: SmallVec<[i32; 4]> = smallvec![1, 2, 3, 4];
    // ...and pushing a 5th spills the storage to the heap automatically.
    items.push(5);
    println!("{:?}", items);
}
Compiler Optimizations
Release Mode
# Debug mode (default) - no optimizations
cargo build
# Release mode - optimizations enabled
cargo build --release
# Run in debug mode
cargo run
# Run in release mode
cargo run --release
Optimization Levels
In Cargo.toml:
[profile.dev]
opt-level = 0 # No optimizations
[profile.release]
opt-level = 3 # Full optimizations
Benchmarking
Using Criterion for Benchmarking
In Cargo.toml:
[dev-dependencies]
criterion = "0.5"
[[bench]]
name = "my_benchmark"
harness = false
In benches/my_benchmark.rs:
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
/// Naive exponential-time Fibonacci (with fib(0) = fib(1) = 1), used as the
/// slow baseline for the benchmark.
fn fibonacci(n: u64) -> u64 {
    if n < 2 {
        1
    } else {
        fibonacci(n - 1) + fibonacci(n - 2)
    }
}
/// Linear-time iterative Fibonacci; returns the same values as `fibonacci`
/// for every n (including n = 0, where the loop body never runs and the
/// initial value 1 is returned).
fn fibonacci_fast(n: u64) -> u64 {
    let (mut prev, mut curr) = (0u64, 1u64);
    for _ in 0..n {
        let next = prev + curr;
        prev = curr;
        curr = next;
    }
    curr
}
/// Registers two benchmarks comparing the exponential recursive Fibonacci
/// against the linear iterative one. `black_box` stops the optimizer from
/// const-folding the argument away.
fn criterion_benchmark(c: &mut Criterion) {
    c.bench_function("fibonacci 20", |bencher| {
        bencher.iter(|| fibonacci(black_box(20)))
    });
    c.bench_function("fibonacci fast 20", |bencher| {
        bencher.iter(|| fibonacci_fast(black_box(20)))
    });
}
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);
Run benchmarks:
cargo bench
Unsafe for Performance
When Safe Code Isn't Fast Enough
// Safe but potentially slower
/// Sums the slice through the iterator API; bounds checks are elided by
/// the compiler, so this is usually just as fast as the unsafe version.
fn safe_sum(numbers: &[i32]) -> i32 {
    numbers.iter().fold(0, |acc, &n| acc + n)
}
// Potentially faster with unsafe
/// Sums the slice with unchecked indexing, skipping per-access bounds checks.
fn unsafe_sum(numbers: &[i32]) -> i32 {
    let mut total = 0;
    let mut idx = 0;
    while idx < numbers.len() {
        // SAFETY: the loop condition guarantees idx < numbers.len(),
        // so the unchecked access is always in bounds.
        total += unsafe { *numbers.get_unchecked(idx) };
        idx += 1;
    }
    total
}
fn main() {
    let values = vec![1, 2, 3, 4, 5];
    println!("Safe sum: {}", safe_sum(&values));
    println!("Unsafe sum: {}", unsafe_sum(&values));
}
Parallel Processing
Using Rayon for Easy Parallelization
In Cargo.toml:
[dependencies]
rayon = "1.0"
use rayon::prelude::*;
/// Squares every element on the current thread.
fn sequential_map(numbers: &[i32]) -> Vec<i32> {
    numbers.iter().map(|&n| n * n).collect()
}
/// Squares every element using Rayon's work-stealing thread pool;
/// `collect` preserves the original order.
fn parallel_map(numbers: &[i32]) -> Vec<i32> {
    numbers.par_iter().map(|&n| n * n).collect()
}
fn main() {
    let input: Vec<i32> = (0..1000).collect();
    let sequential = sequential_map(&input);
    let parallel = parallel_map(&input);
    println!("Sequential result length: {}", sequential.len());
    println!("Parallel result length: {}", parallel.len());
}
Practical Examples
Example 1: Optimized Data Processing Pipeline
use std::collections::HashMap;
// Inefficient version
/// Maps each positive element to its square. No capacity hint, so the map
/// rehashes repeatedly as it grows.
fn process_data_inefficient(data: Vec<i32>) -> HashMap<i32, i32> {
    let mut squares = HashMap::new();
    for value in data.into_iter().filter(|&v| v > 0) {
        squares.insert(value, value * value);
    }
    squares
}
// Efficient version
/// Same mapping as the inefficient version, but reserves room for the
/// worst case (every element positive) up front, avoiding rehashes.
fn process_data_efficient(data: Vec<i32>) -> HashMap<i32, i32> {
    let mut squares = HashMap::with_capacity(data.len());
    for value in data.into_iter().filter(|&v| v > 0) {
        squares.insert(value, value * value);
    }
    squares
}
// Ultra-efficient version with unsafe
/// Same mapping again, but with unchecked indexing to skip bounds checks.
/// NOTE(review): this rarely beats the safe iterator in practice — profile
/// before keeping the unsafe.
fn process_data_ultra_efficient(data: Vec<i32>) -> HashMap<i32, i32> {
    let mut result = HashMap::with_capacity(data.len());
    let mut idx = 0;
    while idx < data.len() {
        // SAFETY: the loop condition guarantees idx < data.len(),
        // so the unchecked access is always in bounds.
        let item = unsafe { *data.get_unchecked(idx) };
        if item > 0 {
            result.insert(item, item * item);
        }
        idx += 1;
    }
    result
}
fn main() {
    let data: Vec<i32> = (0..1000).collect();
    let slow = process_data_inefficient(data.clone());
    let fast = process_data_efficient(data.clone());
    let fastest = process_data_ultra_efficient(data);
    println!("Result 1 length: {}", slow.len());
    println!("Result 2 length: {}", fast.len());
    println!("Result 3 length: {}", fastest.len());
}
Example 2: Custom Memory Allocator
use std::alloc::{GlobalAlloc, Layout, System};
use std::ptr;
use std::sync::atomic::{AtomicUsize, Ordering};
// A thin wrapper around the system allocator that tracks the number of
// live heap bytes in a global atomic counter.
struct CounterAllocator;
// Net bytes currently allocated (allocations minus deallocations).
static ALLOCATED: AtomicUsize = AtomicUsize::new(0);
unsafe impl GlobalAlloc for CounterAllocator {
unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
// Delegate to the system allocator; only count bytes on success.
let ret = System.alloc(layout);
if !ret.is_null() {
ALLOCATED.fetch_add(layout.size(), Ordering::SeqCst);
}
ret
}
unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
// Free first, then subtract the layout's size from the counter.
System.dealloc(ptr, layout);
ALLOCATED.fetch_sub(layout.size(), Ordering::SeqCst);
}
}
// Install the counter as the program-wide global allocator.
#[global_allocator]
static A: CounterAllocator = CounterAllocator;
fn main() {
    let mut v = Vec::new();
    for value in 0..1000 {
        v.push(value);
    }
    println!("Allocated bytes: {}", ALLOCATED.load(Ordering::SeqCst));
    // clear() drops the elements but keeps the Vec's buffer, so the counter
    // typically reports the same number of live bytes afterwards.
    v.clear();
    println!("Allocated bytes after clear: {}", ALLOCATED.load(Ordering::SeqCst));
}
Profiling Tools
Using perf on Linux
# Build in release mode with debug info
cargo build --release
# Profile your application
perf record -g target/release/your-app
# View the results
perf report
Using Instruments on macOS
# Build in release mode
cargo build --release
# Profile with Instruments
cargo instruments --release --example your-example
Common Mistakes
❌ Premature Optimization
// Don't do this - optimize only when needed
/// Manually four-way-unrolled sum. Kept as the "premature optimization"
/// anti-example: the complexity and unsafe buy nothing the optimizer
/// doesn't already do for `iter().sum()`.
fn over_optimized_sum(numbers: &[i32]) -> i32 {
    let mut sum = 0;
    let mut i = 0;
    // Manual loop unrolling - unnecessary complexity
    // Fixed: the original condition `i < numbers.len() - 3` underflows
    // `usize` for slices shorter than 3 elements (panic in debug builds,
    // out-of-bounds unchecked reads in release). `i + 4 <= len` is
    // equivalent for longer slices and cannot underflow.
    while i + 4 <= numbers.len() {
        // SAFETY: the loop condition guarantees i + 3 < numbers.len().
        sum += unsafe { *numbers.get_unchecked(i) };
        sum += unsafe { *numbers.get_unchecked(i + 1) };
        sum += unsafe { *numbers.get_unchecked(i + 2) };
        sum += unsafe { *numbers.get_unchecked(i + 3) };
        i += 4;
    }
    // Handle the remaining 0-3 tail elements.
    while i < numbers.len() {
        // SAFETY: the loop condition guarantees i < numbers.len().
        sum += unsafe { *numbers.get_unchecked(i) };
        i += 1;
    }
    sum
}
✅ Readable and Efficient Code
// This is better - readable and efficient
/// The readable version: the iterator chain compiles to code at least as
/// good as the hand-unrolled loop above, with no unsafe.
fn optimized_sum(numbers: &[i32]) -> i32 {
    numbers.iter().copied().sum()
}
❌ Ignoring Compiler Optimizations
// Debug builds are slow - always benchmark release builds
fn slow_function() -> Vec<i32> {
let mut v = Vec::new();
for i in 0..1000000 {
v.push(i);
}
v
}
// In debug mode: slow
// In release mode: fast due to compiler optimizationsKey Takeaways
- ✅ Rust's zero-cost abstractions compile to efficient code
- ✅ Struct field ordering affects memory usage
- ✅ Pre-allocating collections can improve performance
- ✅ Use release mode (`--release`) for performance testing
- ✅ Benchmark with Criterion to measure actual performance
- ✅ Parallel processing with Rayon can significantly speed up CPU-bound tasks
- ✅ Only use unsafe code when necessary and after profiling
- ✅ Profile your code to identify actual bottlenecks
- ✅ Follow Rust naming conventions and idioms for performance optimization
🎉 Congratulations! You've completed all 24 chapters of our comprehensive Rust tutorial. Now you're ready to tackle practical projects and become a Rust expert!