Skip to content
v1.0.0-zig0.15.2

Fork-Join

Execute multiple tasks potentially in parallel and collect results. This is the foundation of divide-and-conquer parallelism.

const std = @import("std");
const blitz = @import("blitz");
/// Sums the doubled integers below `n`: 0*2 + 1*2 + ... + (n-1)*2.
fn computeA(n: u32) u64 {
    var total: u64 = 0;
    var i: u64 = 0;
    while (i < n) : (i += 1) {
        total += i * 2;
    }
    return total;
}
/// Sums the tripled integers below `n`: 0*3 + 1*3 + ... + (n-1)*3.
fn computeB(n: u32) u64 {
    var total: u64 = 0;
    var i: u64 = 0;
    while (i < n) : (i += 1) {
        total += i * 3;
    }
    return total;
}
/// Example entry point: forks two independent computations via
/// `blitz.join` and combines their results.
pub fn main() void {
    // Join two tasks - B may run in parallel with A
    const result = blitz.join(.{
        .a = .{ computeA, @as(u32, 1_000_000) },
        .b = .{ computeB, @as(u32, 1_000_000) },
    });
    // Access results by field name
    const total = result.a + result.b;
    // An unused local constant is a compile error in Zig, so the
    // original example would not build; use the value.
    std.debug.print("total = {d}\n", .{total});
}
┌─────────────────────────────────────────────────┐
│ join(.{.a, .b}) │
├─────────────────────────────────────────────────┤
│ │
│ 1. Fork: Push B to deque (may be stolen) │
│ ┌─────┐ │
│ │ B │ ◄── visible to other workers │
│ └─────┘ │
│ │
│ 2. Execute A locally │
│ ┌─────┐ │
│ │ A │ ◄── runs on calling thread │
│ └─────┘ │
│ │
│ 3. Join: Get B's result │
│ - If B not stolen: pop and run locally │
│ - If B stolen: wait for thief to finish │
│ │
│ 4. Return .{ .a = resultA, .b = resultB } │
│ │
└─────────────────────────────────────────────────┘
// Two named tasks; each result field keeps its task's name.
const result = blitz.join(.{
.left = .{ computeLeft, left_data },
.right = .{ computeRight, right_data },
});
// Access: result.left, result.right
// Three tasks: same pattern, one struct field per task.
const result = blitz.join(.{
.first = .{ task1, arg1 },
.second = .{ task2, arg2 },
.third = .{ task3, arg3 },
});
// Access: result.first, result.second, result.third
// Four tasks; join accepts a bounded number of fields.
const result = blitz.join(.{
.a = .{ taskA, argA },
.b = .{ taskB, argB },
.c = .{ taskC, argC },
.d = .{ taskD, argD },
// ... up to 8 tasks
});

Functions can return different types:

/// Counts the strings in the given slice.
fn processStrings(strings: []const []const u8) usize {
    const count = strings.len;
    return count;
}
/// Returns the sum of every value in `numbers`.
fn processNumbers(numbers: []const i64) i64 {
    var acc: i64 = 0;
    for (numbers) |value| {
        acc += value;
    }
    return acc;
}
// Tasks may return different types; each result field takes the
// return type of its own function.
const result = blitz.join(.{
.count = .{ processStrings, strings }, // Returns usize
.sum = .{ processNumbers, numbers }, // Returns i64
});
// result.count: usize
// result.sum: i64

For functions that don’t return values, the result fields are void:

/// Doubles every byte of `chunk` in place.
fn processChunkA(chunk: []u8) void {
    var idx: usize = 0;
    while (idx < chunk.len) : (idx += 1) {
        chunk[idx] *= 2;
    }
}
/// Increments every byte of `chunk` in place by one.
fn processChunkB(chunk: []u8) void {
    var idx: usize = 0;
    while (idx < chunk.len) : (idx += 1) {
        chunk[idx] += 1;
    }
}
// Both tasks return void, so the joined result is discarded;
// join still blocks until both tasks have finished.
_ = blitz.join(.{
.a = .{ processChunkA, chunk_a },
.b = .{ processChunkB, chunk_b },
});
// Both complete before returning
/// Fibonacci via recursive fork-join; falls back to the sequential
/// version below a fixed cutoff so fork overhead stays bounded.
fn parallelFib(n: u64) u64 {
    // CRITICAL: Sequential threshold prevents overhead explosion
    if (n < 20) return fibSequential(n);
    const halves = blitz.join(.{
        .a = .{ parallelFib, n - 1 },
        .b = .{ parallelFib, n - 2 },
    });
    return halves.a + halves.b;
}
/// Sequential Fibonacci baseline: fib(0) = 0, fib(1) = 1,
/// fib(n) = fib(n-1) + fib(n-2). Computed iteratively.
fn fibSequential(n: u64) u64 {
    var prev: u64 = 0;
    var curr: u64 = 1;
    var i: u64 = 0;
    while (i < n) : (i += 1) {
        const next = prev + curr;
        prev = curr;
        curr = next;
    }
    return prev;
}

Performance (fib(45) on 10 cores):

| Method                  | Time   | Speedup |
| ----------------------- | ------ | ------- |
| Sequential              | 635 ms | 1.0x    |
| Parallel (threshold=20) | 93 ms  | 6.9x    |
/// Recursive parallel merge sort. `temp` is scratch space and must
/// match `data` in length (the final @memcpy requires equal lengths).
/// `merge` is defined elsewhere; from the copy-back below it must
/// leave the combined, sorted result in `temp`.
fn parallelMergeSort(data: []i64, temp: []i64) void {
// Sequential for small arrays
if (data.len <= 1024) {
std.mem.sort(i64, data, {}, std.sort.asc(i64));
return;
}
const mid = data.len / 2;
// Sort halves in parallel
// The anonymous struct wrappers adapt the two-slice recursive call
// to join's single-argument task shape; each one recurses on a half.
_ = blitz.join(.{
.left = .{ struct {
fn sort(args: struct { d: []i64, t: []i64 }) void {
parallelMergeSort(args.d, args.t);
}
}.sort, .{ .d = data[0..mid], .t = temp[0..mid] } },
.right = .{ struct {
fn sort(args: struct { d: []i64, t: []i64 }) void {
parallelMergeSort(args.d, args.t);
}
}.sort, .{ .d = data[mid..], .t = temp[mid..] } },
});
// Merge sorted halves
merge(data[0..mid], data[mid..], temp);
// Copy the merged result from the scratch buffer back into place.
@memcpy(data, temp);
}
/// Sums a slice by recursive halving: slices under 1000 elements
/// are summed serially; larger slices fork both halves via join.
fn parallelSum(data: []const i64) i64 {
    if (data.len < 1000) {
        var acc: i64 = 0;
        for (data) |item| acc += item;
        return acc;
    }
    const half = data.len / 2;
    const parts = blitz.join(.{
        .left = .{ parallelSum, data[0..half] },
        .right = .{ parallelSum, data[half..] },
    });
    return parts.left + parts.right;
}

Always set a sequential threshold in recursive fork-join:

// GOOD: Has threshold
fn goodFib(n: u64) u64 {
if (n < 20) return fibSeq(n); // ← Threshold!
const r = blitz.join(.{...});
return r.a + r.b;
}
// BAD: No threshold - exponential overhead
fn badFib(n: u64) u64 {
if (n <= 1) return n;
const r = blitz.join(.{...}); // ← Spawns task even for n=2!
return r.a + r.b;
}

Threshold guidelines:

| Algorithm       | Recommended Threshold        |
| --------------- | ---------------------------- |
| Fibonacci       | n < 20                       |
| Merge sort      | len < 1024                   |
| Tree operations | nodes < 100-1000             |
| General         | Switch when overhead > work  |
  • Recursive divide-and-conquer: sort, fibonacci, tree traversal
  • Two independent computations: results don’t depend on each other
  • Heterogeneous tasks: different operations, different types
  • Nested parallelism: recursive algorithms that spawn subtasks
  • Many small independent tasks: use parallelFor instead
  • Sequential dependencies: one task needs another’s result
  • Very unbalanced workloads: one task much larger than others
  • Single elements: overhead exceeds benefit
| Use Case                 | Best Choice            | Why                            |
| ------------------------ | ---------------------- | ------------------------------ |
| Process array elements   | iter().forEach()       | Optimized for data parallelism |
| Sum/min/max array        | iter().sum()           | Parallel reduction             |
| Two independent tasks    | join(.{...})           | Named results                  |
| Recursive tree structure | join(.{...})           | Natural recursion              |
| Transform elements       | iterMut().mapInPlace() | In-place, parallel             |
| Search for element       | iter().findAny()       | Early exit                     |
| Operation                | Typical Time | Notes                       |
| ------------------------ | ------------ | --------------------------- |
| Fork (push to deque)     | ~3 ns        | Wait-free                   |
| Join (local, not stolen) | ~5 ns        | Pop from deque              |
| Join (stolen, complete)  | ~3 ns        | Latch check only            |
| Join (stolen, waiting)   | ~10-50 ns    | Work-stealing while waiting |
| Full fork-join cycle     | ~10-20 ns    | Amortized                   |
Fork-Join Scaling (10 workers, fib(n)):
┌──────────────────────────────────────────┐
│ Depth 10: 10.54 ms (1M forks) │
│ Depth 15: 1.29 ms │
│ Depth 20: 0.54 ms (1M forks) │
│ Scaling: 2.4x per 5 depth levels │
└──────────────────────────────────────────┘
/// Pattern 1 template: recursive divide-and-conquer skeleton.
/// `T`, `Result`, `threshold`, `processSeq`, and `combine` are
/// placeholders to be filled in by the caller's algorithm.
fn process(data: []T) Result {
// Below the threshold, solving serially beats fork overhead.
if (data.len < threshold) return processSeq(data);
const mid = data.len / 2;
// Fork both halves; either may run on another worker.
const r = blitz.join(.{
.left = .{ process, data[0..mid] },
.right = .{ process, data[mid..] },
});
// Combine the two sub-results into the final answer.
return combine(r.left, r.right);
}

Pattern 2: Multiple Operations on Same Data

Section titled “Pattern 2: Multiple Operations on Same Data”
// Pattern 2: run several independent analyses over the same
// (read-only) input in parallel.
const stats = blitz.join(.{
.sum = .{ computeSum, data },
.variance = .{ computeVariance, data },
.histogram = .{ computeHistogram, data },
});
// Pattern 3: a staged pipeline — each stage is parallel internally,
// but stages run in order because later joins use earlier results.
// Stage 1: Load in parallel
const r1 = blitz.join(.{
.a = .{ loadFile, "data1.bin" },
.b = .{ loadFile, "data2.bin" },
});
// Stage 2: Process in parallel
const r2 = blitz.join(.{
.a = .{ process, r1.a },
.b = .{ process, r1.b },
});
// Stage 3: Combine
const final = merge(r2.a, r2.b);
  1. Stack-allocated futures: Deep recursion can overflow stack
  2. No cancellation: Once forked, tasks run to completion
  3. No priority: All tasks have equal priority
  4. Fixed parallelism: Can’t dynamically adjust task count
  • Error Handling — Use tryJoin() when your tasks return error unions. All tasks complete before errors propagate.
  • Scope & Broadcast — Use scope() for dynamic task spawning (up to 64 tasks) and broadcast() to run on all workers.
  • Choosing the Right API — Decision guide for when to use join() vs parallelFor vs iterators.
  1. Always set sequential thresholds in recursive code
  2. Name fields meaningfully: .left/.right, .sum/.count, etc.
  3. Keep tasks balanced: Similar work in each branch
  4. Measure before optimizing: Profile to find the right threshold
  5. Use iterators for data parallelism: Fork-join is for task parallelism