// Performance without compromise, Safety without overhead
Why 1% matters: Real server counts, language performance comparison
GC overhead, compaction, metadata, write barriers, cache misses
Smart pointers, coroutines, ranges, zero-cost abstractions
The hidden price of automatic memory management:
What developers pay for safety and convenience in C#, Go, and Java
✓ Fast GC scan
✗ 12 bytes overhead
✓ Lower per-object overhead
✗ Span metadata + CPU lookup
✗ Still overhead vs C++
// C++: No runtime type info — a ListNode is just its fields, no GC header.
class ListNode {
int data;
ListNode* next;
ListNode* prev;
}; // 20 bytes - no overhead (4 + 8 + 8 on 64-bit; alignment may pad this to 24)
// C#: Must track types at runtime
// Every heap object carries a header (method-table pointer + sync block)
// for the GC and the runtime type system.
class ListNode {
public int data;
public ListNode next;
public ListNode prev;
} // 32 bytes - 12 bytes overhead! (approximate; exact layout varies by runtime)
■ Roots
■ Reachable
■ Garbage
Traversal order: 1→2→3→4→5
if (ptr == old_addr) ptr = new_addr;
✓ No wasted memory
✓ Cache-friendly
✗ Expensive compact
✗ Must pause/barrier
✓ No compaction
✓ Fast allocation
✗ ~10-20% waste
✗ Less cache-friendly
State: A → B → D, with C separate (unmarked).
1. GC marks A ✓, B ✓
2. GC pauses to scan elsewhere...
3. App writes: B.ref = C (new reference!)
4. GC resumes, finishes B (doesn't see C!)
Result: C never marked → COLLECTED! 💥
// Pseudocode: what a GC write barrier expands to (illustrative, not literal
// C++ — real barriers vary by collector, e.g. Dijkstra vs Yuasa style).
// Your code:
B.ref = C;
// Runtime automatically inserts barrier:
B.ref = C; // 1. Do write
if (atomic_load(gc_state) == MARKING) { // 2. Check GC
atomic_thread_fence(release); // 3. Memory fence
mark_object(C); // 4. Tell GC about C
}
1. Check gc_state: ~2-5 cycles (atomic read from a shared variable)
2. Memory fence: ~20-200 cycles ⚠️ (enforces write ordering and cross-core visibility)
3. Mark object: ~10-50 cycles (add to GC worklist, update metadata)
Total: ~30-250 cycles per write — vs ~1 cycle for a plain store in C++ without GC
// Array of references → scattered
// Each element is a reference to a separately heap-allocated Point object,
// so iteration chases pointers all over the heap.
Point[] points = new Point[1000];
for (int i = 0; i < points.Length; i++) {
points[i].X += 1; // Cache miss!
// (assumes elements were initialized — new Point[1000] starts as all nulls)
}
// Contiguous values
// std::vector<Point> owns the Point objects themselves in one contiguous
// buffer, so linear iteration is prefetcher-friendly.
std::vector<Point> points(1000);
for (auto& p : points) {
p.x += 1; // Cache hit!
}
// Go interface = vtable lookup
type Serializer interface {
Serialize(data []byte) string
}
// NOTE(review): `data` inside the loop is a placeholder declared elsewhere.
func processMany(items []Serializer) {
for _, item := range items {
item.Serialize(data) // Vtable lookup each time
}
}
// Cost: ~5-10 cycles per call (vtable + indirect jump)
// ✗ Can't inline, CPU can't predict virtual calls
// Template: Type known at compile time
template<typename T>
void processMany(std::vector<T>& items) {
for (auto& item : items) {
item.serialize(data); // No vtable lookup!
}
}
// Compiler generates specialized code for each type:
processMany(vector<JsonSerializer>);
Inlining: item.serialize() → { /* actual serialize code here */ }
Unrolling: for(4x) → process item[0], item[1], item[2], item[3]
Vectorization (SIMD): one CPU instruction → operates on 4+ items in parallel
Direct call: call 0x12345 — vs virtual: load vtable → load function ptr → call
// Manual memory management
// Deliberately bad example: every early-return path must remember to free
// the buffer — miss one path (or throw) and the allocation leaks.
void process_request(const string& json_data) {
// Allocate large buffer
char* buffer = new char[1024 * 1024]; // throws std::bad_alloc on failure
// Parse and process
if (!parse_json(json_data, buffer)) {
delete[] buffer; // Don't forget!
return;
}
// Validate
if (!validate(buffer)) {
delete[] buffer; // Don't forget again!
return;
}
// Send response
send_response(buffer);
delete[] buffer; // Easy to forget!
// What if exception thrown? → LEAK!
}
// Automatic memory management
// RAII: unique_ptr's destructor frees the buffer on every exit path,
// including exceptional ones.
void process_request(string_view json_data) {
// Allocate large buffer (unique ownership)
auto buffer = make_unique<char[]>(1024 * 1024); // also zero-initializes
// Alt: unique_ptr<char[]> buf(new char[1024*1024]);
// For shared: make_shared<char[]>(1024 * 1024); // (array form requires C++20)
// Parse and process
if (!parse_json(json_data, buffer.get())) {
return; // Auto-deleted!
}
// Validate
if (!validate(buffer.get())) {
return; // Auto-deleted!
}
// Send response
send_response(buffer.get());
// Auto-deleted! Even with exceptions!
}
// Old C++: Manual tracking is error-prone
// Deliberately bad example: Workers hold a non-owning raw pointer while the
// caller decides (incorrectly) when to delete.
class Worker {
Logger* logger; // Shared by all workers (non-owning raw pointer)
public:
Worker(Logger* log) : logger(log) {}
void do_work() {
logger->write("Working...");
}
};
void run_tasks() {
auto log = new Logger("app.log");
auto w1 = Worker(log);
auto w2 = Worker(log);
auto w3 = Worker(log);
w1.do_work();
w2.do_work();
delete log; // Safe? What if w3 still running?
// Any later use of w1/w2/w3 now dereferences a dangling pointer — UB.
}
// Modern C++: Automatic shared ownership
class Worker {
shared_ptr<Logger> logger; // ref-counted: Logger lives while any Worker holds it
public:
Worker(auto log) : logger(std::move(log)) {} // `auto` parameter requires C++20
void do_work() {
logger->write("Working...");
}
};
void run_tasks() {
auto log = make_shared<Logger>("app.log");
auto w1 = Worker(log);
auto w2 = Worker(log);
auto w3 = Worker(log);
w1.do_work();
w2.do_work();
// Logger closed automatically when all workers done
// (i.e. when the last shared_ptr — including the local `log` — is destroyed)
}
// C-style array - fixed size, no bounds checking
int scores[5] = {90, 85, 92, 88, 95};
scores[10] = 100; // Buffer overflow! UB!
// C-style string - manual memory, null terminator
char* name = new char[100];
strcpy(name, "John"); // Unsafe!
delete[] name; // Easy to forget!
// Dynamic array - manual memory
int* data = new int[count];
// ... use it ...
delete[] data; // Easy to forget!
// String view - pointer + length (error-prone)
void process(const char* str, size_t len) {
// Manual bounds checking needed
}
// Raw array view - pointer + size (error-prone; contrast std::span, C++20)
void process(int* arr, size_t size) {
arr[size + 1] = 42; // Buffer overflow! UB! (anything past arr[size-1] is out of bounds)
}
// std::array - fixed size, bounds checking via .at()
std::array<int, 5> scores = {90, 85, 92, 88, 95};
scores.at(10); // Throws exception! Safe! (operator[] remains unchecked)
// std::string - automatic memory management
std::string name = "John";
name += " Doe"; // Safe concatenation
// Automatic cleanup, no delete needed!
// std::vector - dynamic array, automatic memory
std::vector<int> data(count); // starts with `count` zero-initialized elements
data.push_back(42); // Grows automatically (appends element count+1)
// Automatic cleanup, no delete needed!
// std::string_view - efficient, safe view
void process(std::string_view str) {
// Knows its own length, no copy, no allocation
// (non-owning: the viewed string must outlive this call)
}
// std::span - safe array view (C++20)
void process(std::span<int> arr) {
arr[0] = 42; // Safe here: span carries its size (operator[] itself is unchecked)
// Works with array, vector, C-array!
}
| Type | Use Case | Key Benefits | vs C-Style |
|---|---|---|---|
std::array<T,N> |
Fixed-size arrays | Bounds checking, size tracking, zero overhead | vs T arr[N] ✓ Safe |
std::vector<T> |
Dynamic arrays | Auto memory, grows, cache-friendly, fast | vs T* + new[] ✓ No leaks |
std::string |
Text data | Auto memory, SSO optimization, safe ops | vs char* ✓ No overflow |
std::string_view |
Read-only text | Zero-copy view, no allocation, efficient | vs const char* ✓ Knows size |
std::span<T> |
Array view (C++20) | Safe view of any array, bounds-checked | vs T* + size ✓ Unified API |
(All of these interoperate with C APIs: std::string, std::vector, std::array, and std::span expose a raw pointer via .data())
// Option 1: Function pointer (limited)
// Works, but carries no state and is harder for the optimizer to inline
// through than a lambda or functor.
bool compare(int a, int b) {
return a < b;
}
std::sort(data.begin(), data.end(), compare);
// Option 2: Functor (verbose!)
// A class whose operator() makes instances callable; state (`reverse`)
// travels with the object — but the boilerplate is heavy.
class Comparator {
bool reverse;
public:
Comparator(bool rev) : reverse(rev) {}
bool operator()(int a, int b) const {
return reverse ? a > b : a < b;
}
};
Comparator comp(false); // Normal sort
std::sort(data.begin(), data.end(), comp);
// Can't capture local variables easily!
// Verbose, requires separate class definition
// Simple lambda - inline!
std::sort(data.begin(), data.end(),
[](int a, int b) { return a < b; });
// Lambda with capture - easy!
bool reverse = false;
std::sort(data.begin(), data.end(),
[reverse](int a, int b) { // captures `reverse` by copy
return reverse ? a > b : a < b;
});
// Real-world: Filter and transform (C++20 ranges — lazy views)
auto filtered = data
| std::views::filter([](int x) { return x > 0; })
| std::views::transform([](int x) { return x * 2; });
// HTTP handler with lambda
// NOTE(review): `db` is captured by reference — it must outlive the
// registered handler, or the callback dangles.
server.route("/api/users", [&db](auto req) {
auto users = db.query("SELECT * FROM users");
return json_response(users);
});
// Filter + Transform: Multiple passes, temporaries
// (pre-ranges style: every stage materializes a whole new vector)
std::vector<int> nums = {1, -2, 3, -4, 5};
// Step 1: Filter positives (allocates temp vector)
std::vector<int> positives;
std::copy_if(nums.begin(), nums.end(),
std::back_inserter(positives),
[](int n) { return n > 0; });
// Step 2: Double them (allocates another vector)
std::vector<int> result;
std::transform(positives.begin(), positives.end(),
std::back_inserter(result),
[](int n) { return n * 2; });
// Problems:
// - Verbose (iterators everywhere)
// - Multiple allocations (2 temp vectors)
// - Eager evaluation (processes all elements)
// - Can't easily compose operations
// Same logic: One line, zero allocations!
std::vector<int> nums = {1, -2, 3, -4, 5};
auto result = nums
| std::views::filter([](int n) { return n > 0; })
| std::views::transform([](int n) { return n * 2; });
// Result: lazy view, no allocation yet!
// Only allocate when needed:
std::vector<int> vec(result.begin(), result.end()); // C++23: std::ranges::to<std::vector>(result)
// Benefits:
// - Composable pipelines (Unix pipe style)
// - Zero intermediate allocations
// - Lazy evaluation (process on demand)
// - Works with any range (vector, list, istream, etc.)
// More examples:
auto first3 = nums | std::views::take(3);
auto skip2 = nums | std::views::drop(2);
auto reversed = nums | std::views::reverse;
// Pre-C++17: accessing pair/tuple members positionally obscures intent.
// Unpacking pairs - verbose
std::map<string, int> users;
auto it = users.find("john");
if (it != users.end()) {
string key = it->first;
int value = it->second;
// Use key and value...
}
// Unpacking tuples - ugly!
std::tuple<int, string, bool> result = query();
int status = std::get<0>(result);
string message = std::get<1>(result);
bool success = std::get<2>(result);
// Iterating map - awkward
for (auto it = users.begin(); it != users.end(); ++it) {
cout << it->first << ": " << it->second;
}
// Struct unpacking - manual
struct Point { int x, y; };
Point p = get_point();
int x = p.x;
int y = p.y;
// C++17 structured bindings: name the pieces directly.
// Unpacking pairs - clean!
std::map<string, int> users;
if (auto [it, inserted] = users.insert({"john", 42}); inserted) { // if-with-initializer (C++17)
auto [key, value] = *it; // NOTE: copies the pair; `auto& [key, value]` avoids the copy
// Use key and value directly!
}
// Unpacking tuples - readable!
auto [status, message, success] = query();
// Use status, message, success directly!
// Iterating map - elegant
for (auto& [key, value] : users) {
cout << key << ": " << value;
}
// Struct unpacking - automatic
struct Point { int x, y; };
auto [x, y] = get_point();
// x and y are ready to use!
// Real-world: HTTP parsing
auto [method, path, headers] = parse_request(req);
if (method == "POST" && path == "/api/users") {
auto [auth, content_type] = extract_headers(headers);
}
// Factorial at compile time.
// constexpr loop form (C++14+): same results as the recursive version —
// n <= 1 (including negatives) yields 1. Overflows int for n > 12.
constexpr int factorial(int n) {
    int product = 1;
    for (int i = 2; i <= n; ++i) {
        product *= i;
    }
    return product;
}
// Computed at compile time!
constexpr int f10 = factorial(10); // 3628800
// No runtime cost - value baked into binary
// String processing at compile time (C++20)
// NOTE(review): placeholder pseudocode — `return /* result */;` returns
// nothing and will not compile as written.
constexpr auto compile_time_parse(const char* str) {
// Parse, validate, transform at compile time
return /* result */;
}
// Complex example: Compile-time regex (CTRE — a third-party library)
constexpr auto pattern = ctre::match<"[0-9]+">;
// Regex compiled at compile time, not runtime!
// Benefits:
// - Zero runtime overhead
// - Errors caught at compile time
// - Impossible in Go/C# (no compile-time execution)
// Old way: Template errors are cryptic
template<typename T>
void process(T value) {
value.serialize(); // Compile error if no serialize()
// (failure surfaces deep inside the instantiation, not at the call site)
}
// Error: 50 lines of template gibberish!
// C++20: Concepts make requirements explicit
template<typename T>
concept Serializable = requires(T t) {
{ t.serialize() } -> std::same_as<std::string>;
};
// Clear constraint, clear errors
void process(Serializable auto value) {
return value.serialize();
}
// Error: "T does not satisfy Serializable" ✓
// Standard concepts:
void sort_items(std::ranges::random_access_range auto& r) {
std::ranges::sort(r);
}
// Only accepts vectors, arrays, etc. - not lists!
// Accept connections
while (true) {
var socket = await listener.AcceptSocketAsync();
// Handle each connection
// NOTE(review): fire-and-forget — exceptions thrown inside HandleRequest
// are silently lost because the returned Task is discarded.
_ = HandleRequest(socket);
}
// Handle request
async Task HandleRequest(Socket socket) {
using var stream = new NetworkStream(socket); // disposed when the method ends
while (true) {
// Read request (async)
var request = await ReadHttpRequest(stream);
if (request == null) break;
// Process request
var response = MakeResponse(request);
// Write response (async)
await WriteHttpResponse(stream, response);
}
}
// Accept connections
for (;;) {
    // FIX: plain use_awaitable THROWS on error and co_await yields only the
    // socket — destructuring an (ec, socket) tuple requires the as_tuple
    // adapter (asio::as_tuple since Asio 1.24/Boost 1.82; previously
    // asio::experimental::as_tuple).
    auto [ec, socket] = co_await acceptor.async_accept(
        asio::as_tuple(asio::use_awaitable));
    if (!ec) {
        // Spawn handler for each connection
        asio::co_spawn(executor,
            handle_request(std::move(socket)),
            asio::detached);
    }
}
// Handle request
asio::awaitable<void> handle_request(tcp::socket socket) {
    beast::tcp_stream stream(std::move(socket));
    beast::flat_buffer buffer; // FIX: was used but never declared; reused across reads
    for (;;) {
        // FIX: `req` was also undeclared; Beast requires a fresh request
        // object per async_read.
        beast::http::request<beast::http::string_body> req;
        // Read request (async, non-blocking!)
        // FIX: the (ec, bytes) tuple form needs as_tuple — plain
        // use_awaitable throws on error instead of returning an error_code.
        auto [ec, bytes] = co_await beast::http::async_read(
            stream, buffer, req, asio::as_tuple(asio::use_awaitable));
        if (ec) break;
        // Process request
        auto response = make_response(req);
        // Write response (async!)
        co_await beast::http::async_write(
            stream, response, asio::use_awaitable);
    }
}
Both compile to state machines, but C++ compilers can optimize further in specific cases — e.g. eliding the coroutine frame's heap allocation (HALO) when the coroutine's lifetime is fully visible to the caller.
// Your code:
async Task<int> Fetch(string url) {
var resp = await client.GetAsync(url);
var data = await resp.Content.ReadAsStringAsync();
return data.Length;
}
// Generated state machine:
// (simplified sketch of what the C# compiler emits; the real output also
// handles exceptions and the synchronous-completion fast path)
class FetchStateMachine : IAsyncStateMachine {
int state;
string url;
HttpResponseMessage resp;
string data;
TaskAwaiter awaiter1, awaiter2;
void MoveNext() {
switch (state) {
case 0:
awaiter1 = client.GetAsync(url).GetAwaiter();
state = 1;
awaiter1.OnCompleted(MoveNext); // resume here when the task finishes
break;
case 1:
resp = awaiter1.GetResult();
awaiter2 = resp.Content.ReadAsStringAsync()
.GetAwaiter();
state = 2;
awaiter2.OnCompleted(MoveNext);
break;
case 2:
data = awaiter2.GetResult();
result.SetResult(data.Length); // complete the outer Task<int>
break;
}
}
}
// Your code:
task<int> fetch(string url) {
auto resp = co_await client.get(url);
auto data = co_await resp.read_body();
co_return data.size();
}
// Generated coroutine frame:
// (simplified sketch — a real frame also stores the promise object and the
// resume/destroy machinery; `handle`/`promise` below stand in for those)
struct FetchFrame {
int state = 0;
string url;
Response resp;
string data;
awaiter_t awaiter1, awaiter2;
void resume() {
switch (state) {
case 0:
awaiter1 = client.get(url).operator co_await();
state = 1;
awaiter1.await_suspend(handle); // suspend; resume() is called again later
break;
case 1:
resp = awaiter1.await_resume();
awaiter2 = resp.read_body().operator co_await();
state = 2;
awaiter2.await_suspend(handle);
break;
case 2:
data = awaiter2.await_resume();
promise.return_value(data.size()); // delivers the co_return value
break;
}
}
};
Understanding the fundamental difference: Preemptive vs Cooperative scheduling
// Go runtime can interrupt goroutines anywhere
func worker() {
for i := 0; i < 1000000; i++ {
// No explicit yield needed!
// Runtime can preempt here
doWork(i)
}
}
// ⚠️ Anti-pattern example (but won't deadlock!)
// Even infinite loop won't hang runtime
// (asynchronous preemption of tight loops requires Go 1.14+)
func badWorker() {
for {
x := computeSomething() // Runtime preempts
// NOTE(review): x is unused — real Go rejects "declared and not used";
// kept for illustration only.
}
} // Other goroutines still run!
// GOMAXPROCS: M:N scheduling
// 1 million goroutines → 4-8 OS threads
runtime.GOMAXPROCS(8) // Max 8 parallel threads
go worker() // Scheduled by runtime
go worker() // Runtime handles fairness
go worker() // Can't monopolize CPU
// C++ coroutines: MUST explicitly yield
asio::awaitable<void> worker() {
for (int i = 0; i < 1000000; i++) {
// Must co_await to yield!
co_await asio::post(asio::use_awaitable); // reschedule: gives other coroutines a turn
doWork(i);
}
}
// ⚠️ DANGER: Anti-pattern example!
// Infinite loop WILL hang everything!
asio::awaitable<void> badWorker() {
for (;;) {
int x = computeSomething();
// Never yields → blocks entire thread!
}
} // All other coroutines blocked!
// Thread control: Explicit
asio::io_context io; // Single event loop
asio::co_spawn(io, worker(), asio::detached);
asio::co_spawn(io, worker(), asio::detached);
io.run(); // Run event loop (single thread)
Rule of thumb: long-running C++ coroutines must co_await periodically, or the coroutine will block the event loop!
"But Go makes concurrency so easy with goroutines and channels!" — C++ can do that too.
// Buffered channel
ch := make(chan int, 10)
// Producer goroutine
go func() {
for i := 0; i < 100; i++ {
ch <- i // Send to channel
}
close(ch)
}()
// Consumer goroutine
go func() {
for val := range ch { // loop ends once the channel is closed and drained
fmt.Println(val)
}
}()
// Wait for completion
// NOTE(review): sleeping is a race-prone way to wait — prefer sync.WaitGroup.
time.Sleep(1 * time.Second)
// Lightweight: millions of goroutines possible
// Buffered channel
// FIX: Boost.Fibers requires buffered_channel's capacity to be a power of
// two and >= 2 — a capacity of 10 throws fiber_error at construction.
boost::fibers::buffered_channel<int> ch(16);
// Producer fiber
boost::fibers::fiber producer([&ch]() {
    for (int i = 0; i < 100; i++) {
        ch.push(i); // Send to channel
    }
    ch.close(); // wakes the consumer with status `closed`
});
// Consumer fiber
boost::fibers::fiber consumer([&ch]() {
    int val;
    while (ch.pop(val) ==
           boost::fibers::channel_op_status::success) {
        std::cout << val << '\n';
    }
});
// Wait for completion
producer.join();
consumer.join();
// Lightweight: cooperative scheduling, no GC!
// Traditional exception handling
// FIX: the parameter was named `json`, which shadowed the nlohmann `json`
// alias and made the qualified call `json::parse(...)` ill-formed
// ("json is not a class or namespace"). Renamed to `input` — parameter
// names are not part of the call interface, so callers are unaffected.
User parseUser(const std::string& input) {
    try {
        auto data = json::parse(input);
        return User{data["name"], data["age"]};
    } catch (const json::exception&) { // details intentionally collapsed
        throw std::runtime_error("Parse failed");
    }
}
// Usage: Hidden control flow
try {
auto user = parseUser(input);
process(user);
} catch (const std::exception& e) { // catches failures surfaced by parseUser
log_error(e.what());
}
// Problems:
// - Hidden control flow (where can exceptions come from?)
// - Performance cost (stack unwinding)
// - Not clear from signature that it can fail
// - Exception safety is hard to get right
// Explicit error handling (like Rust's Result)
// FIX 1: parameter renamed from `json` — it shadowed the `json` type alias
// and broke the qualified `json::parse` call.
// FIX 2: validate "age" as well as "name"; both are read below.
std::expected<User, ParseError>
parseUser(const std::string& input) {
    // NOTE(review): json::parse still throws on malformed input — consider
    // json::parse(input, nullptr, false) to get a discarded value instead.
    auto data = json::parse(input);
    if (!data.contains("name") || !data.contains("age")) {
        return std::unexpected(ParseError::MissingField);
    }
    return User{data["name"], data["age"]};
}
// Usage: Explicit error handling
auto result = parseUser(input);
if (result) { // expected's operator bool: true on success
process(*result); // Success
} else {
log_error(result.error()); // Explicit handling
}
// Benefits:
// - Clear from signature: can succeed or fail
// - Zero-cost (no stack unwinding)
// - Explicit control flow
// - Composable with and_then, or_else, transform
std::optional<User> findUser(int id);                  // "no value" case
asio::awaitable<std::expected<Data, Error>> fetchData(); // expected composes with coroutines
#include <nlohmann/json.hpp>
using json = nlohmann::json;
// Parse JSON
auto data = json::parse(request_body); // throws json::parse_error on bad input
// Manual extraction - explicit
auto name = data["name"].get<string>();
auto age = data["age"].get<int>();
auto items = data["items"].get<vector<string>>();
// Build response
auto response = json{
{"status", "success"},
{"user", {{"id", 123}, {"name", name}}}
};
return response.dump(); // serialize to a compact JSON string
using System.Text.Json;
// Define model
class User {
public string Name { get; set; }
public int Age { get; set; }
public List<string> Items { get; set; }
}
// Automatic via reflection
// (System.Text.Json binds JSON properties to C# properties; matching is
// case-sensitive unless configured otherwise via JsonSerializerOptions)
var user = JsonSerializer.Deserialize<User>(body);
// Use it
Console.WriteLine(user.Name);
#include <boost/describe.hpp>
#include <boost/json.hpp>
struct User {
std::string name;
int age;
std::vector<std::string> items;
};
// Compile-time metadata
// BOOST_DESCRIBE_STRUCT lists base classes () and members; Boost.JSON's
// value_to uses this description to map JSON keys to members — no runtime
// reflection involved.
BOOST_DESCRIBE_STRUCT(User, (), (name, age, items))
// Automatic deserialization
User user = boost::json::value_to<User>(
boost::json::parse(request_body)
);
using System.Text.Json;
class User {
public string Name { get; set; }
public int Age { get; set; }
public List<string> Items { get; set; }
}
// Runtime reflection
// (inspects the User type's properties at runtime to bind JSON fields)
var user = JsonSerializer.Deserialize<User>(body);
import "encoding/json"
// Struct tags map JSON keys to exported fields at runtime via reflection.
type User struct {
Name string `json:"name"`
Age int `json:"age"`
Items []string `json:"items"`
}
// Runtime reflection via tags
var user User
json.Unmarshal([]byte(body), &user) // NOTE(review): error return ignored here