C++ API
Use Mosaic from C or C++ via the FFI bindings.
Overview
The ffi/ crate generates a shared library (libmosaic_ffi) and a
C header (mosaic.h) via cbindgen.
The C++ header (mosaic.hpp) is a hand-written RAII wrapper on top of the C API.
Building
# Build the FFI shared library
cargo build --release -p mosaic-ffi
# C header generated at include/mosaic.h
# C++ RAII wrapper: include/mosaic.hpp (checked in, not generated)
Linking
Link against the shared library and include the appropriate header:
# macOS
g++ -std=c++17 -I include/ example.cpp \
-L target/release -lmosaic_ffi -o example
# Linux
g++ -std=c++17 -I include/ example.cpp \
-L target/release -lmosaic_ffi -Wl,-rpath,target/release -o example
Writing (C++)
Data is written as Arrow RecordBatches via the
Arrow C Data Interface.
Build your data as Arrow arrays, export via ArrowArray / ArrowSchema,
then pass to the writer:
#include "mosaic.hpp"
#include <cstdint>
#include <cstdio>
#include <memory>
#include <string>
#include <arrow/api.h>
#include <arrow/c/bridge.h>
int main() {
try {
// 1. Set up output stream callbacks
auto* fp = std::fopen("output.mosaic", "wb");
auto file = std::shared_ptr<FILE>(fp, [](FILE* f) { std::fclose(f); });
mosaic::OutputFile cbs;
cbs.write_fn = [file](const uint8_t* data, size_t len) -> int {
size_t written = std::fwrite(data, 1, len, file.get());
return (written == len) ? 0 : -1;
};
cbs.flush_fn = [file]() -> int { return std::fflush(file.get()); };
cbs.get_pos_fn = [file]() -> int64_t { return std::ftell(file.get()); };
// 2. Build an Arrow RecordBatch and write it
arrow::Int32Builder age_builder;
arrow::StringBuilder name_builder;
arrow::DoubleBuilder score_builder;
for (int i = 0; i < 10000; i++) {
name_builder.Append("user_" + std::to_string(i));
age_builder.Append(20 + (i % 50));
score_builder.Append(i * 1.5);
}
auto batch = arrow::RecordBatch::Make(
arrow::schema({
arrow::field("name", arrow::utf8()),
arrow::field("age", arrow::int32()),
arrow::field("score", arrow::float64()),
}),
10000,
{name_builder.Finish().ValueOrDie(),
age_builder.Finish().ValueOrDie(),
score_builder.Finish().ValueOrDie()});
// 3. Export via Arrow C Data Interface, create writer, and write
ArrowArray ffi_array;
ArrowSchema ffi_schema;
arrow::ExportRecordBatch(*batch, &ffi_array, &ffi_schema);
mosaic::WriterOptions opts;
opts.compression = 1; // ZSTD
opts.zstd_level = 1;
opts.num_buckets = 2;
mosaic::Writer writer(std::move(cbs), &ffi_schema, opts);
writer.write(&ffi_array, &ffi_schema);
// 4. Close (also happens on destructor)
writer.close();
} catch (const mosaic::Error& e) {
fprintf(stderr, "Error: %s\n", e.what());
return 1;
}
return 0;
}
C++ API Reference
Writer Options
| Field | Type | Default | Description |
|---|---|---|---|
num_buckets | uint32_t | 0 | Number of buckets (0 = auto) |
compression | uint8_t | 1 | 0=none, 1=zstd |
zstd_level | int32_t | 1 | Zstd compression level |
row_group_max_size | uint64_t | 256 MB | Max row group size |
max_dict_total_bytes | uint32_t | 32 KB | Max dict size per column |
max_dict_entries | uint32_t | 255 | Max dict entries per column |
stats_columns | const char* const* | NULL | Column names to build min/max stats for |
num_stats_columns | uint32_t | 0 | Length of stats_columns array |
page_size_threshold | uint32_t | 32 KB | Min avg column page size to enable paged mode |
Writer Methods
| Method | Return | Description |
|---|---|---|
write(&ffi_array, &ffi_schema) | void | Write an Arrow RecordBatch via C Data Interface |
estimated_file_size() | int64_t | Estimated output file size in bytes (for file rolling) |
close() | void | Flush remaining data and write footer |
num_row_groups() | uint32_t | Number of row groups written (available after close) |
get_row_group_statistics(rg) | vector<ColumnStatistics> | Column statistics for a row group (available after close); each entry has a column_name field |
Reading a File
1. Open the Reader
Construct a Reader by providing an InputFile
with your I/O implementation:
// Example: memory-mapped reader
// Assumes `data` points at the first byte of the mapping and `file_size` is the
// mapping's length in bytes (both come from the surrounding code).
mosaic::InputFile input;
const uint64_t mapped_size = static_cast<uint64_t>(file_size);
input.read_at_fn = [data, mapped_size](uint64_t offset, uint8_t* buf, size_t len) -> int {
    // Refuse reads that would run past the end of the mapping rather than copying
    // out-of-bounds memory. Two comparisons are used so offset + len cannot overflow.
    if (offset > mapped_size || len > mapped_size - offset) {
        return -1;  // non-zero is assumed to signal failure — TODO confirm against mosaic.hpp
    }
    std::memcpy(buf, data + offset, len);
    return 0;
};
auto reader = mosaic::make_reader(std::move(input), file_size);
2. Inspect the Schema
Export the schema via the Arrow C Data Interface and import into Arrow C++:
// Export the reader's schema into a C Data Interface struct, then import that
// struct into an Arrow C++ schema object.
ArrowSchema ffi_schema;
reader.export_schema(&ffi_schema);
// NOTE(review): ImportSchema presumably takes over (releases) ffi_schema, so it should
// not be reused afterwards — confirm against the Arrow C bridge documentation.
auto schema = arrow::ImportSchema(&ffi_schema).ValueOrDie();
Reader Methods
| Method | Return | Description |
|---|---|---|
num_row_groups() | uint32_t | Row group count |
row_group_num_rows(rg) | uint32_t | Number of rows in a specific row group |
export_schema(&ffi_schema) | void | Export schema via Arrow C Data Interface (columns in original input order) |
set_projection(cols, num_cols) | void | Set projection: subsequent reads return only the named columns in the specified order |
read_row_group(rg, &array, &schema) | void | Read a row group (all columns or projected columns if set_projection() was called) |
get_row_group_statistics(rg) | vector<ColumnStatistics> | Column statistics for a row group; each entry has a column_name field |
3. Read Row Groups as Arrow RecordBatch
Each row group is read directly via read_row_group(), which exports the data
via the Arrow C Data Interface
for zero-copy import into Arrow C++:
#include <arrow/c/bridge.h>
#include "mosaic.hpp"
// Read every row group, import it as an arrow::RecordBatch, and print the
// non-null values of its int32 "age" column.
for (uint32_t rg = 0; rg < reader.num_row_groups(); rg++) {
    ArrowArray ffi_array;
    ArrowSchema ffi_schema;
    reader.read_row_group(rg, &ffi_array, &ffi_schema);

    // Import into Arrow C++
    auto batch = arrow::ImportRecordBatch(&ffi_array, &ffi_schema).ValueOrDie();
    // num_rows() is int64_t, which is `long` on LP64 platforms; cast so the
    // argument really matches the %lld conversion.
    printf("row group %u: %lld rows, %d cols\n",
           rg, static_cast<long long>(batch->num_rows()), batch->num_columns());

    // Access columns via Arrow C++ API
    auto age_column = batch->GetColumnByName("age");
    // GetColumnByName yields null when the column is absent, and static_pointer_cast
    // does no type checking, so validate both before downcasting.
    if (age_column == nullptr || age_column->type_id() != arrow::Type::INT32) {
        continue;
    }
    auto ages = std::static_pointer_cast<arrow::Int32Array>(age_column);
    for (int64_t i = 0; i < ages->length(); i++) {
        if (!ages->IsNull(i)) {
            printf("age=%d\n", ages->Value(i));
        }
    }
}
Projection Pushdown
Use set_projection() to select and reorder columns by name.
Only the buckets containing the projected columns are decompressed, reducing
I/O and memory for wide tables. The output preserves the order you specify:
// Only read "name" and "score" columns, in that order
const char* cols[] = { "name", "score" };
// The second argument is the number of entries in cols.
reader.set_projection(cols, 2);
ArrowArray ffi_array;
ArrowSchema ffi_schema;
// With a projection set, read_row_group() returns only the projected columns.
reader.read_row_group(0, &ffi_array, &ffi_schema);
auto batch = arrow::ImportRecordBatch(&ffi_array, &ffi_schema).ValueOrDie();
// batch contains only "name" and "score", in that order
// Empty projection: count-only read (0 columns, row count preserved)
reader.set_projection(nullptr, 0);
// row_group_num_rows() reports the number of rows in row group 0.
uint32_t num_rows = reader.row_group_num_rows(0);
Column Statistics (Filter Pushdown)
When stats columns are configured during writing, statistics are available both from the writer (after close) and from the reader:
// Writing with stats (arrow_schema is an ArrowSchema* from C Data Interface)
// NOTE(review): cbs is presumably a mosaic::OutputFile whose callbacks are already set — confirm.
// stats_cols is passed to the options as a raw pointer, so the array should stay alive
// at least until the Writer has been constructed — TODO confirm the exact lifetime requirement.
const char* stats_cols[] = { "id", "score" };
mosaic::WriterOptions opts;
opts.compression = 1;  // 1 = zstd
opts.stats_columns = stats_cols;
opts.num_stats_columns = 2;  // length of the stats_cols array
mosaic::Writer writer(std::move(cbs), arrow_schema, opts);
// Get stats directly from the writer after close
// (num_row_groups() and get_row_group_statistics() are available on the writer after close)
writer.close();
for (uint32_t rg = 0; rg < writer.num_row_groups(); rg++) {
// One ColumnStatistics entry per column for row group rg.
auto stats = writer.get_row_group_statistics(rg);
for (const auto& stat : stats) {
// stat.column_name identifies which column
uint64_t null_count = stat.null_count;
if (stat.has_min_max()) {
// stat.min_value / stat.max_value are std::vector<uint8_t>
// holding the value as big-endian bytes
}
}
}
// Or read stats from the reader
for (uint32_t rg = 0; rg < reader.num_row_groups(); rg++) {
// Same ColumnStatistics shape as on the writer side.
auto stats = reader.get_row_group_statistics(rg);
for (const auto& stat : stats) {
// stat.column_name identifies which column
uint64_t null_count = stat.null_count;
if (stat.has_min_max()) {
// stat.min_value / stat.max_value are std::vector<uint8_t>
// holding the value as big-endian bytes
}
}
}
ColumnStatistics
| Field | Type | Description |
|---|---|---|
column_name | std::string | Column name |
null_count | uint64_t | Number of null values |
has_min_max() | bool | Whether min/max are available |
min_value | vector<uint8_t> | Min value as big-endian bytes (empty if all-null) |
max_value | vector<uint8_t> | Max value as big-endian bytes (empty if all-null) |
Min/max values are returned as big-endian byte arrays matching the type's wire format (e.g., 4 bytes for INTEGER, 8 bytes for BIGINT/DOUBLE, raw UTF-8 bytes for STRING).
Complete Example
#include "mosaic.hpp"
#include <arrow/api.h>
#include <arrow/c/bridge.h>
#include <cstdio>
#include <cstring>
#include <vector>
int main() {
try {
// 1. Write to a buffer
std::vector<uint8_t> buf;
{
mosaic::OutputFile w_cbs;
w_cbs.write_fn = [&buf](const uint8_t* data, size_t len) -> int {
buf.insert(buf.end(), data, data + len);
return 0;
};
w_cbs.flush_fn = []() -> int { return 0; };
w_cbs.get_pos_fn = [&buf]() -> int64_t {
return static_cast<int64_t>(buf.size());
};
// Build an Arrow RecordBatch
arrow::Int32Builder id_builder;
arrow::StringBuilder name_builder;
arrow::DoubleBuilder score_builder;
for (int i = 0; i < 100; i++) {
id_builder.Append(i);
name_builder.Append("user_" + std::to_string(i));
score_builder.Append(i * 1.5);
}
auto batch = arrow::RecordBatch::Make(
arrow::schema({
arrow::field("id", arrow::int32(), false),
arrow::field("name", arrow::utf8()),
arrow::field("score", arrow::float64()),
}),
100,
{id_builder.Finish().ValueOrDie(),
name_builder.Finish().ValueOrDie(),
score_builder.Finish().ValueOrDie()});
// Export via Arrow C Data Interface and create writer
ArrowArray ffi_array;
ArrowSchema ffi_schema;
arrow::ExportRecordBatch(*batch, &ffi_array, &ffi_schema);
mosaic::WriterOptions w_opts;
w_opts.num_buckets = 2;
mosaic::Writer writer(std::move(w_cbs), &ffi_schema, w_opts);
writer.write(&ffi_array, &ffi_schema);
writer.close();
}
// 2. Read from the buffer
mosaic::InputFile input;
input.read_at_fn = [&buf](uint64_t offset, uint8_t* dst, size_t len) -> int {
std::memcpy(dst, buf.data() + offset, len);
return 0;
};
auto reader = mosaic::make_reader(std::move(input), buf.size());
ArrowArray ffi_array;
ArrowSchema ffi_schema;
reader.read_row_group(0, &ffi_array, &ffi_schema);
auto result = arrow::ImportRecordBatch(&ffi_array, &ffi_schema).ValueOrDie();
auto ids = std::static_pointer_cast<arrow::Int32Array>(result->GetColumnByName("id"));
auto names = std::static_pointer_cast<arrow::StringArray>(result->GetColumnByName("name"));
auto scores = std::static_pointer_cast<arrow::DoubleArray>(result->GetColumnByName("score"));
} catch (const mosaic::Error& e) {
fprintf(stderr, "Error: %s\n", e.what());
return 1;
}
return 0;
}
Use set_projection() to select and reorder output columns.
The handle types (Writer, Reader) are move-only RAII types. Resources are freed automatically when objects go out of scope.