Java API
Write and read Mosaic files from Java using Arrow Java (VectorSchemaRoot).
Setup
The Java API lives in the java/ directory and depends on the
mosaic_jni native library. Build the native library first:
cargo build --release -p mosaic-jni
Ensure libmosaic_jni.dylib (macOS) or libmosaic_jni.so (Linux)
is on java.library.path. The library is loaded automatically via:
System.loadLibrary("mosaic_jni");
Add the Arrow Java dependencies to your pom.xml:
<!-- Arrow BOM for version management -->
<dependencyManagement>
<dependencies>
<dependency>
<groupId>org.apache.arrow</groupId>
<artifactId>arrow-bom</artifactId>
<version>15.0.0</version>
<type>pom</type>
<scope>import</scope>
</dependency>
</dependencies>
</dependencyManagement>
<dependencies>
<dependency>
<groupId>org.apache.arrow</groupId>
<artifactId>arrow-vector</artifactId>
</dependency>
<dependency>
<groupId>org.apache.arrow</groupId>
<artifactId>arrow-memory-netty</artifactId>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>org.apache.arrow</groupId>
<artifactId>arrow-c-data</artifactId>
</dependency>
</dependencies>
Writing a File
1. Define an Arrow Schema
import org.apache.arrow.vector.types.pojo.*;
import org.apache.arrow.vector.types.*;
Schema arrowSchema = new Schema(Arrays.asList(
Field.notNullable("id", new ArrowType.Int(32, true)),
Field.nullable("name", ArrowType.Utf8.INSTANCE),
Field.nullable("score", new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE)),
Field.nullable("amount", new ArrowType.Decimal(10, 2, 128)),
Field.nullable("ts", new ArrowType.Timestamp(TimeUnit.MILLISECOND, null))
));
2. Create Writer and Write Batches
Build Arrow VectorSchemaRoot objects with Arrow Java and pass them
directly to write(). Data is transferred via the
Arrow C Data Interface
for zero-copy interop with the native library:
BufferAllocator allocator = new RootAllocator();
Schema arrowSchema = new Schema(Arrays.asList(
Field.notNullable("id", new ArrowType.Int(32, true)),
Field.nullable("name", ArrowType.Utf8.INSTANCE),
Field.nullable("score", new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE))
));
WriterOptions opts = new WriterOptions()
.numBuckets(2);
ByteArrayOutputStream baos = new ByteArrayOutputStream();
try (MosaicWriter writer = new MosaicWriter(baos, arrowSchema, opts, allocator);
VectorSchemaRoot root = VectorSchemaRoot.create(arrowSchema, allocator)) {
IntVector ids = (IntVector) root.getVector("id");
VarCharVector names = (VarCharVector) root.getVector("name");
Float8Vector scores = (Float8Vector) root.getVector("score");
ids.allocateNew(1000);
names.allocateNew(1000);
scores.allocateNew(1000);
for (int i = 0; i < 1000; i++) {
ids.set(i, i);
names.setSafe(i, ("user_" + i).getBytes());
scores.set(i, i * 1.5);
}
root.setRowCount(1000);
writer.write(root);
}
byte[] data = baos.toByteArray();
WriterOptions
| Option | Default | Description |
|---|---|---|
| numBuckets(int) | 0 | Number of buckets (0 = auto) |
| compression(int) | ZSTD (1) | 0 = none, 1 = Zstd |
| zstdLevel(int) | 1 | Zstd compression level |
| rowGroupMaxSize(long) | 256 MB | Max uncompressed bytes per row group |
| maxDictTotalBytes(int) | 32 KB | Max dictionary size per column |
| maxDictEntries(int) | 255 | Max distinct values for DICT encoding |
| statsColumns(String...) | (empty) | Column names to build min/max stats for filter pushdown |
| pageSizeThreshold(int) | 32 KB | Min avg column page size to enable paged mode (per-column compression) |
Writer Methods
| Method | Return | Description |
|---|---|---|
| write(VectorSchemaRoot) | void | Write an Arrow batch (zero-copy via C Data Interface) |
| estimatedFileSize() | long | Estimated output file size in bytes (for file rolling) |
| close() | void | Flush remaining data and write footer |
| numRowGroups() | int | Number of row groups written (available after close) |
| getRowGroupStatistics(rg) | Map<String, ColumnStatistics> | Column statistics for a row group, keyed by column name (available after close) |
Reading a File
1. Open the Reader
The reader accepts any InputFile implementation (inspired by Parquet's InputFile),
giving you full control over the I/O source — local files, memory-mapped buffers,
remote storage adapters, etc.:
public interface InputFile {
void readFully(long position, byte[] buffer, int offset, int length) throws IOException;
}
BufferAllocator allocator = new RootAllocator();
InputFile inputFile = ...;
long fileLength = ...;
MosaicReader reader = MosaicReader.open(inputFile, fileLength, allocator);
2. Inspect the Schema
The reader exposes the file schema as a standard Arrow Schema object.
Columns are in the original input order by default:
Schema schema = reader.getSchema();
for (Field field : schema.getFields()) {
System.out.printf("name=%s type=%s nullable=%b%n",
field.getName(), field.getType(), field.isNullable());
}
Reader Methods
| Method | Return | Description |
|---|---|---|
| getSchema() | Schema | Arrow Schema for the file (columns in original input order) |
| numRowGroups() | int | Row group count |
| rowGroupNumRows(rg) | int | Number of rows in a specific row group |
| project(String[]) | void | Set projection: subsequent reads return only the named columns in the specified order |
| readRowGroup(rg, allocator) | VectorSchemaRoot | Read a row group (all columns or projected columns if project() was called) |
| getRowGroupStatistics(rg) | Map<String, ColumnStatistics> | Column statistics for a row group, keyed by column name |
3. Read Row Groups as Arrow VectorSchemaRoot
Each row group is read directly as an Arrow VectorSchemaRoot via
readRowGroup():
for (int rg = 0; rg < reader.numRowGroups(); rg++) {
try (VectorSchemaRoot batch = reader.readRowGroup(rg, allocator)) {
System.out.println("rows: " + batch.getRowCount());
IntVector ids = (IntVector) batch.getVector("id");
VarCharVector names = (VarCharVector) batch.getVector("name");
for (int i = 0; i < batch.getRowCount(); i++) {
if (!names.isNull(i)) {
System.out.printf("id=%d name=%s%n",
ids.get(i), new String(names.get(i)));
}
}
}
}
Projection Pushdown
Use project() to select and reorder columns by name.
Only the buckets containing the projected columns are decompressed, significantly
reducing I/O and memory for wide tables. The output preserves the order you specify:
// Only read "name" and "score" columns, in that order
reader.project(new String[]{"name", "score"});
try (VectorSchemaRoot batch = reader.readRowGroup(rg, allocator)) {
// batch contains only "name" and "score", in that order
}
// Empty projection: count-only read (0 columns, row count preserved)
reader.project(new String[]{});
int rowCount = reader.rowGroupNumRows(rg);
Column Statistics (Filter Pushdown)
When stats columns are configured during writing, statistics are available both from the writer (after close) and from the reader. This allows you to obtain min/max statistics immediately after writing without re-reading the file:
WriterOptions opts = new WriterOptions()
.statsColumns("id", "score"); // build stats for "id" and "score" columns
// Get stats directly from the writer after close
MosaicWriter writer = new MosaicWriter(baos, arrowSchema, opts, allocator);
writer.write(root);
writer.close();
for (int rg = 0; rg < writer.numRowGroups(); rg++) {
Map<String, ColumnStatistics> stats = writer.getRowGroupStatistics(rg);
ColumnStatistics idStats = stats.get("id");
// use stats for indexing, metadata, etc.
}
The reader can also access per-row-group statistics to skip row groups that don't match a filter predicate:
// Reading stats from the reader
for (int rg = 0; rg < reader.numRowGroups(); rg++) {
Map<String, ColumnStatistics> stats = reader.getRowGroupStatistics(rg);
for (Map.Entry<String, ColumnStatistics> entry : stats.entrySet()) {
String colName = entry.getKey();
ColumnStatistics stat = entry.getValue();
long nullCount = stat.getNullCount();
if (stat.hasMinMax()) {
byte[] min = stat.getMin(); // big-endian wire format
byte[] max = stat.getMax();
}
}
}
ColumnStatistics
| Method | Return | Description |
|---|---|---|
| getNullCount() | long | Number of null values |
| hasMinMax() | boolean | Whether min/max are available |
| getMin() | byte[] | Min value as big-endian bytes (null if all-null) |
| getMax() | byte[] | Max value as big-endian bytes (null if all-null) |
Min/max values are returned as big-endian byte arrays matching the type's wire format (e.g., 4 bytes for INTEGER, 8 bytes for BIGINT/DOUBLE, raw UTF-8 bytes for STRING).
Complete Example
import org.apache.paimon.mosaic.*;
import org.apache.arrow.memory.*;
import org.apache.arrow.vector.*;
import org.apache.arrow.vector.types.pojo.*;
import org.apache.arrow.vector.types.*;
import java.io.*;
import java.util.*;
// 1. Write
BufferAllocator allocator = new RootAllocator();
Schema arrowSchema = new Schema(Arrays.asList(
Field.nullable("id", new ArrowType.Int(32, true)),
Field.nullable("name", ArrowType.Utf8.INSTANCE),
Field.nullable("score", new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE))
));
WriterOptions opts = new WriterOptions()
.numBuckets(2);
ByteArrayOutputStream baos = new ByteArrayOutputStream();
try (MosaicWriter writer = new MosaicWriter(baos, arrowSchema, opts, allocator);
VectorSchemaRoot root = VectorSchemaRoot.create(arrowSchema, allocator)) {
IntVector ids = (IntVector) root.getVector("id");
VarCharVector names = (VarCharVector) root.getVector("name");
Float8Vector scores = (Float8Vector) root.getVector("score");
ids.allocateNew(100);
names.allocateNew(100);
scores.allocateNew(100);
for (int i = 0; i < 100; i++) {
ids.set(i, i);
names.setSafe(i, ("user_" + i).getBytes());
scores.set(i, i * 1.5);
}
root.setRowCount(100);
writer.write(root);
}
byte[] data = baos.toByteArray();
// 2. Read
MosaicReader reader = MosaicReader.open(
(pos, buf, off, len) -> System.arraycopy(data, (int) pos, buf, off, len),
data.length,
allocator);
try (reader) {
for (int rg = 0; rg < reader.numRowGroups(); rg++) {
try (VectorSchemaRoot batch = reader.readRowGroup(rg, allocator)) {
IntVector readIds = (IntVector) batch.getVector(0);
VarCharVector readNames = (VarCharVector) batch.getVector(1);
Float8Vector readScores = (Float8Vector) batch.getVector(2);
for (int i = 0; i < batch.getRowCount(); i++) {
System.out.printf("id=%d name=%s score=%.1f%n",
readIds.get(i),
new String(readNames.get(i)),
readScores.get(i));
}
}
}
}
Use project() to select and reorder output columns.
Both MosaicWriter and MosaicReader
implement AutoCloseable. Use try-with-resources to ensure native memory
is freed promptly. The VectorSchemaRoot returned by readRowGroup()
should also be closed when done.