From 10e1756c2c58d70dad662686a0934023ab99d479 Mon Sep 17 00:00:00 2001 From: Jonathan Shook Date: Wed, 20 Aug 2025 18:35:07 +0000 Subject: [PATCH 01/29] virtualize DataSet --- .../github/jbellis/jvector/example/Bench.java | 4 +- .../github/jbellis/jvector/example/Grid.java | 14 +- .../example/benchmarks/AccuracyBenchmark.java | 6 +- .../example/benchmarks/CountBenchmark.java | 2 +- .../benchmarks/ExecutionTimeBenchmark.java | 2 +- .../example/benchmarks/LatencyBenchmark.java | 2 +- .../example/benchmarks/QueryExecutor.java | 2 +- .../benchmarks/ThroughputBenchmark.java | 2 +- .../example/util/CompressorParameters.java | 4 +- .../jbellis/jvector/example/util/DataSet.java | 148 +----------------- .../jvector/example/util/DataSetCreator.java | 2 +- .../jvector/example/util/Hdf5Loader.java | 2 +- .../example/util/MultiFileDatasource.java | 2 +- .../jvector/example/yaml/Compression.java | 2 +- .../jvector/microbench/GraphBuildBench.java | 6 +- 15 files changed, 34 insertions(+), 166 deletions(-) diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java index 4623cbe9d..2b1ccc330 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java @@ -57,7 +57,7 @@ public static void main(String[] args) throws IOException { List> buildCompression = Arrays.asList( ds -> new PQParameters(ds.getDimension() / 8, 256, - ds.similarityFunction == VectorSimilarityFunction.EUCLIDEAN, + ds.getSimilarityFunction() == VectorSimilarityFunction.EUCLIDEAN, UNWEIGHTED), __ -> CompressorParameters.NONE ); @@ -66,7 +66,7 @@ public static void main(String[] args) throws IOException { // ds -> new CompressorParameters.BQParameters(), ds -> new PQParameters(ds.getDimension() / 8, 256, - ds.similarityFunction == VectorSimilarityFunction.EUCLIDEAN, + ds.getSimilarityFunction() == 
VectorSimilarityFunction.EUCLIDEAN, UNWEIGHTED) ); List> featureSets = Arrays.asList( diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java index a4d62645f..a2c8f0d61 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java @@ -175,7 +175,7 @@ static void runOneGraph(List> featureSets, } else { long start = System.nanoTime(); cv = compressor.encodeAll(ds.getBaseRavv()); - System.out.format("%s encoded %d vectors [%.2f MB] in %.2fs%n", compressor, ds.baseVectors.size(), (cv.ramBytesUsed() / 1024f / 1024f), (System.nanoTime() - start) / 1_000_000_000.0); + System.out.format("%s encoded %d vectors [%.2f MB] in %.2fs%n", compressor, ds.getBaseVectors().size(), (cv.ramBytesUsed() / 1024f / 1024f), (System.nanoTime() - start) / 1_000_000_000.0); } indexes.forEach((features, index) -> { @@ -211,7 +211,7 @@ private static Map, ImmutableGraphIndex> buildOnDisk(List, ImmutableGraphIndex> buildInMemory(List, ImmutableGraphIndex> indexes = new HashMap<>(); long start; - var bsp = BuildScoreProvider.randomAccessScoreProvider(floatVectors, ds.similarityFunction); + var bsp = BuildScoreProvider.randomAccessScoreProvider(floatVectors, ds.getSimilarityFunction()); GraphIndexBuilder builder = new GraphIndexBuilder(bsp, floatVectors.dimension(), M, @@ -675,17 +675,17 @@ public static class ConfiguredSystem implements AutoCloseable { public SearchScoreProvider scoreProviderFor(VectorFloat queryVector, ImmutableGraphIndex.View view) { // if we're not compressing then just use the exact score function if (cv == null) { - return DefaultSearchScoreProvider.exact(queryVector, ds.similarityFunction, ds.getBaseRavv()); + return DefaultSearchScoreProvider.exact(queryVector, ds.getSimilarityFunction(), ds.getBaseRavv()); } var scoringView = (ImmutableGraphIndex.ScoringView) 
view; ScoreFunction.ApproximateScoreFunction asf; if (features.contains(FeatureId.FUSED_ADC)) { - asf = scoringView.approximateScoreFunctionFor(queryVector, ds.similarityFunction); + asf = scoringView.approximateScoreFunctionFor(queryVector, ds.getSimilarityFunction()); } else { - asf = cv.precomputedScoreFunctionFor(queryVector, ds.similarityFunction); + asf = cv.precomputedScoreFunctionFor(queryVector, ds.getSimilarityFunction()); } - var rr = scoringView.rerankerFor(queryVector, ds.similarityFunction); + var rr = scoringView.rerankerFor(queryVector, ds.getSimilarityFunction()); return new DefaultSearchScoreProvider(asf, rr); } diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/AccuracyBenchmark.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/AccuracyBenchmark.java index a99aca6f8..4cb72d1a5 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/AccuracyBenchmark.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/AccuracyBenchmark.java @@ -88,7 +88,7 @@ public List runBenchmark( throw new RuntimeException("At least one metric must be displayed"); } - int totalQueries = cs.getDataSet().queryVectors.size(); + int totalQueries = cs.getDataSet().getQueryVectors().size(); // execute all queries in parallel and collect results List results = IntStream.range(0, totalQueries) @@ -101,14 +101,14 @@ public List runBenchmark( if (computeRecall) { // compute recall for this run double recall = AccuracyMetrics.recallFromSearchResults( - cs.getDataSet().groundTruth, results, topK, topK + cs.getDataSet().getGroundTruth(), results, topK, topK ); list.add(Metric.of("Recall@" + topK, formatRecall, recall)); } if (computeMAP) { // compute recall for this run double map = AccuracyMetrics.meanAveragePrecisionAtK( - cs.getDataSet().groundTruth, results, topK + cs.getDataSet().getGroundTruth(), results, topK ); list.add(Metric.of("MAP@" + topK, 
formatMAP, map)); } diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/CountBenchmark.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/CountBenchmark.java index d4fe68456..cd5d228c2 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/CountBenchmark.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/CountBenchmark.java @@ -105,7 +105,7 @@ public List runBenchmark( LongAdder nodesVisited = new LongAdder(); LongAdder nodesExpanded = new LongAdder(); LongAdder nodesExpandedBaseLayer = new LongAdder(); - int totalQueries = cs.getDataSet().queryVectors.size(); + int totalQueries = cs.getDataSet().getQueryVectors().size(); for (int run = 0; run < queryRuns; run++) { IntStream.range(0, totalQueries) diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/ExecutionTimeBenchmark.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/ExecutionTimeBenchmark.java index 449a8409f..9872142d5 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/ExecutionTimeBenchmark.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/ExecutionTimeBenchmark.java @@ -56,7 +56,7 @@ public List runBenchmark( boolean usePruning, int queryRuns) { - int totalQueries = cs.getDataSet().queryVectors.size(); + int totalQueries = cs.getDataSet().getQueryVectors().size(); double totalRuntime = 0; for (int run = 0; run < queryRuns; run++) { diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/LatencyBenchmark.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/LatencyBenchmark.java index 861a8d2be..eefc5ee5c 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/LatencyBenchmark.java +++ 
b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/LatencyBenchmark.java @@ -104,7 +104,7 @@ public List runBenchmark( throw new IllegalArgumentException("At least one parameter must be set to true"); } - int totalQueries = cs.getDataSet().queryVectors.size(); + int totalQueries = cs.getDataSet().getQueryVectors().size(); double mean = 0.0; double m2 = 0.0; int count = 0; diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/QueryExecutor.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/QueryExecutor.java index 9ec728808..3c202c28b 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/QueryExecutor.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/QueryExecutor.java @@ -33,7 +33,7 @@ public class QueryExecutor { * @return the SearchResult for query i. */ public static SearchResult executeQuery(ConfiguredSystem cs, int topK, int rerankK, boolean usePruning, int i) { - var queryVector = cs.getDataSet().queryVectors.get(i); + var queryVector = cs.getDataSet().getQueryVectors().get(i); var searcher = cs.getSearcher(); searcher.usePruning(usePruning); var sf = cs.scoreProviderFor(queryVector, searcher.getView()); diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/ThroughputBenchmark.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/ThroughputBenchmark.java index 27b99fa71..c00893fa5 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/ThroughputBenchmark.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/ThroughputBenchmark.java @@ -137,7 +137,7 @@ public List runBenchmark( throw new RuntimeException("At least one metric must be displayed"); } - int totalQueries = cs.getDataSet().queryVectors.size(); + int totalQueries = cs.getDataSet().getQueryVectors().size(); 
int dim = cs.getDataSet().getDimension(); // Warmup Phase with diagnostics diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/CompressorParameters.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/CompressorParameters.java index e1ffebb9b..f84b69938 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/CompressorParameters.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/CompressorParameters.java @@ -55,7 +55,7 @@ public VectorCompressor computeCompressor(DataSet ds) { @Override public String idStringFor(DataSet ds) { - return String.format("PQ_%s_%d_%d_%s_%s", ds.name, m, k, isCentered, anisotropicThreshold); + return String.format("PQ_%s_%d_%d_%s_%s", ds.getName(), m, k, isCentered, anisotropicThreshold); } @Override @@ -85,7 +85,7 @@ public VectorCompressor computeCompressor(DataSet ds) { @Override public String idStringFor(DataSet ds) { - return String.format("NVQ_%s_%d_%s", ds.name, nSubVectors); + return String.format("NVQ_%s_%d_%s", ds.getName(), nSubVectors); } @Override diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSet.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSet.java index e193cd6ad..4b39ad23b 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSet.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSet.java @@ -16,151 +16,19 @@ package io.github.jbellis.jvector.example.util; -import io.github.jbellis.jvector.graph.ListRandomAccessVectorValues; import io.github.jbellis.jvector.graph.RandomAccessVectorValues; import io.github.jbellis.jvector.vector.VectorSimilarityFunction; -import io.github.jbellis.jvector.vector.VectorUtil; import io.github.jbellis.jvector.vector.types.VectorFloat; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.HashSet; import java.util.List; 
-import java.util.Map; -import java.util.Set; -import java.util.TreeSet; -public class DataSet { - public final String name; - public final VectorSimilarityFunction similarityFunction; - public final List> baseVectors; - public final List> queryVectors; - public final List> groundTruth; - private RandomAccessVectorValues baseRavv; +public interface DataSet { + String getName(); + VectorSimilarityFunction getSimilarityFunction(); + List> getBaseVectors(); + List> getQueryVectors(); + List> getGroundTruth(); + int getDimension(); + RandomAccessVectorValues getBaseRavv(); - public DataSet(String name, - VectorSimilarityFunction similarityFunction, - List> baseVectors, - List> queryVectors, - List> groundTruth) - { - if (baseVectors.isEmpty()) { - throw new IllegalArgumentException("Base vectors must not be empty"); - } - if (queryVectors.isEmpty()) { - throw new IllegalArgumentException("Query vectors must not be empty"); - } - if (groundTruth.isEmpty()) { - throw new IllegalArgumentException("Ground truth vectors must not be empty"); - } - - if (baseVectors.get(0).length() != queryVectors.get(0).length()) { - throw new IllegalArgumentException("Base and query vectors must have the same dimensionality"); - } - if (queryVectors.size() != groundTruth.size()) { - throw new IllegalArgumentException("Query and ground truth lists must be the same size"); - } - - this.name = name; - this.similarityFunction = similarityFunction; - this.baseVectors = baseVectors; - this.queryVectors = queryVectors; - this.groundTruth = groundTruth; - - System.out.format("%n%s: %d base and %d query vectors created, dimensions %d%n", - name, baseVectors.size(), queryVectors.size(), baseVectors.get(0).length()); - } - - /** - * Return a dataset containing the given vectors, scrubbed free from zero vectors and normalized to unit length. - * Note: This only scrubs and normalizes for dot product similarity. 
- */ - public static DataSet getScrubbedDataSet(String pathStr, - VectorSimilarityFunction vsf, - List> baseVectors, - List> queryVectors, - List> groundTruth) - { - // remove zero vectors and duplicates, noting that this will change the indexes of the ground truth answers - List> scrubbedBaseVectors; - List> scrubbedQueryVectors; - List> gtSet; - scrubbedBaseVectors = new ArrayList<>(baseVectors.size()); - scrubbedQueryVectors = new ArrayList<>(queryVectors.size()); - gtSet = new ArrayList<>(groundTruth.size()); - var uniqueVectors = new TreeSet>((a, b) -> { - assert a.length() == b.length(); - for (int i = 0; i < a.length(); i++) { - if (a.get(i) < b.get(i)) { - return -1; - } - if (a.get(i) > b.get(i)) { - return 1; - } - } - return 0; - }); - Map rawToScrubbed = new HashMap<>(); - { - int j = 0; - for (int i = 0; i < baseVectors.size(); i++) { - VectorFloat v = baseVectors.get(i); - var valid = (vsf == VectorSimilarityFunction.EUCLIDEAN) || Math.abs(normOf(v)) > 1e-5; - if (valid && uniqueVectors.add(v)) { - scrubbedBaseVectors.add(v); - rawToScrubbed.put(i, j++); - } - } - } - // also remove zero query vectors and query vectors that are present in the base set - for (int i = 0; i < queryVectors.size(); i++) { - VectorFloat v = queryVectors.get(i); - var valid = (vsf == VectorSimilarityFunction.EUCLIDEAN) || Math.abs(normOf(v)) > 1e-5; - var dupe = uniqueVectors.contains(v); - if (valid && !dupe) { - scrubbedQueryVectors.add(v); - var gt = new ArrayList(); - for (int j : groundTruth.get(i)) { - gt.add(rawToScrubbed.get(j)); - } - gtSet.add(gt); - } - } - - // now that the zero vectors are removed, we can normalize if it looks like they aren't already - if (vsf == VectorSimilarityFunction.DOT_PRODUCT) { - if (Math.abs(normOf(baseVectors.get(0)) - 1.0) > 1e-5) { - normalizeAll(scrubbedBaseVectors); - normalizeAll(scrubbedQueryVectors); - } - } - - assert scrubbedQueryVectors.size() == gtSet.size(); - return new DataSet(pathStr, vsf, scrubbedBaseVectors, 
scrubbedQueryVectors, gtSet); - } - - private static void normalizeAll(Iterable> vectors) { - for (VectorFloat v : vectors) { - VectorUtil.l2normalize(v); - } - } - - private static float normOf(VectorFloat baseVector) { - float norm = 0; - for (int i = 0; i < baseVector.length(); i++) { - norm += baseVector.get(i) * baseVector.get(i); - } - return (float) Math.sqrt(norm); - } - - public int getDimension() { - return baseVectors.get(0).length(); - } - - public RandomAccessVectorValues getBaseRavv() { - if (baseRavv == null) { - baseRavv = new ListRandomAccessVectorValues(baseVectors, getDimension()); - } - return baseRavv; - } } diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetCreator.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetCreator.java index 1cd532160..40a709f6a 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetCreator.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetCreator.java @@ -73,6 +73,6 @@ public static DataSet create2DGrid(int nPoints, int nQueries, int topK) { var groundTruth = queries.stream().map(Map.Entry::getValue).collect(Collectors.toList()); String name = "2D" + gridWidth; - return new DataSet(name, VectorSimilarityFunction.EUCLIDEAN, baseVectors, queryVectors, groundTruth); + return new SimpleDataSet(name, VectorSimilarityFunction.EUCLIDEAN, baseVectors, queryVectors, groundTruth); } } diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/Hdf5Loader.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/Hdf5Loader.java index 7dfdccc07..baca10f5f 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/Hdf5Loader.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/Hdf5Loader.java @@ -82,6 +82,6 @@ else if (filename.contains("-euclidean")) { } } - return 
DataSet.getScrubbedDataSet(path.getFileName().toString(), similarityFunction, Arrays.asList(baseVectors), Arrays.asList(queryVectors), gtSets); + return DataSetUtils.getScrubbedDataSet(path.getFileName().toString(), similarityFunction, Arrays.asList(baseVectors), Arrays.asList(queryVectors), gtSets); } } diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/MultiFileDatasource.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/MultiFileDatasource.java index 6f875e23c..8bba2bd88 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/MultiFileDatasource.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/MultiFileDatasource.java @@ -51,7 +51,7 @@ public DataSet load() throws IOException { var baseVectors = SiftLoader.readFvecs("fvec/" + basePath); var queryVectors = SiftLoader.readFvecs("fvec/" + queriesPath); var gtVectors = SiftLoader.readIvecs("fvec/" + groundTruthPath); - return DataSet.getScrubbedDataSet(name, VectorSimilarityFunction.COSINE, baseVectors, queryVectors, gtVectors); + return DataSetUtils.getScrubbedDataSet(name, VectorSimilarityFunction.COSINE, baseVectors, queryVectors, gtVectors); } public static Map byName = new HashMap<>() {{ diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/Compression.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/Compression.java index fe7d4d82d..ef1013e80 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/Compression.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/Compression.java @@ -42,7 +42,7 @@ public Function getCompressorParameters() { return ds -> { boolean centerData; if (strCenterData == null) { - centerData = ds.similarityFunction == VectorSimilarityFunction.EUCLIDEAN; + centerData = ds.getSimilarityFunction() == VectorSimilarityFunction.EUCLIDEAN; } else { centerData = 
strCenterData.equals("Yes");; } diff --git a/jvector-tests/src/test/java/io/github/jbellis/jvector/microbench/GraphBuildBench.java b/jvector-tests/src/test/java/io/github/jbellis/jvector/microbench/GraphBuildBench.java index 9db25e642..df699680a 100644 --- a/jvector-tests/src/test/java/io/github/jbellis/jvector/microbench/GraphBuildBench.java +++ b/jvector-tests/src/test/java/io/github/jbellis/jvector/microbench/GraphBuildBench.java @@ -45,7 +45,7 @@ public static class Parameters { public Parameters() { this.ds = Hdf5Loader.load("hdf5/glove-100-angular.hdf5"); - this.ravv = new ListRandomAccessVectorValues(ds.baseVectors, ds.baseVectors.get(0).length()); + this.ravv = new ListRandomAccessVectorValues(ds.getBaseVectors(), ds.getBaseVectors().get(0).length()); } } @@ -54,7 +54,7 @@ public Parameters() { @OutputTimeUnit(TimeUnit.SECONDS) public void testGraphBuild(Blackhole bh, Parameters p) { long start = System.nanoTime(); - GraphIndexBuilder graphIndexBuilder = new GraphIndexBuilder(p.ravv, p.ds.similarityFunction, 8, 60, 1.2f, 1.4f, false); + GraphIndexBuilder graphIndexBuilder = new GraphIndexBuilder(p.ravv, p.ds.getSimilarityFunction(), 8, 60, 1.2f, 1.4f, false); graphIndexBuilder.build(p.ravv); System.out.format("Build M=%d ef=%d in %.2fs%n", 32, 600, (System.nanoTime() - start) / 1_000_000_000.0); @@ -65,7 +65,7 @@ public void testGraphBuild(Blackhole bh, Parameters p) { @OutputTimeUnit(TimeUnit.SECONDS) public void testGraphBuildWithHierarchy(Blackhole bh, Parameters p) { long start = System.nanoTime(); - GraphIndexBuilder graphIndexBuilder = new GraphIndexBuilder(p.ravv, p.ds.similarityFunction, 8, 60, 1.2f, 1.4f, true); + GraphIndexBuilder graphIndexBuilder = new GraphIndexBuilder(p.ravv, p.ds.getSimilarityFunction(), 8, 60, 1.2f, 1.4f, true); graphIndexBuilder.build(p.ravv); System.out.format("Build M=%d ef=%d in %.2fs%n", 32, 600, (System.nanoTime() - start) / 1_000_000_000.0); From 36ab6de65b75e6bf336c450de53d648ec3ce7d1a Mon Sep 17 00:00:00 2001 From: 
Jonathan Shook Date: Wed, 20 Aug 2025 18:35:27 +0000 Subject: [PATCH 02/29] wrap DataSet --- .../jvector/example/TestDataViewWrapper.java | 74 +++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 jvector-examples/src/main/java/io/github/jbellis/jvector/example/TestDataViewWrapper.java diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/TestDataViewWrapper.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/TestDataViewWrapper.java new file mode 100644 index 000000000..213648443 --- /dev/null +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/TestDataViewWrapper.java @@ -0,0 +1,74 @@ +package io.github.jbellis.jvector.example; + +import io.github.jbellis.jvector.example.util.DataSet; +import io.github.jbellis.jvector.example.util.FloatVectorsWrapper; +import io.github.jbellis.jvector.graph.RandomAccessVectorValues; +import io.github.jbellis.jvector.vector.VectorSimilarityFunction; +import io.github.jbellis.jvector.vector.VectorizationProvider; +import io.github.jbellis.jvector.vector.types.VectorFloat; +import io.github.jbellis.jvector.vector.types.VectorTypeSupport; +import io.nosqlbench.vectordata.discovery.TestDataView; +import io.nosqlbench.vectordata.spec.datasets.types.NeighborIndices; +import io.nosqlbench.vectordata.spec.datasets.types.QueryVectors; + +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; + +public class TestDataViewWrapper implements DataSet { + public final TestDataView view; + private static final VectorTypeSupport vts = VectorizationProvider.getInstance().getVectorTypeSupport(); + + public TestDataViewWrapper(TestDataView view) { + this.view = view; + } + + @Override + public String getName() { + return view.getName(); + } + + @Override + public VectorSimilarityFunction getSimilarityFunction() { + var df = view.getDistanceFunction(); + return switch (df) { + case EUCLIDEAN -> VectorSimilarityFunction.EUCLIDEAN; + case COSINE -> 
VectorSimilarityFunction.COSINE; + case DOT_PRODUCT -> VectorSimilarityFunction.DOT_PRODUCT; + default -> throw new IllegalArgumentException("Unknown distance function " + df); + }; + } + + @Override + public List> getBaseVectors() { + throw new RuntimeException("This method should not be called. Use getBaseRavv() instead."); + } + + @Override + public List> getQueryVectors() { + QueryVectors queryVectors = view.getQueryVectors().orElseThrow(() -> new RuntimeException("unable to load query vectors")); + ArrayList> vectorFlaots = new ArrayList<>(queryVectors.getCount()); + for (float[] qv : queryVectors) { + vectorFlaots.add(vts.createFloatVector(qv)); + } + return vectorFlaots; + + } + + @Override + public List> getGroundTruth() { + Optional gt = view.getNeighborIndices(); + + return List.of(); + } + + @Override + public int getDimension() { + return view.getBaseVectors().get().getVectorDimensions(); + } + + @Override + public RandomAccessVectorValues getBaseRavv() { + return view.getBaseVectors().map(FloatVectorsWrapper::new).orElseThrow(() -> new RuntimeException("unable to load float vectors")); + } +} From 9bd9e547ba96ee83966e1d0b10d3a6e12a7a6ffd Mon Sep 17 00:00:00 2001 From: Jonathan Shook Date: Wed, 20 Aug 2025 18:35:58 +0000 Subject: [PATCH 03/29] virtualize loader merged complimentary logic between branches --- .../jbellis/jvector/example/BenchYAML.java | 12 ++++- .../jvector/example/util/DataSetLoader.java | 53 +++++++++++++++---- .../jvector/example/util/DataSetSource.java | 23 ++++++++ 3 files changed, 76 insertions(+), 12 deletions(-) create mode 100644 jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetSource.java diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java index e81a84863..cbe9e5a34 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java +++ 
b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java @@ -18,6 +18,7 @@ import io.github.jbellis.jvector.example.util.DataSet; import io.github.jbellis.jvector.example.util.DataSetLoader; +import io.github.jbellis.jvector.example.util.DataSetSource; import io.github.jbellis.jvector.example.yaml.DatasetCollection; import io.github.jbellis.jvector.example.yaml.MultiConfig; @@ -45,6 +46,7 @@ public static void main(String[] args) throws IOException { var pattern = Pattern.compile(regex); var datasetCollection = DatasetCollection.load(); + DataSetSource datasetSource = DataSetLoader.DEFAULT; var datasetNames = datasetCollection.getAll().stream().filter(dn -> pattern.matcher(dn).find()).collect(Collectors.toList()); List allConfigs = new ArrayList<>(); @@ -53,7 +55,11 @@ public static void main(String[] args) throws IOException { System.out.println("Executing the following datasets: " + datasetNames); for (var datasetName : datasetNames) { - DataSet ds = DataSetLoader.loadDataSet(datasetName); + String finalDatasetName = datasetName; + DataSet ds = datasetSource.apply(datasetName) + .orElseThrow(() -> new IllegalArgumentException( + "Unknown dataset: " + finalDatasetName)); + // DataSet ds = DataSetLoader.loadDataSet(datasetName); if (datasetName.endsWith(".hdf5")) { datasetName = datasetName.substring(0, datasetName.length() - ".hdf5".length()); @@ -76,6 +82,10 @@ public static void main(String[] args) throws IOException { for (var config : allConfigs) { String datasetName = config.dataset; + DataSet ds = datasetSource.apply(datasetName) + .orElseThrow(() -> new IllegalArgumentException( + "Unknown dataset: " + datasetName)); + // DataSet ds = DataSetLoader.loadDataSet(datasetName); DataSet ds = DataSetLoader.loadDataSet(datasetName); Grid.runAll(ds, config.construction.outDegree, config.construction.efConstruction, diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetLoader.java 
b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetLoader.java index e90a6f275..7b6eb4849 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetLoader.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetLoader.java @@ -16,18 +16,49 @@ package io.github.jbellis.jvector.example.util; +import io.jhdf.api.Dataset; + import java.io.IOException; +import java.util.Optional; +import java.util.function.Function; + +public class DataSetLoader implements DataSetSource { + + private final Function>[] loaders; + + public DataSetLoader(DataSetSource... loaders) { + this.loaders = loaders; + } + + @Override + public Optional apply(String name) { + return Optional.empty(); + } + + public final static DataSetSource FVecsDownloader = new DataSetSource() { + @Override + public Optional apply(String name) { + var mfd = DownloadHelper.maybeDownloadFvecs(name); + try { + var ds = mfd.load(); + return Optional.of(ds); + } catch (IOException e) { + System.err.println("error while trying to load dataset: " + e + ", this error handling " + + "path needs to be updated"); + return Optional.empty(); + } + } + }; + + public final static DataSetSource HDF5Loader = new DataSetSource() { -public class DataSetLoader { - public static DataSet loadDataSet(String fileName) throws IOException { - DataSet ds; - if (fileName.endsWith(".hdf5")) { - DownloadHelper.maybeDownloadHdf5(fileName); - ds = Hdf5Loader.load(fileName); - } else { - var mfd = DownloadHelper.maybeDownloadFvecs(fileName); - ds = mfd.load(); - } - return ds; + @Override + public Optional apply(String name) { + if (name.endsWith(".hdf5")) { + DownloadHelper.maybeDownloadHdf5(name); + return Optional.of(Hdf5Loader.load(name)); + } + return Optional.empty(); } + }; } diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetSource.java 
b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetSource.java new file mode 100644 index 000000000..f08b11c7d --- /dev/null +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetSource.java @@ -0,0 +1,23 @@ +package io.github.jbellis.jvector.example.util; + +import java.util.Optional; +import java.util.function.Function; + +public interface DataSetSource extends Function> { + public DataSetSource DEFAULT = new DataSetLoader(DataSetLoader.HDF5Loader, DataSetLoader.FVecsDownloader); + + public default DataSetSource and(DataSetSource... loaders) { + return new DataSetSource() { + @Override + public Optional apply(String name) { + for (var loader : loaders) { + var ds = loader.apply(name); + if (ds.isPresent()) { + return ds; + } + } + return Optional.empty(); + } + }; + } +} From 6709cc543e1728c229ae4fa60a95f39e69cab5a6 Mon Sep 17 00:00:00 2001 From: Jonathan Shook Date: Wed, 20 Aug 2025 18:37:37 +0000 Subject: [PATCH 04/29] testrig --- jvector-examples/pom.xml | 96 +++++++----- .../jvector/example/testrig/BenchHarness.java | 139 ++++++++++++++++++ testrig | 22 +++ 3 files changed, 218 insertions(+), 39 deletions(-) create mode 100644 jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/BenchHarness.java create mode 100755 testrig diff --git a/jvector-examples/pom.xml b/jvector-examples/pom.xml index bace97046..f43e0031e 100644 --- a/jvector-examples/pom.xml +++ b/jvector-examples/pom.xml @@ -14,8 +14,24 @@ 2.21.10 + + + + + + + + + + + + + + + + org.codehaus.mojo exec-maven-plugin @@ -27,27 +43,32 @@ org.apache.maven.plugins maven-assembly-plugin - 3.6.0 - - - jar-with-dependencies - - - - io.github.jbellis.jvector.example.AutoBenchYAML - - - + 3.7.1 - make-assembly package single + + + + true + io.github.jbellis.jvector.example.testrig.commands.TestRig_CMD + + + + + jar-with-dependencies + + + + + + @@ -72,17 +93,6 @@ - - - org.slf4j - slf4j-api - 2.0.9 - - - ch.qos.logback - 
logback-classic - 1.4.11 - software.amazon.awssdk s3-transfer-manager @@ -115,38 +125,40 @@ - com.kohlschutter.junixsocket - junixsocket-core - 2.8.1 - pom + io.nosqlbench + vectordata + 0.1.6 - com.fasterxml.jackson.core - jackson-databind - 2.17.1 + info.picocli + picocli + 4.7.6 - junit - junit - 4.13.1 - test + io.nosqlbench + nbvectors + 0.1.6 + - org.testng - testng - 7.3.0 - test + com.kohlschutter.junixsocket + junixsocket-core + 2.8.1 + pom org.junit.jupiter junit-jupiter-api - 5.11.4 test jdk11 + + 11 + 11 + @@ -187,6 +199,9 @@ jdk20 + + 20 + io.github.jbellis @@ -273,6 +288,9 @@ true + + 22 + io.github.jbellis diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/BenchHarness.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/BenchHarness.java new file mode 100644 index 000000000..bb28c70ce --- /dev/null +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/BenchHarness.java @@ -0,0 +1,139 @@ +package io.github.jbellis.jvector.example.testrig; + + +import io.github.jbellis.jvector.disk.ReaderSupplierFactory; +import io.github.jbellis.jvector.example.Grid; +import io.github.jbellis.jvector.example.util.CompressorParameters; +import io.github.jbellis.jvector.example.util.DataSet; +import io.github.jbellis.jvector.example.util.Hdf5Loader; +import io.github.jbellis.jvector.graph.GraphIndex; +import io.github.jbellis.jvector.graph.GraphIndexBuilder; +import io.github.jbellis.jvector.graph.OnHeapGraphIndex; +import io.github.jbellis.jvector.graph.RandomAccessVectorValues; +import io.github.jbellis.jvector.graph.disk.OnDiskGraphIndex; +import io.github.jbellis.jvector.graph.disk.OnDiskGraphIndexWriter; +import io.github.jbellis.jvector.graph.disk.OrdinalMapper; +import io.github.jbellis.jvector.graph.disk.feature.Feature; +import io.github.jbellis.jvector.graph.disk.feature.FeatureId; +import io.github.jbellis.jvector.graph.disk.feature.FusedADC; +import 
io.github.jbellis.jvector.graph.disk.feature.InlineVectors; +import io.github.jbellis.jvector.graph.disk.feature.NVQ; +import io.github.jbellis.jvector.graph.similarity.BuildScoreProvider; +import io.github.jbellis.jvector.quantization.CompressedVectors; +import io.github.jbellis.jvector.quantization.NVQuantization; +import io.github.jbellis.jvector.quantization.PQVectors; +import io.github.jbellis.jvector.quantization.ProductQuantization; +import io.github.jbellis.jvector.quantization.VectorCompressor; +import io.github.jbellis.jvector.util.PhysicalCoreExecutor; +import io.nosqlbench.nbdatatools.api.concurrent.ProgressIndicator; +import io.nosqlbench.vectordata.VectorTestData; +import io.nosqlbench.vectordata.discovery.TestDataView; +import io.nosqlbench.vectordata.downloader.DatasetEntry; +import io.nosqlbench.vectordata.spec.datasets.types.BaseVectors; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Arrays; +import java.util.EnumMap; +import java.util.EnumSet; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.*; + +public class BenchHarness implements Runnable { + + private final DatasetEntry datasetEntry; + private final String profile; + private final int concurrency; + private final ExecutorService virtualThreadExecutor; + private final Semaphore semaphore; + + public BenchHarness( + io.nosqlbench.vectordata.downloader.DatasetEntry datasetEntry, + String profile + ) + { + this(datasetEntry, profile, 1); + } + + public BenchHarness( + io.nosqlbench.vectordata.downloader.DatasetEntry datasetEntry, + String profile, + int concurrency + ) + { + this.datasetEntry = datasetEntry; + this.profile = profile; + this.concurrency = concurrency; + this.virtualThreadExecutor = Executors.newVirtualThreadPerTaskExecutor(); + this.semaphore = new 
Semaphore(concurrency); + } + + @Override + public void run() { + TestDataView testDataView = datasetEntry.select().profile(profile); + smokeTestDataLoad(testDataView); + } + + private void smokeTestDataLoad(TestDataView testDataView) { + BaseVectors bv = testDataView.getBaseVectors().orElseThrow(); + + System.out.println("Prebuffering..."); + CompletableFuture prebuffer = bv.prebuffer(); + if (prebuffer instanceof ProgressIndicator indicator) { + indicator.monitorProgress(1000); + } + prebuffer.join(); + System.out.println("Prebuffered"); + + float[] v1 = bv.get(1); + System.out.println(Arrays.toString(v1)); + + float[] vend = bv.get(bv.getCount() - 1); + System.out.println(Arrays.toString(vend)); + + /// Create tasks for processing vectors concurrently + CompletableFuture[] futures = new CompletableFuture[100]; + + for (int i = 0; i < 100; i++) { + final int index = i; + futures[i] = CompletableFuture.runAsync(() -> { + try { + semaphore.acquire(); + try { + /// This will be a stepping through the space of vectors + int idx = (int) ((float)index / 100 * bv.getCount()); + float[] v = bv.get(idx); + System.out.println(Arrays.toString(v)); + } finally { + semaphore.release(); + } + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new RuntimeException(e); + } + }, virtualThreadExecutor); + } + + /// Wait for all tasks to complete + CompletableFuture.allOf(futures).join(); + + /// Shutdown the executor + virtualThreadExecutor.shutdown(); + try { + if (!virtualThreadExecutor.awaitTermination(60, TimeUnit.SECONDS)) { + virtualThreadExecutor.shutdownNow(); + } + } catch (InterruptedException e) { + virtualThreadExecutor.shutdownNow(); + Thread.currentThread().interrupt(); + } + } + + +} diff --git a/testrig b/testrig new file mode 100755 index 000000000..965475242 --- /dev/null +++ b/testrig @@ -0,0 +1,22 @@ +#!/bin/bash +export JAVA_OPTS="--add-modules jdk.incubator.vector" +if [ ! 
-d "jvector-examples/target/classes" -o "$1" = "-r" ] +then + shift + printf "building project before invoking testrig...\n" 1>&2 + mvn -Pjdk22 compile package -pl :jvector-examples -am -DskipTests + status=$? + if (( status != 0 )) + then + printf "failed build, resolve issues before trying to run testrig\n" + exit $status + else + printf "built project successfully, continuing...\n" 1>&2 + echo -e "\a" + sleep 1 + echo -e "\a" + sleep 1 + fi +fi + +java $JAVA_OPTS -Xmx32g -jar ./jvector-examples/target/jvector-examples-*-SNAPSHOT-jar-with-dependencies.jar $* From 0d43398efd7efa3f6094cbef2f6d5ee0f0f11c88 Mon Sep 17 00:00:00 2001 From: Jonathan Shook Date: Wed, 20 Aug 2025 18:37:58 +0000 Subject: [PATCH 05/29] syntax updates --- .../src/main/java/io/github/jbellis/jvector/example/Bench.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java index 2b1ccc330..814794774 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java @@ -86,7 +86,8 @@ public static void main(String[] args) throws IOException { private static void execute(Pattern pattern, List> buildCompression, List> featureSets, List> compressionGrid, List mGrid, List efConstructionGrid, List neighborOverflowGrid, List addHierarchyGrid, List refineFinalGraphGrid, Map> topKGrid, List usePruningGrid) throws IOException { var datasetCollection = DatasetCollection.load(); - var datasetNames = datasetCollection.getAll().stream().filter(dn -> pattern.matcher(dn).find()).collect(Collectors.toList()); + var datasetNames = datasetCollection.getAll().stream().filter(dn -> pattern.matcher(dn).find()).toList(); + System.out.println("Executing the following datasets: " + datasetNames); for (var datasetName : datasetNames) { From 
6840b70b209b9dc6791076929c8da09f0233ad3c Mon Sep 17 00:00:00 2001 From: Jonathan Shook Date: Wed, 20 Aug 2025 18:38:05 +0000 Subject: [PATCH 06/29] use catalog --- .../main/java/io/github/jbellis/jvector/example/Bench.java | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java index 814794774..1d2c14372 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java @@ -20,11 +20,15 @@ import io.github.jbellis.jvector.example.util.CompressorParameters.PQParameters; import io.github.jbellis.jvector.example.util.DataSet; import io.github.jbellis.jvector.example.util.DataSetLoader; +import io.github.jbellis.jvector.example.util.DataSetSource; import io.github.jbellis.jvector.example.yaml.DatasetCollection; import io.github.jbellis.jvector.graph.disk.feature.FeatureId; import io.github.jbellis.jvector.vector.VectorSimilarityFunction; +import io.nosqlbench.vectordata.discovery.TestDataSources; +import io.nosqlbench.vectordata.downloader.Catalog; import java.io.IOException; +import java.nio.file.Path; import java.util.Arrays; import java.util.EnumSet; import java.util.List; @@ -91,7 +95,8 @@ private static void execute(Pattern pattern, List new RuntimeException("Unknown dataset: " + datasetName)); Grid.runAll(ds, mGrid, efConstructionGrid, neighborOverflowGrid, addHierarchyGrid, refineFinalGraphGrid, featureSets, buildCompression, compressionGrid, topKGrid, usePruningGrid); } } From 7464507559d4f7f4111ae079da1916e943864697 Mon Sep 17 00:00:00 2001 From: Jonathan Shook Date: Wed, 20 Aug 2025 18:42:12 +0000 Subject: [PATCH 07/29] git ignore .claude and .junie --- .gitignore | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.gitignore b/.gitignore index 9fc38bae4..f4e7a5bc9 100644 --- 
a/.gitignore +++ b/.gitignore @@ -34,6 +34,14 @@ hdf5/ ### aider .aider* +### claude +.claude/** + +### junie +.junie/** + # JMH generated files dependency-reduced-pom.xml results.csv + + From 00d946db90bc4953e83d6912198c4a52ad4aa176 Mon Sep 17 00:00:00 2001 From: Jonathan Shook Date: Wed, 20 Aug 2025 18:42:30 +0000 Subject: [PATCH 08/29] small fixes --- .../github/jbellis/jvector/example/Bench.java | 8 ++ .../jvector/example/util/DataSetUtils.java | 112 ++++++++++++++++++ .../example/util/FloatVectorsWrapper.java | 53 +++++++++ .../jvector/example/util/SimpleDataSet.java | 104 ++++++++++++++++ 4 files changed, 277 insertions(+) create mode 100644 jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetUtils.java create mode 100644 jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/FloatVectorsWrapper.java create mode 100644 jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/SimpleDataSet.java diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java index 1d2c14372..c1dbbecf2 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java @@ -33,6 +33,7 @@ import java.util.EnumSet; import java.util.List; import java.util.Map; +import java.util.Optional; import java.util.function.Function; import java.util.regex.Pattern; import java.util.stream.Collectors; @@ -89,11 +90,18 @@ public static void main(String[] args) throws IOException { } private static void execute(Pattern pattern, List> buildCompression, List> featureSets, List> compressionGrid, List mGrid, List efConstructionGrid, List neighborOverflowGrid, List addHierarchyGrid, List refineFinalGraphGrid, Map> topKGrid, List usePruningGrid) throws IOException { + + Catalog catalog = new 
TestDataSources().addOptionalCatalogs("~/.config/jvector/catalogs" + + ".yaml").catalog(); var datasetCollection = DatasetCollection.load(); var datasetNames = datasetCollection.getAll().stream().filter(dn -> pattern.matcher(dn).find()).toList(); System.out.println("Executing the following datasets: " + datasetNames); + + DataSetSource datasetSource = DataSetSource.DEFAULT.and(name -> catalog.matchOne(name) + .map(dse -> dse.select().profile(name)).map(TestDataViewWrapper::new)); + for (var datasetName : datasetNames) { DataSet ds = datasetSource.apply(datasetName).orElseThrow(() -> new RuntimeException("Unknown dataset: " + datasetName)); diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetUtils.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetUtils.java new file mode 100644 index 000000000..0a4a62421 --- /dev/null +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetUtils.java @@ -0,0 +1,112 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.github.jbellis.jvector.example.util; + +import io.github.jbellis.jvector.vector.VectorSimilarityFunction; +import io.github.jbellis.jvector.vector.VectorUtil; +import io.github.jbellis.jvector.vector.types.VectorFloat; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.TreeSet; + +public class DataSetUtils { + + /** + * Return a dataset containing the given vectors, scrubbed free from zero vectors and normalized to unit length. + * Note: This only scrubs and normalizes for dot product similarity. + */ + public static DataSet getScrubbedDataSet(String pathStr, + VectorSimilarityFunction vsf, + List> baseVectors, + List> queryVectors, + List> groundTruth) + { + // remove zero vectors and duplicates, noting that this will change the indexes of the ground truth answers + List> scrubbedBaseVectors; + List> scrubbedQueryVectors; + List> gtSet; + scrubbedBaseVectors = new ArrayList<>(baseVectors.size()); + scrubbedQueryVectors = new ArrayList<>(queryVectors.size()); + gtSet = new ArrayList<>(groundTruth.size()); + var uniqueVectors = new TreeSet>((a, b) -> { + assert a.length() == b.length(); + for (int i = 0; i < a.length(); i++) { + if (a.get(i) < b.get(i)) { + return -1; + } + if (a.get(i) > b.get(i)) { + return 1; + } + } + return 0; + }); + Map rawToScrubbed = new HashMap<>(); + { + int j = 0; + for (int i = 0; i < baseVectors.size(); i++) { + VectorFloat v = baseVectors.get(i); + var valid = (vsf == VectorSimilarityFunction.EUCLIDEAN) || Math.abs(normOf(v)) > 1e-5; + if (valid && uniqueVectors.add(v)) { + scrubbedBaseVectors.add(v); + rawToScrubbed.put(i, j++); + } + } + } + // also remove zero query vectors and query vectors that are present in the base set + for (int i = 0; i < queryVectors.size(); i++) { + VectorFloat v = queryVectors.get(i); + var valid = (vsf == VectorSimilarityFunction.EUCLIDEAN) || Math.abs(normOf(v)) > 1e-5; + var dupe = uniqueVectors.contains(v); + if 
(valid && !dupe) { + scrubbedQueryVectors.add(v); + var gt = new ArrayList(); + for (int j : groundTruth.get(i)) { + gt.add(rawToScrubbed.get(j)); + } + gtSet.add(gt); + } + } + + // now that the zero vectors are removed, we can normalize if it looks like they aren't already + if (vsf == VectorSimilarityFunction.DOT_PRODUCT) { + if (Math.abs(normOf(baseVectors.get(0)) - 1.0) > 1e-5) { + normalizeAll(scrubbedBaseVectors); + normalizeAll(scrubbedQueryVectors); + } + } + + assert scrubbedQueryVectors.size() == gtSet.size(); + return new SimpleDataSet(pathStr, vsf, scrubbedBaseVectors, scrubbedQueryVectors, gtSet); + } + + private static void normalizeAll(Iterable> vectors) { + for (VectorFloat v : vectors) { + VectorUtil.l2normalize(v); + } + } + + private static float normOf(VectorFloat baseVector) { + float norm = 0; + for (int i = 0; i < baseVector.length(); i++) { + norm += baseVector.get(i) * baseVector.get(i); + } + return (float) Math.sqrt(norm); + } +} \ No newline at end of file diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/FloatVectorsWrapper.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/FloatVectorsWrapper.java new file mode 100644 index 000000000..d4b49e60e --- /dev/null +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/FloatVectorsWrapper.java @@ -0,0 +1,53 @@ +package io.github.jbellis.jvector.example.util; + +import io.github.jbellis.jvector.graph.RandomAccessVectorValues; +import io.github.jbellis.jvector.vector.ArrayVectorFloat; +import io.github.jbellis.jvector.vector.VectorizationProvider; +import io.github.jbellis.jvector.vector.types.VectorFloat; +import io.github.jbellis.jvector.vector.types.VectorTypeSupport; +import io.nosqlbench.vectordata.spec.datasets.types.FloatVectors; + +import java.util.function.Supplier; + +/// Wrapper that adapts a nosqlbench FloatVectors instance to implement RandomAccessVectorValues +public class FloatVectorsWrapper 
implements RandomAccessVectorValues { + private static final VectorTypeSupport vts = VectorizationProvider.getInstance().getVectorTypeSupport(); + + private final FloatVectors floatVectors; + private final int dimension; + + public FloatVectorsWrapper(FloatVectors floatVectors) { + this.floatVectors = floatVectors; + this.dimension = floatVectors.getVectorDimensions(); + } + + @Override + public int size() { + return floatVectors.getCount(); + } + + @Override + public int dimension() { + return floatVectors.getVectorDimensions(); + } + + @Override + public VectorFloat getVector(int nodeId) { + return vts.createFloatVector(floatVectors.get(nodeId)); + } + + @Override + public boolean isValueShared() { + return true; + } + + @Override + public RandomAccessVectorValues copy() { + return new FloatVectorsWrapper(floatVectors); + } + + @Override + public Supplier threadLocalSupplier() { + return () -> new FloatVectorsWrapper(floatVectors); + } +} \ No newline at end of file diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/SimpleDataSet.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/SimpleDataSet.java new file mode 100644 index 000000000..0d3c752dd --- /dev/null +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/SimpleDataSet.java @@ -0,0 +1,104 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.github.jbellis.jvector.example.util; + +import io.github.jbellis.jvector.graph.ListRandomAccessVectorValues; +import io.github.jbellis.jvector.graph.RandomAccessVectorValues; +import io.github.jbellis.jvector.vector.VectorSimilarityFunction; +import io.github.jbellis.jvector.vector.types.VectorFloat; + +import java.util.List; + +public class SimpleDataSet implements DataSet { + private final String name; + private final VectorSimilarityFunction similarityFunction; + private final List> baseVectors; + private final List> queryVectors; + private final List> groundTruth; + private RandomAccessVectorValues baseRavv; + + public SimpleDataSet(String name, + VectorSimilarityFunction similarityFunction, + List> baseVectors, + List> queryVectors, + List> groundTruth) + { + if (baseVectors.isEmpty()) { + throw new IllegalArgumentException("Base vectors must not be empty"); + } + if (queryVectors.isEmpty()) { + throw new IllegalArgumentException("Query vectors must not be empty"); + } + if (groundTruth.isEmpty()) { + throw new IllegalArgumentException("Ground truth vectors must not be empty"); + } + + if (baseVectors.get(0).length() != queryVectors.get(0).length()) { + throw new IllegalArgumentException("Base and query vectors must have the same dimensionality"); + } + if (queryVectors.size() != groundTruth.size()) { + throw new IllegalArgumentException("Query and ground truth lists must be the same size"); + } + + this.name = name; + this.similarityFunction = similarityFunction; + this.baseVectors = baseVectors; + this.queryVectors = queryVectors; + this.groundTruth = groundTruth; + + System.out.format("%n%s: %d base and %d query vectors created, dimensions %d%n", + name, baseVectors.size(), queryVectors.size(), baseVectors.get(0).length()); + } + + @Override + public String getName() { + return name; + } + + @Override + public VectorSimilarityFunction getSimilarityFunction() { + return similarityFunction; + } + + @Override + public List> getBaseVectors() 
{ + return baseVectors; + } + + @Override + public List> getQueryVectors() { + return queryVectors; + } + + @Override + public List> getGroundTruth() { + return groundTruth; + } + + @Override + public int getDimension() { + return baseVectors.get(0).length(); + } + + @Override + public RandomAccessVectorValues getBaseRavv() { + if (baseRavv == null) { + baseRavv = new ListRandomAccessVectorValues(baseVectors, getDimension()); + } + return baseRavv; + } +} \ No newline at end of file From 1e8ce769f372e9d9c64d15cf80351528f60bfe2d Mon Sep 17 00:00:00 2001 From: Jonathan Shook Date: Wed, 20 Aug 2025 18:42:35 +0000 Subject: [PATCH 09/29] command refinements --- .../example/testrig/commands/Bench_CMD.java | 86 +++++++++++++++++++ .../example/testrig/commands/Run_CMD.java | 86 +++++++++++++++++++ .../example/testrig/commands/TestRig_CMD.java | 24 ++++++ 3 files changed, 196 insertions(+) create mode 100644 jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/Bench_CMD.java create mode 100644 jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/Run_CMD.java create mode 100644 jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/TestRig_CMD.java diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/Bench_CMD.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/Bench_CMD.java new file mode 100644 index 000000000..33bb9b6c5 --- /dev/null +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/Bench_CMD.java @@ -0,0 +1,86 @@ +package io.github.jbellis.jvector.example.testrig.commands; + +import io.github.jbellis.jvector.example.testrig.BenchHarness; +import io.nosqlbench.vectordata.VectorTestData; +import io.nosqlbench.vectordata.discovery.TestDataSources; +import io.nosqlbench.vectordata.downloader.Catalog; +import io.nosqlbench.vectordata.downloader.DatasetEntry; +import 
picocli.CommandLine; + +import java.net.URL; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.Callable; + +@CommandLine.Command(name = "bench", description = "run example benchmarks") +public class Bench_CMD implements Callable { + + public static class ExpanderExample implements Iterable { + @Override + public java.util.Iterator iterator() { + return VectorTestData.catalogs().catalog().datasets().stream().map(DatasetEntry::name) + .toList().iterator(); + } + } + + @CommandLine.Option(names = {"--catalog"}, + description = "A directory, remote url, or other catalog container") + List catalogs = new ArrayList<>(); + + @CommandLine.Option(names = {"--optional-catalog"}, + description = "A configuration file to use", + split = ",", + defaultValue = "~/.config/jvector/catalogs.yaml,~/.config/vectordata/catalogs.yaml") + List optionalCatalogs; + + @CommandLine.Option(names = {"-d", "--dataset"}, + description = "Dataset to use", + completionCandidates = ExpanderExample.class) + private List dsnames; + + @CommandLine.Option(names = {"-p", "--profile"}, + description = "Profile to use", + defaultValue = "default") + private String profile = "default"; + + @CommandLine.Option(names = {"-c", "--concurrency"}, + description = "Number of concurrent threads", + defaultValue = "1") + private int concurrency = 1; + + public static void main(String[] args) { + Bench_CMD command = new Bench_CMD(); + CommandLine commandLine = new CommandLine(command).setCaseInsensitiveEnumValuesAllowed(true) + .setOptionsCaseInsensitive(true); + int exitCode = commandLine.execute(args); + System.exit(exitCode); + } + + @Override + public Integer call() throws Exception { + System.out.println("Test Rig run with datasets: " + dsnames); + for (String dsname : dsnames) { + String[] nameparts = dsname.split(":+", 2); + String _name = null, _profile = this.profile; + switch (nameparts.length) { + case 2: + _profile = nameparts[1]; + case 1: + _name = nameparts[0]; + 
break; + } + + System.out.println("Using testdata source " + _name); + System.out.println("Using profile " + _profile); + + Catalog catalog = new TestDataSources().addOptionalCatalogs(optionalCatalogs).catalog(); + DatasetEntry ds = catalog.findExact(_name).orElseThrow(); + // VectorTestData.catalogs().catalog().findExact(_name).orElseThrow(); + BenchHarness harness = new BenchHarness(ds, _profile, concurrency); + + harness.run(); + + } + return 0; + } +} diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/Run_CMD.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/Run_CMD.java new file mode 100644 index 000000000..93cdaa83e --- /dev/null +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/Run_CMD.java @@ -0,0 +1,86 @@ +package io.github.jbellis.jvector.example.testrig.commands; + +import io.github.jbellis.jvector.example.testrig.BenchHarness; +import io.nosqlbench.vectordata.VectorTestData; +import io.nosqlbench.vectordata.discovery.TestDataSources; +import io.nosqlbench.vectordata.downloader.Catalog; +import io.nosqlbench.vectordata.downloader.DatasetEntry; +import picocli.CommandLine; + +import java.net.URL; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.Callable; + +@CommandLine.Command(name = "run", description = "Run a testrig command") +public class Run_CMD implements Callable { + + public static class ExpanderExample implements Iterable { + @Override + public java.util.Iterator iterator() { + return VectorTestData.catalogs().catalog().datasets().stream().map(DatasetEntry::name) + .toList().iterator(); + } + } + + @CommandLine.Option(names = {"--catalog"}, + description = "A directory, remote url, or other catalog container") + List catalogs = new ArrayList<>(); + + @CommandLine.Option(names = {"--optional-catalog"}, + description = "A configuration file to use", + split = ",", + defaultValue = 
"~/.config/jvector/catalogs.yaml,~/.config/vectordata/catalogs.yaml") + List optionalCatalogs; + + @CommandLine.Option(names = {"-d", "--dataset"}, + description = "Dataset to use", + completionCandidates = ExpanderExample.class) + private List dsnames; + + @CommandLine.Option(names = {"-p", "--profile"}, + description = "Profile to use", + defaultValue = "default") + private String profile = "default"; + + @CommandLine.Option(names = {"-c", "--concurrency"}, + description = "Number of concurrent threads", + defaultValue = "1") + private int concurrency = 1; + + public static void main(String[] args) { + Run_CMD command = new Run_CMD(); + CommandLine commandLine = new CommandLine(command).setCaseInsensitiveEnumValuesAllowed(true) + .setOptionsCaseInsensitive(true); + int exitCode = commandLine.execute(args); + System.exit(exitCode); + } + + @Override + public Integer call() throws Exception { + System.out.println("Test Rig run with datasets: " + dsnames); + for (String dsname : dsnames) { + String[] nameparts = dsname.split(":+", 2); + String _name = null, _profile = this.profile; + switch (nameparts.length) { + case 2: + _profile = nameparts[1]; + case 1: + _name = nameparts[0]; + break; + } + + System.out.println("Using testdata source " + _name); + System.out.println("Using profile " + _profile); + + Catalog catalog = new TestDataSources().addOptionalCatalogs(optionalCatalogs).catalog(); + DatasetEntry ds = catalog.findExact(_name).orElseThrow(); + // VectorTestData.catalogs().catalog().findExact(_name).orElseThrow(); + BenchHarness harness = new BenchHarness(ds, _profile, concurrency); + + harness.run(); + + } + return 0; + } +} diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/TestRig_CMD.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/TestRig_CMD.java new file mode 100644 index 000000000..1c4f823f9 --- /dev/null +++ 
b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/TestRig_CMD.java @@ -0,0 +1,24 @@ +package io.github.jbellis.jvector.example.testrig.commands; + +import io.nosqlbench.command.datasets.CMD_datasets; +import picocli.AutoComplete; +import picocli.CommandLine; + +@CommandLine.Command(name = "testrig", + header = "JVector Test Rig", + mixinStandardHelpOptions = true, + description = "JVector Test Rig", + subcommands = {CommandLine.HelpCommand.class, AutoComplete.GenerateCompletion.class, + Run_CMD.class, CMD_datasets.class}) +public class TestRig_CMD { + + public static void main(String[] args) { + @SuppressWarnings("InstantiationOfUtilityClass") TestRig_CMD command = new TestRig_CMD(); + CommandLine commandLine = + new CommandLine(command).setCaseInsensitiveEnumValuesAllowed(true) + .setOptionsCaseInsensitive(true); + int exitCode = commandLine.execute(args); + System.exit(exitCode); + } + +} From 384f4f2eea45b877bdaecdd2c5b1007df4cfffc0 Mon Sep 17 00:00:00 2001 From: Jonathan Shook Date: Wed, 20 Aug 2025 18:47:04 +0000 Subject: [PATCH 10/29] add a section for streamable datasets --- jvector-examples/yaml-configs/datasets.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/jvector-examples/yaml-configs/datasets.yml b/jvector-examples/yaml-configs/datasets.yml index 3fd67217e..8146196cf 100644 --- a/jvector-examples/yaml-configs/datasets.yml +++ b/jvector-examples/yaml-configs/datasets.yml @@ -1,3 +1,7 @@ +streamable: + - cohere_msmarco + + neighborhood-watch-100k: - ada002-100k - cohere-english-v3-100k From 21c95d4f224390d327855da2332a30667ac5634a Mon Sep 17 00:00:00 2001 From: Jonathan Shook Date: Thu, 21 Aug 2025 17:52:25 +0000 Subject: [PATCH 11/29] pq ranging bugfix and refactoring --- .../jvector/quantization/PQLayout.java | 68 +++++++++++++++++++ .../quantization/TestProductQuantization.java | 12 ++-- 2 files changed, 74 insertions(+), 6 deletions(-) create mode 100644 
jvector-base/src/main/java/io/github/jbellis/jvector/quantization/PQLayout.java diff --git a/jvector-base/src/main/java/io/github/jbellis/jvector/quantization/PQLayout.java b/jvector-base/src/main/java/io/github/jbellis/jvector/quantization/PQLayout.java new file mode 100644 index 000000000..95350218c --- /dev/null +++ b/jvector-base/src/main/java/io/github/jbellis/jvector/quantization/PQLayout.java @@ -0,0 +1,68 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.github.jbellis.jvector.quantization; + +/** + * Chunk Dimensions and Layout + * This is emulative of modern Java records, but keeps to J11 standards. 
+ * This class consolidates the layout calculations for PQ data into one place + */ +public class PQLayout { + + /** total number of vectors **/ + public final int vectorCount; + /** total number of chunks, including any partial **/ + public final int totalChunks; + /** total number of fully-filled chunks **/ + public final int fullSizeChunks; + /** number of vectors per fullSize chunk **/ + public final int fullChunkVectors; + /** number of vectors in last partially filled chunk, if any **/ + public final int lastChunkVectors; + /** compressed dimension of vectors **/ + public final int compressedDimension; + /** number of bytes in each fully-filled chunk **/ + public final int fullChunkBytes; + /** number of bytes in the last partially-filled chunk, if any **/ + public final int lastChunkBytes; + + public PQLayout(int vectorCount, int compressedDimension) { + if (vectorCount < 0) { + throw new IllegalArgumentException("Invalid vector count " + vectorCount); + } + this.vectorCount = vectorCount; + + if (compressedDimension < 0) { + throw new IllegalArgumentException("Invalid compressed dimension " + compressedDimension); + } + this.compressedDimension = compressedDimension; + + long totalSize = (long) vectorCount * compressedDimension; + + this.fullChunkVectors = totalSize <= PQVectors.MAX_CHUNK_SIZE ? vectorCount : PQVectors.MAX_CHUNK_SIZE / compressedDimension; + if (fullChunkVectors == 0) { + throw new IllegalArgumentException("Compressed dimension " + compressedDimension + " too large for chunking"); + } + this.lastChunkVectors = vectorCount % this.fullChunkVectors; + + this.fullChunkBytes = Math.multiplyExact(compressedDimension, this.fullChunkVectors); + this.lastChunkBytes = Math.multiplyExact(compressedDimension, lastChunkVectors); + + this.fullSizeChunks = vectorCount / fullChunkVectors; + this.totalChunks = fullSizeChunks + ((vectorCount % fullChunkVectors == 0) ? 
0 : 1); + } +} diff --git a/jvector-tests/src/test/java/io/github/jbellis/jvector/quantization/TestProductQuantization.java b/jvector-tests/src/test/java/io/github/jbellis/jvector/quantization/TestProductQuantization.java index db35b52d1..ab4087f8f 100644 --- a/jvector-tests/src/test/java/io/github/jbellis/jvector/quantization/TestProductQuantization.java +++ b/jvector-tests/src/test/java/io/github/jbellis/jvector/quantization/TestProductQuantization.java @@ -352,7 +352,7 @@ public void testPQLayoutEdgeCases() { int[][] testCases = { // Minimal cases {1, 1}, {1, 2}, - + // Power-of-2 boundaries for compressedDimension (layoutBytesPerVector changes) {10, 1}, {10, 2}, {10, 3}, {10, 4}, {10, 5}, {10, 7}, {10, 8}, {10, 9}, @@ -360,20 +360,20 @@ public void testPQLayoutEdgeCases() { {10, 31}, {10, 32}, {10, 33}, {10, 63}, {10, 64}, {10, 65}, {10, 127}, {10, 128}, {10, 129}, - + // Cases where addressableVectorsPerChunk becomes interesting {1073741823, 1}, // layoutBytesPerVector=2, addressableVectorsPerChunk=1073741823 - {1073741823, 2}, // layoutBytesPerVector=4, addressableVectorsPerChunk=536870911 + {1073741823, 2}, // layoutBytesPerVector=4, addressableVectorsPerChunk=536870911 {1073741824, 2}, // vectorCount > addressableVectorsPerChunk, creates chunks - + // Large dimension cases (small addressableVectorsPerChunk) {1000, 1024}, // layoutBytesPerVector=2048, addressableVectorsPerChunk=1048575 {2000000, 1024}, // vectorCount > addressableVectorsPerChunk - + // Integer overflow boundary cases {536870911, 4}, // layoutBytesPerVector=8, exactly fits in one chunk {536870912, 4}, // one more than above, creates multiple chunks - + // Edge case where lastChunkVectors becomes non-zero {100, 1073741824} // layoutBytesPerVector huge, addressableVectorsPerChunk=1, creates 100 chunks }; From c2384ad838f9dc64e15ff4c1d8242aaa71fc272b Mon Sep 17 00:00:00 2001 From: Jonathan Shook Date: Thu, 21 Aug 2025 17:52:50 +0000 Subject: [PATCH 12/29] demo fix --- 
.../github/jbellis/jvector/example/Bench.java | 41 ++++++++++++------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java index c1dbbecf2..f7dbb886a 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java @@ -19,21 +19,19 @@ import io.github.jbellis.jvector.example.util.CompressorParameters; import io.github.jbellis.jvector.example.util.CompressorParameters.PQParameters; import io.github.jbellis.jvector.example.util.DataSet; -import io.github.jbellis.jvector.example.util.DataSetLoader; import io.github.jbellis.jvector.example.util.DataSetSource; import io.github.jbellis.jvector.example.yaml.DatasetCollection; import io.github.jbellis.jvector.graph.disk.feature.FeatureId; import io.github.jbellis.jvector.vector.VectorSimilarityFunction; import io.nosqlbench.vectordata.discovery.TestDataSources; +import io.nosqlbench.vectordata.discovery.TestDataView; import io.nosqlbench.vectordata.downloader.Catalog; +import io.nosqlbench.vectordata.downloader.DatasetEntry; +import io.nosqlbench.vectordata.spec.datasets.types.DatasetView; +import org.jetbrains.annotations.NotNull; import java.io.IOException; -import java.nio.file.Path; -import java.util.Arrays; -import java.util.EnumSet; -import java.util.List; -import java.util.Map; -import java.util.Optional; +import java.util.*; import java.util.function.Function; import java.util.regex.Pattern; import java.util.stream.Collectors; @@ -91,21 +89,36 @@ public static void main(String[] args) throws IOException { private static void execute(Pattern pattern, List> buildCompression, List> featureSets, List> compressionGrid, List mGrid, List efConstructionGrid, List neighborOverflowGrid, List addHierarchyGrid, List refineFinalGraphGrid, Map> topKGrid, List 
usePruningGrid) throws IOException { - Catalog catalog = new TestDataSources().addOptionalCatalogs("~/.config/jvector/catalogs" - + ".yaml").catalog(); + TestDataSources testDataSources = new TestDataSources().configure().addOptionalCatalogs("~/.config/jvector/catalogs.yaml"); + Catalog testDataCatalog = testDataSources.catalog(); + DataSetSource dsSource = DataSetSource.DEFAULT.and(loadStreamingDataSource(testDataCatalog)); + var datasetCollection = DatasetCollection.load(); var datasetNames = datasetCollection.getAll().stream().filter(dn -> pattern.matcher(dn).find()).toList(); System.out.println("Executing the following datasets: " + datasetNames); - - DataSetSource datasetSource = DataSetSource.DEFAULT.and(name -> catalog.matchOne(name) - .map(dse -> dse.select().profile(name)).map(TestDataViewWrapper::new)); - for (var datasetName : datasetNames) { DataSet ds = - datasetSource.apply(datasetName).orElseThrow(() -> new RuntimeException("Unknown dataset: " + datasetName)); + dsSource.apply(datasetName).orElseThrow(() -> new RuntimeException("Unknown dataset: " + datasetName)); Grid.runAll(ds, mGrid, efConstructionGrid, neighborOverflowGrid, addHierarchyGrid, refineFinalGraphGrid, featureSets, buildCompression, compressionGrid, topKGrid, usePruningGrid); } } + + @NotNull + private static DataSetSource loadStreamingDataSource(Catalog catalog) { + return name -> { + Optional dsentryOption = catalog.matchOne(name); + if (dsentryOption.isEmpty()) { return Optional.empty(); } + DatasetEntry dsentry = dsentryOption.orElseThrow(() -> new RuntimeException("Unknown dataset: " + name)); + TestDataView tdv = dsentry.select().profile(name); + tdv.getBaseVectors().orElseThrow().prebuffer(); +// tdv.getQueryVectors().orElseThrow().prebuffer(); +// tdv.getNeighborIndices().orElseThrow().prebuffer(); +// tdv.getNeighborDistances().map(DatasetView::prebuffer); + + TestDataViewWrapper tdw = new TestDataViewWrapper(tdv); + return Optional.of(tdw); + }; + } } From 
ed4416007333f9cee14052ba3b7f574a8628c4a2 Mon Sep 17 00:00:00 2001 From: Jonathan Shook Date: Mon, 25 Aug 2025 16:33:38 +0000 Subject: [PATCH 13/29] add progress indicator --- .../java/io/github/jbellis/jvector/example/Bench.java | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java index f7dbb886a..5556a7a87 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java @@ -23,6 +23,7 @@ import io.github.jbellis.jvector.example.yaml.DatasetCollection; import io.github.jbellis.jvector.graph.disk.feature.FeatureId; import io.github.jbellis.jvector.vector.VectorSimilarityFunction; +import io.nosqlbench.nbdatatools.api.concurrent.ProgressIndicator; import io.nosqlbench.vectordata.discovery.TestDataSources; import io.nosqlbench.vectordata.discovery.TestDataView; import io.nosqlbench.vectordata.downloader.Catalog; @@ -32,6 +33,7 @@ import java.io.IOException; import java.util.*; +import java.util.concurrent.CompletableFuture; import java.util.function.Function; import java.util.regex.Pattern; import java.util.stream.Collectors; @@ -112,12 +114,17 @@ private static DataSetSource loadStreamingDataSource(Catalog catalog) { if (dsentryOption.isEmpty()) { return Optional.empty(); } DatasetEntry dsentry = dsentryOption.orElseThrow(() -> new RuntimeException("Unknown dataset: " + name)); TestDataView tdv = dsentry.select().profile(name); - tdv.getBaseVectors().orElseThrow().prebuffer(); + System.out.println("prebuffering dataset (assumed performance oriented testing)"); + CompletableFuture statusFuture = tdv.getBaseVectors().orElseThrow().prebuffer(); + if (statusFuture instanceof ProgressIndicator pi) { + pi.monitorProgress(1000); + } // tdv.getQueryVectors().orElseThrow().prebuffer(); // 
tdv.getNeighborIndices().orElseThrow().prebuffer(); // tdv.getNeighborDistances().map(DatasetView::prebuffer); TestDataViewWrapper tdw = new TestDataViewWrapper(tdv); + System.out.println("Loaded " + tdw.getName() + " from streaming source."); return Optional.of(tdw); }; } From d1e192319bb168b2592d80c9053313072f6f130c Mon Sep 17 00:00:00 2001 From: Jonathan Shook Date: Thu, 28 Aug 2025 02:00:49 +0000 Subject: [PATCH 14/29] update vectordata module version --- jvector-examples/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jvector-examples/pom.xml b/jvector-examples/pom.xml index f43e0031e..4bd9d2006 100644 --- a/jvector-examples/pom.xml +++ b/jvector-examples/pom.xml @@ -127,7 +127,7 @@ io.nosqlbench vectordata - 0.1.6 + 0.1.7 info.picocli From 519379d4e6440c5116e9c7f67eb65ae0a88b7c43 Mon Sep 17 00:00:00 2001 From: Jonathan Shook Date: Thu, 28 Aug 2025 19:29:41 +0000 Subject: [PATCH 15/29] enable diagnostics to fix GHA build --- .github/workflows/unit-tests.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/unit-tests.yaml b/.github/workflows/unit-tests.yaml index 498afa031..c8a1fc452 100644 --- a/.github/workflows/unit-tests.yaml +++ b/.github/workflows/unit-tests.yaml @@ -59,13 +59,13 @@ jobs: - name: Test Panama Support (JDK ${{ matrix.jdk }}) if: matrix.jdk == '20' run: >- - mvn -B -Pjdk20 -pl jvector-tests test -am test + mvn -X -B -Pjdk20 -pl jvector-tests test -am test -DTest_RequireSpecificVectorizationProvider=PanamaVectorizationProvider - name: Verify native-access vector support (JDK ${{ matrix.jdk }}) if: matrix.jdk == '24' run: >- - mvn -B -Punix-amd64-profile -pl jvector-tests -am test + mvn -X -B -Punix-amd64-profile -pl jvector-tests -am test -DTest_RequireSpecificVectorizationProvider=NativeVectorizationProvider -Dsurefire.failIfNoSpecifiedTests=false -Dtest=TestVectorizationProvider @@ -73,7 +73,7 @@ jobs: - name: Compile, run tests and package (JDK ${{ matrix.jdk }}) if: 
matrix.jdk == '24' run: >- - mvn -B -Punix-amd64-profile -pl jvector-tests -am test + mvn -X -B -Punix-amd64-profile -pl jvector-tests -am test -DTest_RequireSpecificVectorizationProvider=NativeVectorizationProvider - name: Test Summary for (ISA:${{ matrix.isa}},JDK${{ matrix.jdk }}) From a4cf3fa3455cfa79e61aefecfd8a8730be09fd7d Mon Sep 17 00:00:00 2001 From: Jonathan Shook Date: Tue, 2 Sep 2025 18:40:49 +0000 Subject: [PATCH 16/29] add license to testrig --- testrig | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/testrig b/testrig index 965475242..61a4b524d 100755 --- a/testrig +++ b/testrig @@ -1,4 +1,20 @@ #!/bin/bash + +# Copyright DataStax, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + export JAVA_OPTS="--add-modules jdk.incubator.vector" if [ ! 
-d "jvector-examples/target/classes" -o "$1" = "-r" ] then From 5a632bec71b39880b9612e1e0212005b2bf82dc1 Mon Sep 17 00:00:00 2001 From: Jonathan Shook Date: Thu, 4 Sep 2025 20:34:11 +0000 Subject: [PATCH 17/29] make examples J11 compliant, use J11 upstream bytecode --- .../io/github/jbellis/jvector/example/Bench.java | 4 ++-- .../jbellis/jvector/example/TestDataViewWrapper.java | 12 ++++++------ .../jvector/example/testrig/BenchHarness.java | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java index 5556a7a87..3e40b7aad 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java @@ -116,8 +116,8 @@ private static DataSetSource loadStreamingDataSource(Catalog catalog) { TestDataView tdv = dsentry.select().profile(name); System.out.println("prebuffering dataset (assumed performance oriented testing)"); CompletableFuture statusFuture = tdv.getBaseVectors().orElseThrow().prebuffer(); - if (statusFuture instanceof ProgressIndicator pi) { - pi.monitorProgress(1000); + if (statusFuture instanceof ProgressIndicator) { + ((ProgressIndicator)statusFuture).monitorProgress(1000); } // tdv.getQueryVectors().orElseThrow().prebuffer(); // tdv.getNeighborIndices().orElseThrow().prebuffer(); diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/TestDataViewWrapper.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/TestDataViewWrapper.java index 213648443..de0f66fd1 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/TestDataViewWrapper.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/TestDataViewWrapper.java @@ -31,12 +31,12 @@ public String getName() { @Override public VectorSimilarityFunction 
getSimilarityFunction() { var df = view.getDistanceFunction(); - return switch (df) { - case EUCLIDEAN -> VectorSimilarityFunction.EUCLIDEAN; - case COSINE -> VectorSimilarityFunction.COSINE; - case DOT_PRODUCT -> VectorSimilarityFunction.DOT_PRODUCT; - default -> throw new IllegalArgumentException("Unknown distance function " + df); - }; + switch (df) { + case EUCLIDEAN: return VectorSimilarityFunction.EUCLIDEAN; + case COSINE: return VectorSimilarityFunction.COSINE; + case DOT_PRODUCT: return VectorSimilarityFunction.DOT_PRODUCT; + default: throw new IllegalArgumentException("Unknown distance function " + df); + } } @Override diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/BenchHarness.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/BenchHarness.java index bb28c70ce..57d0259e2 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/BenchHarness.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/BenchHarness.java @@ -85,8 +85,8 @@ private void smokeTestDataLoad(TestDataView testDataView) { System.out.println("Prebuffering..."); CompletableFuture prebuffer = bv.prebuffer(); - if (prebuffer instanceof ProgressIndicator indicator) { - indicator.monitorProgress(1000); + if (prebuffer instanceof ProgressIndicator) { + ((ProgressIndicator)prebuffer).monitorProgress(1000); } prebuffer.join(); System.out.println("Prebuffered"); From 45eb795e7d2ed817b85438eccf2257ced90eaacc Mon Sep 17 00:00:00 2001 From: Jonathan Shook Date: Thu, 4 Sep 2025 20:56:14 +0000 Subject: [PATCH 18/29] remove vestigial class from partial ranging fix --- .../jvector/quantization/PQLayout.java | 68 ------------------- 1 file changed, 68 deletions(-) delete mode 100644 jvector-base/src/main/java/io/github/jbellis/jvector/quantization/PQLayout.java diff --git a/jvector-base/src/main/java/io/github/jbellis/jvector/quantization/PQLayout.java 
b/jvector-base/src/main/java/io/github/jbellis/jvector/quantization/PQLayout.java deleted file mode 100644 index 95350218c..000000000 --- a/jvector-base/src/main/java/io/github/jbellis/jvector/quantization/PQLayout.java +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright DataStax, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package io.github.jbellis.jvector.quantization; - -/** - * Chunk Dimensions and Layout - * This is emulative of modern Java records, but keeps to J11 standards. 
- * This class consolidates the layout calculations for PQ data into one place - */ -public class PQLayout { - - /** total number of vectors **/ - public final int vectorCount; - /** total number of chunks, including any partial **/ - public final int totalChunks; - /** total number of fully-filled chunks **/ - public final int fullSizeChunks; - /** number of vectore per fullSize chunk **/ - public final int fullChunkVectors; - /** number of vectors in last partially filled chunk, if any **/ - public final int lastChunkVectors; - /** compressed dimension of vectors **/ - public final int compressedDimension; - /** number of bytes in each fully-filled chunk **/ - public final int fullChunkBytes; - /** number of bytes in the last partially-filled chunk, if any **/ - public final int lastChunkBytes; - - public PQLayout(int vectorCount, int compressedDimension) { - if (vectorCount < 0) { - throw new IllegalArgumentException("Invalid vector count " + vectorCount); - } - this.vectorCount = vectorCount; - - if (compressedDimension < 0) { - throw new IllegalArgumentException("Invalid compressed dimension " + compressedDimension); - } - this.compressedDimension = compressedDimension; - - long totalSize = (long) vectorCount * compressedDimension; - - this.fullChunkVectors = totalSize <= PQVectors.MAX_CHUNK_SIZE ? vectorCount : PQVectors.MAX_CHUNK_SIZE / compressedDimension; - if (fullChunkVectors == 0) { - throw new IllegalArgumentException("Compressed dimension " + compressedDimension + " too large for chunking"); - } - this.lastChunkVectors = vectorCount % this.fullChunkVectors; - - this.fullChunkBytes = Math.multiplyExact(compressedDimension, this.fullChunkVectors); - this.lastChunkBytes = Math.multiplyExact(compressedDimension, lastChunkVectors); - - this.fullSizeChunks = vectorCount / fullChunkVectors; - this.totalChunks = fullSizeChunks + ((vectorCount % fullChunkVectors == 0) ? 
0 : 1); - } -} From 17c65bec09ecb90e4b57a0f6a291099d78443953 Mon Sep 17 00:00:00 2001 From: Jonathan Shook Date: Thu, 4 Sep 2025 22:24:07 +0000 Subject: [PATCH 19/29] update to J11 upstream dep --- jvector-examples/pom.xml | 16 ++++++-------- .../github/jbellis/jvector/example/Bench.java | 2 +- .../jbellis/jvector/example/BenchYAML.java | 22 ++++++++++++++++--- .../jvector/example/testrig/BenchHarness.java | 2 +- .../example/testrig/commands/Bench_CMD.java | 3 ++- .../example/testrig/commands/Run_CMD.java | 3 ++- 6 files changed, 32 insertions(+), 16 deletions(-) diff --git a/jvector-examples/pom.xml b/jvector-examples/pom.xml index 4bd9d2006..9a4091276 100644 --- a/jvector-examples/pom.xml +++ b/jvector-examples/pom.xml @@ -13,6 +13,7 @@ JVector Examples 2.21.10 + 0.1.8 @@ -127,18 +128,18 @@ io.nosqlbench vectordata - 0.1.7 + ${vectordata.version} + + + io.nosqlbench + nbvectors + ${vectordata.version} info.picocli picocli 4.7.6 - - io.nosqlbench - nbvectors - 0.1.6 - com.kohlschutter.junixsocket @@ -199,9 +200,6 @@ jdk20 - - 20 - io.github.jbellis diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java index 3e40b7aad..67fdba468 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java @@ -96,7 +96,7 @@ private static void execute(Pattern pattern, List pattern.matcher(dn).find()).toList(); + var datasetNames = datasetCollection.getAll().stream().filter(dn -> pattern.matcher(dn).find()).collect(Collectors.toList()); System.out.println("Executing the following datasets: " + datasetNames); diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java index cbe9e5a34..6944e14bb 100644 --- 
a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java @@ -79,7 +79,16 @@ public static void main(String[] args) throws IOException { } } - for (var config : allConfigs) { + // TODO: Reconcile these flows +// for (var datasetName : datasetNames) { +// String finalDatasetName = datasetName; +// DataSet ds = datasetSource.apply(datasetName) +// .orElseThrow(() -> new IllegalArgumentException( +// "Unknown dataset: " + finalDatasetName)); +// // DataSet ds = DataSetLoader.loadDataSet(datasetName); + + + for (var config : allConfigs) { String datasetName = config.dataset; DataSet ds = datasetSource.apply(datasetName) @@ -88,10 +97,17 @@ public static void main(String[] args) throws IOException { // DataSet ds = DataSetLoader.loadDataSet(datasetName); DataSet ds = DataSetLoader.loadDataSet(datasetName); - Grid.runAll(ds, config.construction.outDegree, config.construction.efConstruction, + // TODO: Reconcile these flows + +// if (datasetName.endsWith(".hdf5")) { +// datasetName = datasetName.substring(0, datasetName.length() - ".hdf5".length()); +// } +// MultiConfig config = MultiConfig.getDefaultConfig(datasetName); + + Grid.runAll(ds, config.construction.outDegree, config.construction.efConstruction, config.construction.neighborOverflow, config.construction.addHierarchy, config.construction.refineFinalGraph, config.construction.getFeatureSets(), config.construction.getCompressorParameters(), - config.search.getCompressorParameters(), config.search.topKOverquery, config.search.useSearchPruning, config.search.benchmarks); + config.search.getCompressorParameters(), config.search.topKOverquery, config.search.useSearchPruning); } } } diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/BenchHarness.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/BenchHarness.java index 57d0259e2..2171ef7b0 100644 
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/BenchHarness.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/BenchHarness.java @@ -70,7 +70,7 @@ public BenchHarness( this.datasetEntry = datasetEntry; this.profile = profile; this.concurrency = concurrency; - this.virtualThreadExecutor = Executors.newVirtualThreadPerTaskExecutor(); + this.virtualThreadExecutor = Executors.newCachedThreadPool(); this.semaphore = new Semaphore(concurrency); } diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/Bench_CMD.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/Bench_CMD.java index 33bb9b6c5..44c70be5d 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/Bench_CMD.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/Bench_CMD.java @@ -11,6 +11,7 @@ import java.util.ArrayList; import java.util.List; import java.util.concurrent.Callable; +import java.util.stream.Collectors; @CommandLine.Command(name = "bench", description = "run example benchmarks") public class Bench_CMD implements Callable { @@ -19,7 +20,7 @@ public static class ExpanderExample implements Iterable { @Override public java.util.Iterator iterator() { return VectorTestData.catalogs().catalog().datasets().stream().map(DatasetEntry::name) - .toList().iterator(); + .collect(Collectors.toList()).iterator(); } } diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/Run_CMD.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/Run_CMD.java index 93cdaa83e..b0cebb69d 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/Run_CMD.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/Run_CMD.java @@ -11,6 +11,7 @@ import java.util.ArrayList; 
import java.util.List; import java.util.concurrent.Callable; +import java.util.stream.Collectors; @CommandLine.Command(name = "run", description = "Run a testrig command") public class Run_CMD implements Callable { @@ -19,7 +20,7 @@ public static class ExpanderExample implements Iterable { @Override public java.util.Iterator iterator() { return VectorTestData.catalogs().catalog().datasets().stream().map(DatasetEntry::name) - .toList().iterator(); + .collect(Collectors.toList()).iterator(); } } From a8c2bfed9187aa2660b84c7899fe41c3fe7cc01e Mon Sep 17 00:00:00 2001 From: Jonathan Shook Date: Thu, 4 Sep 2025 22:33:16 +0000 Subject: [PATCH 20/29] exclude local files and add licenses --- .gitignore | 2 ++ .../jvector/example/TestDataViewWrapper.java | 16 ++++++++++++++++ .../jvector/example/testrig/BenchHarness.java | 16 ++++++++++++++++ .../example/testrig/commands/Bench_CMD.java | 16 ++++++++++++++++ .../example/testrig/commands/Run_CMD.java | 16 ++++++++++++++++ .../example/testrig/commands/TestRig_CMD.java | 16 ++++++++++++++++ .../jvector/example/util/DataSetSource.java | 16 ++++++++++++++++ .../example/util/FloatVectorsWrapper.java | 16 ++++++++++++++++ rat-excludes.txt | 2 +- 9 files changed, 115 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index f4e7a5bc9..b651cd511 100644 --- a/.gitignore +++ b/.gitignore @@ -44,4 +44,6 @@ hdf5/ dependency-reduced-pom.xml results.csv +# Local testing files +local/** diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/TestDataViewWrapper.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/TestDataViewWrapper.java index de0f66fd1..8ee51aca1 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/TestDataViewWrapper.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/TestDataViewWrapper.java @@ -1,3 +1,19 @@ +/* + * Copyright DataStax, Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package io.github.jbellis.jvector.example; import io.github.jbellis.jvector.example.util.DataSet; diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/BenchHarness.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/BenchHarness.java index 2171ef7b0..21cdf571d 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/BenchHarness.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/BenchHarness.java @@ -1,3 +1,19 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + package io.github.jbellis.jvector.example.testrig; diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/Bench_CMD.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/Bench_CMD.java index 44c70be5d..64f8bef08 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/Bench_CMD.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/Bench_CMD.java @@ -1,3 +1,19 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package io.github.jbellis.jvector.example.testrig.commands; import io.github.jbellis.jvector.example.testrig.BenchHarness; diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/Run_CMD.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/Run_CMD.java index b0cebb69d..7597da8f6 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/Run_CMD.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/Run_CMD.java @@ -1,3 +1,19 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package io.github.jbellis.jvector.example.testrig.commands; import io.github.jbellis.jvector.example.testrig.BenchHarness; diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/TestRig_CMD.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/TestRig_CMD.java index 1c4f823f9..29e440cf5 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/TestRig_CMD.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/TestRig_CMD.java @@ -1,3 +1,19 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + package io.github.jbellis.jvector.example.testrig.commands; import io.nosqlbench.command.datasets.CMD_datasets; diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetSource.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetSource.java index f08b11c7d..9743e66ad 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetSource.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetSource.java @@ -1,3 +1,19 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package io.github.jbellis.jvector.example.util; import java.util.Optional; diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/FloatVectorsWrapper.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/FloatVectorsWrapper.java index d4b49e60e..3b0447d44 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/FloatVectorsWrapper.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/FloatVectorsWrapper.java @@ -1,3 +1,19 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package io.github.jbellis.jvector.example.util; import io.github.jbellis.jvector.graph.RandomAccessVectorValues; diff --git a/rat-excludes.txt b/rat-excludes.txt index e9dd9fb37..3dd5164ca 100644 --- a/rat-excludes.txt +++ b/rat-excludes.txt @@ -25,4 +25,4 @@ results.csv scripts/test_node_setup.sh scripts/jmh_results_formatter.py yaml-configs/*.yml -src/main/resources/logback.xml \ No newline at end of file +local/* \ No newline at end of file From 6aafe060d96a58929fd9e0c7da6b9a1f260baff4 Mon Sep 17 00:00:00 2001 From: Jonathan Shook Date: Thu, 9 Oct 2025 11:20:53 -0500 Subject: [PATCH 21/29] realigning to main --- .../bench/PQDistanceCalculationBenchmark.java | 8 ++-- jvector-examples/pom.xml | 25 ++-------- .../jvector/example/AutoBenchYAML.java | 47 +++++++++++++++---- .../jbellis/jvector/example/BenchYAML.java | 46 ++++++++---------- .../github/jbellis/jvector/example/Grid.java | 7 +-- .../jvector/example/testrig/BenchHarness.java | 36 -------------- .../example/testrig/commands/Bench_CMD.java | 3 +- .../example/util/BenchmarkSummarizerTest.java | 2 +- pom.xml | 7 +++ 9 files changed, 78 insertions(+), 103 deletions(-) diff --git a/benchmarks-jmh/src/main/java/io/github/jbellis/jvector/bench/PQDistanceCalculationBenchmark.java b/benchmarks-jmh/src/main/java/io/github/jbellis/jvector/bench/PQDistanceCalculationBenchmark.java index 59342e41a..f0e07c623 100644 --- a/benchmarks-jmh/src/main/java/io/github/jbellis/jvector/bench/PQDistanceCalculationBenchmark.java +++ 
b/benchmarks-jmh/src/main/java/io/github/jbellis/jvector/bench/PQDistanceCalculationBenchmark.java @@ -23,15 +23,13 @@ import io.github.jbellis.jvector.quantization.PQVectors; import io.github.jbellis.jvector.quantization.ProductQuantization; import io.github.jbellis.jvector.vector.VectorSimilarityFunction; -import io.github.jbellis.jvector.vector.VectorUtil; import io.github.jbellis.jvector.vector.VectorizationProvider; -import io.github.jbellis.jvector.vector.types.ByteSequence; import io.github.jbellis.jvector.vector.types.VectorFloat; import io.github.jbellis.jvector.vector.types.VectorTypeSupport; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import org.openjdk.jmh.annotations.*; import org.openjdk.jmh.infra.Blackhole; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import java.io.IOException; import java.util.ArrayList; @@ -49,7 +47,7 @@ @Measurement(iterations = 3) @Threads(1) public class PQDistanceCalculationBenchmark { - private static final Logger log = LoggerFactory.getLogger(PQDistanceCalculationBenchmark.class); + private static final Logger log = LogManager.getLogger(PQDistanceCalculationBenchmark.class); private static final VectorTypeSupport VECTOR_TYPE_SUPPORT = VectorizationProvider.getInstance().getVectorTypeSupport(); private final VectorSimilarityFunction vsf = VectorSimilarityFunction.EUCLIDEAN; diff --git a/jvector-examples/pom.xml b/jvector-examples/pom.xml index 9a4091276..111ecf679 100644 --- a/jvector-examples/pom.xml +++ b/jvector-examples/pom.xml @@ -13,26 +13,11 @@ JVector Examples 2.21.10 - 0.1.8 + 0.1.10 - - - - - - - - - - - - - - - org.codehaus.mojo exec-maven-plugin @@ -127,13 +112,13 @@ io.nosqlbench - vectordata - ${vectordata.version} + datatools-vectordata + ${datatools.version} io.nosqlbench - nbvectors - ${vectordata.version} + datatools-nbvectors + ${datatools.version} info.picocli diff --git 
a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/AutoBenchYAML.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/AutoBenchYAML.java index 86dc74659..479ce331a 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/AutoBenchYAML.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/AutoBenchYAML.java @@ -17,26 +17,27 @@ package io.github.jbellis.jvector.example; import com.fasterxml.jackson.databind.ObjectMapper; -import io.github.jbellis.jvector.example.util.BenchmarkSummarizer; +import io.github.jbellis.jvector.example.util.*; import io.github.jbellis.jvector.example.util.BenchmarkSummarizer.SummaryStats; -import io.github.jbellis.jvector.example.util.CheckpointManager; -import io.github.jbellis.jvector.example.util.DataSet; -import io.github.jbellis.jvector.example.util.DataSetLoader; import io.github.jbellis.jvector.example.yaml.ConstructionParameters; import io.github.jbellis.jvector.example.yaml.MultiConfig; import io.github.jbellis.jvector.example.yaml.SearchParameters; import io.github.jbellis.jvector.graph.disk.feature.FeatureId; +import io.nosqlbench.nbdatatools.api.concurrent.ProgressIndicator; +import io.nosqlbench.vectordata.discovery.TestDataSources; +import io.nosqlbench.vectordata.discovery.TestDataView; +import io.nosqlbench.vectordata.downloader.Catalog; +import io.nosqlbench.vectordata.downloader.DatasetEntry; +import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.File; import java.io.FileWriter; import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.Map; +import java.util.*; +import java.util.concurrent.CompletableFuture; import java.util.regex.Pattern; import java.util.stream.Collectors; @@ -122,6 +123,13 @@ public static void main(String[] args) throws IOException { // Add results from checkpoint if present 
results.addAll(checkpointManager.getCompletedResults()); + Catalog testDataCatalog = new TestDataSources().configure() + .addOptionalCatalogs("~/.config/vectordata/catalogs.yaml") + .catalog(); + + DataSetSource datasetSource = DataSetSource.DEFAULT + .and(loadStreamingDataSource(testDataCatalog)); + // Process datasets from regex patterns if (!datasetNames.isEmpty()) { for (var datasetName : datasetNames) { @@ -133,8 +141,9 @@ public static void main(String[] args) throws IOException { logger.info("Loading dataset: {}", datasetName); try { - DataSet ds = DataSetLoader.loadDataSet(datasetName); - logger.info("Dataset loaded: {} with {} vectors", datasetName, ds.baseVectors.size()); + DataSet ds = datasetSource.apply(datasetName) + .orElseThrow(() -> new RuntimeException("Unknown dataset: " + datasetName));; + logger.info("Dataset loaded: {} with {} vectors", datasetName, ds.getBaseVectors().size()); String normalizedDatasetName = datasetName; if (normalizedDatasetName.endsWith(".hdf5")) { @@ -212,4 +221,22 @@ public static void main(String[] args) throws IOException { } } + @NotNull + public static DataSetSource loadStreamingDataSource(Catalog catalog) { + return name -> { + Optional dsentryOption = catalog.matchOne(name); + if (dsentryOption.isEmpty()) { return Optional.empty(); } + DatasetEntry dsentry = dsentryOption.orElseThrow(() -> new RuntimeException("Unknown dataset: " + name)); + TestDataView tdv = dsentry.select().profile(name); + System.out.println("prebuffering dataset (assumed performance oriented testing)"); + CompletableFuture statusFuture = tdv.getBaseVectors().orElseThrow().prebuffer(); + if (statusFuture instanceof ProgressIndicator) { + ((ProgressIndicator)statusFuture).monitorProgress(1000); + } + + TestDataViewWrapper tdw = new TestDataViewWrapper(tdv); + System.out.println("Loaded " + tdw.getName() + " from streaming source, with base vectors prebuffered"); + return Optional.of(tdw); + }; + } } diff --git 
a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java index 6944e14bb..d44f916ee 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java @@ -21,6 +21,8 @@ import io.github.jbellis.jvector.example.util.DataSetSource; import io.github.jbellis.jvector.example.yaml.DatasetCollection; import io.github.jbellis.jvector.example.yaml.MultiConfig; +import io.nosqlbench.vectordata.discovery.TestDataSources; +import io.nosqlbench.vectordata.downloader.Catalog; import java.io.IOException; import java.util.ArrayList; @@ -46,7 +48,12 @@ public static void main(String[] args) throws IOException { var pattern = Pattern.compile(regex); var datasetCollection = DatasetCollection.load(); - DataSetSource datasetSource = DataSetLoader.DEFAULT; + Catalog testDataCatalog = new TestDataSources().configure() + .addOptionalCatalogs("~/.config/vectordata/catalogs.yaml") + .catalog(); + DataSetSource datasetSource = DataSetSource.DEFAULT + .and(AutoBenchYAML.loadStreamingDataSource(testDataCatalog)); + var datasetNames = datasetCollection.getAll().stream().filter(dn -> pattern.matcher(dn).find()).collect(Collectors.toList()); List allConfigs = new ArrayList<>(); @@ -56,6 +63,7 @@ public static void main(String[] args) throws IOException { for (var datasetName : datasetNames) { String finalDatasetName = datasetName; + DataSet ds = datasetSource.apply(datasetName) .orElseThrow(() -> new IllegalArgumentException( "Unknown dataset: " + finalDatasetName)); @@ -79,35 +87,19 @@ public static void main(String[] args) throws IOException { } } - // TODO: Reconcile these flows -// for (var datasetName : datasetNames) { -// String finalDatasetName = datasetName; -// DataSet ds = datasetSource.apply(datasetName) -// .orElseThrow(() -> new IllegalArgumentException( -// "Unknown 
dataset: " + finalDatasetName)); -// // DataSet ds = DataSetLoader.loadDataSet(datasetName); - - - for (var config : allConfigs) { - String datasetName = config.dataset; - - DataSet ds = datasetSource.apply(datasetName) - .orElseThrow(() -> new IllegalArgumentException( - "Unknown dataset: " + datasetName)); - // DataSet ds = DataSetLoader.loadDataSet(datasetName); - DataSet ds = DataSetLoader.loadDataSet(datasetName); + // Execute tests for all the mapped datasets and configs - // TODO: Reconcile these flows + for (var config : allConfigs) { + final String datasetName = config.dataset; -// if (datasetName.endsWith(".hdf5")) { -// datasetName = datasetName.substring(0, datasetName.length() - ".hdf5".length()); -// } -// MultiConfig config = MultiConfig.getDefaultConfig(datasetName); + DataSet ds = datasetSource.apply(datasetName) + .orElseThrow(() -> new IllegalArgumentException( + "Unknown dataset: " + datasetName)); - Grid.runAll(ds, config.construction.outDegree, config.construction.efConstruction, - config.construction.neighborOverflow, config.construction.addHierarchy, config.construction.refineFinalGraph, - config.construction.getFeatureSets(), config.construction.getCompressorParameters(), - config.search.getCompressorParameters(), config.search.topKOverquery, config.search.useSearchPruning); + Grid.runAll(ds, config.construction.outDegree, config.construction.efConstruction, + config.construction.neighborOverflow, config.construction.addHierarchy, config.construction.refineFinalGraph, + config.construction.getFeatureSets(), config.construction.getCompressorParameters(), + config.search.getCompressorParameters(), config.search.topKOverquery, config.search.useSearchPruning); } } } diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java index a2c8f0d61..c7095668f 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java +++ 
b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java @@ -277,7 +277,7 @@ private static Map, ImmutableGraphIndex> buildOnDisk(List, ImmutableGraphIndex> indexes = new HashMap<>(); @@ -590,9 +590,10 @@ public static List runAllAndCollectResults( ); for (Metric metric : metricsList) { Map metrics = java.util.Map.of(metric.getHeader(), metric.getValue()); - results.add(new BenchResult(ds.name, params, metrics)); + results.add(new BenchResult(ds.getName(), params, metrics)); } - results.add(new BenchResult(ds.name, params, Map.of("Index Build Time", indexBuildTimes.get(ds.name)))); + results.add(new BenchResult(ds.getName(), params, Map.of("Index " + + "Build Time", indexBuildTimes.get(ds.getName())))); } } } diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/BenchHarness.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/BenchHarness.java index 21cdf571d..1fb786969 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/BenchHarness.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/BenchHarness.java @@ -17,48 +17,12 @@ package io.github.jbellis.jvector.example.testrig; -import io.github.jbellis.jvector.disk.ReaderSupplierFactory; -import io.github.jbellis.jvector.example.Grid; -import io.github.jbellis.jvector.example.util.CompressorParameters; -import io.github.jbellis.jvector.example.util.DataSet; -import io.github.jbellis.jvector.example.util.Hdf5Loader; -import io.github.jbellis.jvector.graph.GraphIndex; -import io.github.jbellis.jvector.graph.GraphIndexBuilder; -import io.github.jbellis.jvector.graph.OnHeapGraphIndex; -import io.github.jbellis.jvector.graph.RandomAccessVectorValues; -import io.github.jbellis.jvector.graph.disk.OnDiskGraphIndex; -import io.github.jbellis.jvector.graph.disk.OnDiskGraphIndexWriter; -import io.github.jbellis.jvector.graph.disk.OrdinalMapper; -import 
io.github.jbellis.jvector.graph.disk.feature.Feature; -import io.github.jbellis.jvector.graph.disk.feature.FeatureId; -import io.github.jbellis.jvector.graph.disk.feature.FusedADC; -import io.github.jbellis.jvector.graph.disk.feature.InlineVectors; -import io.github.jbellis.jvector.graph.disk.feature.NVQ; -import io.github.jbellis.jvector.graph.similarity.BuildScoreProvider; -import io.github.jbellis.jvector.quantization.CompressedVectors; -import io.github.jbellis.jvector.quantization.NVQuantization; -import io.github.jbellis.jvector.quantization.PQVectors; -import io.github.jbellis.jvector.quantization.ProductQuantization; -import io.github.jbellis.jvector.quantization.VectorCompressor; -import io.github.jbellis.jvector.util.PhysicalCoreExecutor; import io.nosqlbench.nbdatatools.api.concurrent.ProgressIndicator; -import io.nosqlbench.vectordata.VectorTestData; import io.nosqlbench.vectordata.discovery.TestDataView; import io.nosqlbench.vectordata.downloader.DatasetEntry; import io.nosqlbench.vectordata.spec.datasets.types.BaseVectors; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.io.UncheckedIOException; -import java.nio.file.Files; -import java.nio.file.Path; import java.util.Arrays; -import java.util.EnumMap; -import java.util.EnumSet; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Set; import java.util.concurrent.*; public class BenchHarness implements Runnable { diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/Bench_CMD.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/Bench_CMD.java index 64f8bef08..2665f46a6 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/Bench_CMD.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/Bench_CMD.java @@ -91,7 +91,8 @@ public Integer call() throws Exception { 
System.out.println("Using profile " + _profile); Catalog catalog = new TestDataSources().addOptionalCatalogs(optionalCatalogs).catalog(); - DatasetEntry ds = catalog.findExact(_name).orElseThrow(); + DatasetEntry ds = catalog. + findExact(_name).orElseThrow(); // VectorTestData.catalogs().catalog().findExact(_name).orElseThrow(); BenchHarness harness = new BenchHarness(ds, _profile, concurrency); diff --git a/jvector-examples/src/test/java/io/github/jbellis/jvector/example/util/BenchmarkSummarizerTest.java b/jvector-examples/src/test/java/io/github/jbellis/jvector/example/util/BenchmarkSummarizerTest.java index 6168d5dca..e52ce78b7 100644 --- a/jvector-examples/src/test/java/io/github/jbellis/jvector/example/util/BenchmarkSummarizerTest.java +++ b/jvector-examples/src/test/java/io/github/jbellis/jvector/example/util/BenchmarkSummarizerTest.java @@ -17,7 +17,7 @@ import io.github.jbellis.jvector.example.BenchResult; import io.github.jbellis.jvector.example.util.BenchmarkSummarizer.SummaryStats; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.util.ArrayList; import java.util.HashMap; diff --git a/pom.xml b/pom.xml index ec2d326c9..9b294e798 100644 --- a/pom.xml +++ b/pom.xml @@ -198,10 +198,17 @@ + + org.junit.jupiter + junit-jupiter-api + 5.9.1 + test + org.junit.jupiter junit-jupiter-engine 5.9.1 + test com.carrotsearch.randomizedtesting From b5746ad6b4c85f642fb4869ec9774f40a3816bf1 Mon Sep 17 00:00:00 2001 From: Jonathan Shook Date: Thu, 9 Oct 2025 13:39:58 -0500 Subject: [PATCH 22/29] fix logback inclusion and exclude inactive module target dirs from rat --- .../src/main/resources/logback.xml | 19 ------------------- rat-excludes.txt | 3 ++- 2 files changed, 2 insertions(+), 20 deletions(-) delete mode 100644 jvector-examples/src/main/resources/logback.xml diff --git a/jvector-examples/src/main/resources/logback.xml b/jvector-examples/src/main/resources/logback.xml deleted file mode 100644 index 0a7d8846a..000000000 --- 
a/jvector-examples/src/main/resources/logback.xml +++ /dev/null @@ -1,19 +0,0 @@ - - - - - - true - - %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n - - - - - - - - - - - diff --git a/rat-excludes.txt b/rat-excludes.txt index 3dd5164ca..b858fc86e 100644 --- a/rat-excludes.txt +++ b/rat-excludes.txt @@ -25,4 +25,5 @@ results.csv scripts/test_node_setup.sh scripts/jmh_results_formatter.py yaml-configs/*.yml -local/* \ No newline at end of file +local/* +*/target/* \ No newline at end of file From 11ce260e2dc9ce6a90469cf65a2d354920236fd9 Mon Sep 17 00:00:00 2001 From: Jonathan Shook Date: Fri, 10 Oct 2025 15:15:48 -0500 Subject: [PATCH 23/29] bench refactorings phase 1 --- jvector-examples/pom.xml | 2 +- .../jvector/benchframe/BenchFrame.java | 601 ++++++++++++++++++ .../jvector/benchframe/BenchFrameCLI.java | 227 +++++++ .../jvector/benchframe/BenchFrameConfig.java | 490 ++++++++++++++ .../jvector/benchframe/BenchResult.java | 84 +++ .../benchframe/CheckpointStrategy.java | 177 ++++++ .../jvector/benchframe/ResultHandler.java | 201 ++++++ .../TestDataViewWrapper.java | 2 +- .../jvector/benchframe/package-info.java | 176 +++++ .../jvector/example/AutoBenchYAML.java | 237 ++----- .../github/jbellis/jvector/example/Bench.java | 112 +--- .../jbellis/jvector/example/Bench2D.java | 65 -- .../jbellis/jvector/example/BenchResult.java | 31 - .../jbellis/jvector/example/BenchYAML.java | 86 +-- .../github/jbellis/jvector/example/Grid.java | 3 +- .../example/util/BenchmarkSummarizer.java | 2 +- .../example/util/CheckpointManager.java | 15 +- .../jvector/example/util/DataSetLoader.java | 90 ++- .../jvector/example/util/DataSetSource.java | 3 +- .../example/util/BenchmarkSummarizerTest.java | 2 +- .../jvector/example/util/SummarizerTest.java | 2 +- 21 files changed, 2130 insertions(+), 478 deletions(-) create mode 100644 jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchFrame.java create mode 100644 
jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchFrameCLI.java create mode 100644 jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchFrameConfig.java create mode 100644 jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchResult.java create mode 100644 jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/CheckpointStrategy.java create mode 100644 jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/ResultHandler.java rename jvector-examples/src/main/java/io/github/jbellis/jvector/{example => benchframe}/TestDataViewWrapper.java (98%) create mode 100644 jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/package-info.java delete mode 100644 jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench2D.java delete mode 100644 jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchResult.java diff --git a/jvector-examples/pom.xml b/jvector-examples/pom.xml index 111ecf679..ad01ae381 100644 --- a/jvector-examples/pom.xml +++ b/jvector-examples/pom.xml @@ -42,7 +42,7 @@ true - io.github.jbellis.jvector.example.testrig.commands.TestRig_CMD + io.github.jbellis.jvector.benchframe.BenchFrameCLI diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchFrame.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchFrame.java new file mode 100644 index 000000000..6dd956be5 --- /dev/null +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchFrame.java @@ -0,0 +1,601 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.github.jbellis.jvector.benchframe; + +import io.github.jbellis.jvector.example.*; +import io.github.jbellis.jvector.example.util.DataSet; +import io.github.jbellis.jvector.example.util.DataSetSource; +import io.github.jbellis.jvector.example.yaml.DatasetCollection; +import io.github.jbellis.jvector.example.yaml.MultiConfig; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import picocli.CommandLine; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.function.Function; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + +/** + * Unified benchmark framework that consolidates functionality from Bench, BenchYAML, and AutoBenchYAML. + * Provides a modular, composable architecture using the strategy pattern for different components including + * configuration sources, result handlers, and checkpointing strategies. + *

+ * This class serves as the main orchestrator for JVector graph index benchmarks, supporting multiple execution + * modes from simple hardcoded configurations to complex CI/CD scenarios with checkpointing and automated result + * collection. + * + *

Environment Variables

+ *
    + *
  • {@code VECTORDATA_CATALOGS} - Comma-separated list of additional catalog YAML files to load + * (e.g., "~/.config/custom1/catalogs.yaml,~/.config/custom2/catalogs.yaml")
  • + *
+ * + *

Usage Examples

+ *

Command-Line Interface (Recommended)

+ *
{@code
+ * // Run with hardcoded parameters (Bench-style)
+ * BenchFrame.main(new String[]{"bench", "dataset-name"});
+ *
+ * // Run with YAML configuration (BenchYAML-style)
+ * BenchFrame.main(new String[]{"benchyaml", "dataset-name"});
+ *
+ * // Run in CI/CD mode with checkpointing (AutoBenchYAML-style)
+ * BenchFrame.main(new String[]{"autobenchyaml", "-o", "output", "dataset-name"});
+ *
+ * // List available datasets
+ * BenchFrame.main(new String[]{"datasets"});
+ *
+ * // Access nbvectors CLI
+ * BenchFrame.main(new String[]{"nbvectors", "--help"});
+ * }
+ * + *

Programmatic Usage - Convenience Methods

+ *
{@code
+ * // Use hardcoded defaults
+ * BenchFrame.likeBench().execute(args);
+ *
+ * // Use YAML configuration
+ * BenchFrame.likeBenchYAML().execute(args);
+ *
+ * // Use CI/CD mode with checkpointing
+ * BenchFrame.likeAutoBenchYAML(outputPath, diagnosticLevel).execute(args);
+ * }
+ * + *

Advanced - Custom Configuration with Builder

+ *
{@code
+ * // Use a single config for all datasets
+ * new BenchFrame.Builder()
+ *     .withDatasetNames(List.of("my-dataset", "another-dataset"))
+ *     .withConfig(BenchFrameConfig.createBenchDefaults())
+ *     .withDataSetSource(DataSetSource.DEFAULT)
+ *     .withResultHandler(ResultHandler.consoleOnly())
+ *     .build()
+ *     .execute(args);
+ *
+ * // Or use a function for per-dataset config (like YAML)
+ * new BenchFrame.Builder()
+ *     .withDatasetNames(List.of("dataset1", "dataset2"))
+ *     .withConfigFunction(name -> loadYamlConfig(name))
+ *     .build()
+ *     .execute(args);
+ * }
+ * + *

For Synthetic 2D Datasets (Programmatic Only)

+ *
{@code
+ * // Create and benchmark a 2D grid programmatically
+ * var grid2d = DataSetCreator.create2DGrid(4_000_000, 10_000, 100);
+ * BenchFrame.likeBench().execute(grid2d);
+ * }
+ *
+ * @see BenchFrameConfig
+ * @see ResultHandler
+ * @see CheckpointStrategy
+ * @see BenchFrameCLI
+ */
+public class BenchFrame {
+    private static final Logger logger = LoggerFactory.getLogger(BenchFrame.class);
+
+    // Candidate dataset names; execute(String[]) filters these with the regex built from its arguments.
+    private final List datasetNames;
+    // Shared configuration, used by getConfigForDataset when no configFunction is set.
+    private final BenchFrameConfig config;
+    // Optional per-dataset configuration lookup; takes precedence over the shared config when non-null.
+    private final Function configFunction;
+    private final DataSetSource dataSetSource;
+    private final ResultHandler resultHandler;
+    private final CheckpointStrategy checkpointStrategy;
+    // When true, executeBenchmark runs Grid.runAllAndCollectResults and returns its results.
+    private final boolean collectResults;
+    // Forwarded to Grid.setDiagnosticLevel only when greater than zero.
+    private final int diagnosticLevel;
+
+    /**
+     * Copies every setting from the given builder. Called from {@link Builder#build()}.
+     *
+     * @param builder the builder holding the configured settings
+     */
+    private BenchFrame(Builder builder) {
+        this.datasetNames = builder.datasetNames;
+        this.config = builder.config;
+        this.configFunction = builder.configFunction;
+        this.dataSetSource = builder.dataSetSource;
+        this.resultHandler = builder.resultHandler;
+        this.checkpointStrategy = builder.checkpointStrategy;
+        this.collectResults = builder.collectResults;
+        this.diagnosticLevel = builder.diagnosticLevel;
+    }
+
+    /**
+     * Executes the benchmark with a pre-created dataset. This method is intended for programmatically
+     * created datasets (for example synthetic 2D grids) but can be used with any {@link DataSet} instance.
+     *

+ * The execution includes: + *

    + *
  • Setting diagnostic level if configured
  • + *
  • Loading configuration for the dataset name
  • + *
  • Running the benchmark grid with configured parameters
  • + *
  • Handling results through the configured {@link ResultHandler}
  • + *
+     *
+     * @param dataset the pre-created dataset to benchmark
+     * @throws IOException declared on the signature; note that failures raised while resolving the
+     *         configuration, running the benchmark, or handling results are caught below and rethrown
+     *         as {@link RuntimeException}
+     * @throws RuntimeException wrapping any exception (including I/O errors) raised while processing the dataset
+     */
+    public void execute(DataSet dataset) throws IOException {
+        if (diagnosticLevel > 0) {
+            Grid.setDiagnosticLevel(diagnosticLevel);
+        }
+
+        logger.info("Executing benchmark for pre-created dataset: {}", dataset.getName());
+
+        // Unlike execute(String[]), this path does not consult the checkpoint strategy.
+        try {
+            BenchFrameConfig datasetConfig = getConfigForDataset(dataset.getName());
+            List results = executeBenchmark(dataset, datasetConfig);
+
+            resultHandler.handleResults(results);
+            logger.info("Benchmark execution complete");
+        } catch (Exception e) {
+            // Log with dataset context, then wrap; the original exception is preserved as the cause.
+            logger.error("Failed to process dataset: {}", dataset.getName(), e);
+            throw new RuntimeException("Benchmark failed for dataset: " + dataset.getName(), e);
+        }
+    }
+
+    /**
+     * Executes the benchmark with the given command-line arguments. This is the primary entry point for
+     * benchmarking one or more datasets by name pattern.
+     *

+ * The execution flow includes: + *

    + *
  1. Setting diagnostic level if configured
  2. + *
  3. Building a regex pattern from the provided arguments
  4. + *
  5. Filtering datasets by the pattern
  6. + *
  7. Loading previous results from checkpoint if checkpoint strategy is enabled
  8. + *
  9. For each matched dataset: + *
      + *
    • Checking if dataset should be skipped (already completed in checkpoint)
    • + *
    • Loading the dataset from the configured {@link DataSetSource}
    • + *
    • Resolving the per-dataset {@link BenchFrameConfig} (from the shared config or the config function)
    • + *
    • Executing the benchmark
    • + *
    • Recording completion in checkpoint if enabled
    • + *
    + *
  10. + *
  11. Handling all results through the configured {@link ResultHandler}
  12. + *
+     *
+     * @param args command-line arguments, typically dataset name patterns. Multiple patterns are OR'd together.
+     *             If empty, matches all datasets. Patterns support standard Java regex syntax.
+     * @throws IOException if it propagates from code outside the per-dataset try block (for example the
+     *         final {@code resultHandler.handleResults} call); per-dataset failures are wrapped instead
+     * @throws RuntimeException wrapping any exception raised while loading, configuring, or benchmarking
+     *         a dataset; the first such failure aborts the run before the final result handling
+     */
+    public void execute(String[] args) throws IOException {
+        if (diagnosticLevel > 0) {
+            Grid.setDiagnosticLevel(diagnosticLevel);
+        }
+
+        Pattern pattern = buildPattern(args);
+        List matchedDatasets = filterDatasets(datasetNames, pattern);
+
+        if (matchedDatasets.isEmpty()) {
+            logger.warn("No datasets matched pattern: {}", pattern.pattern());
+            return;
+        }
+
+        logger.info("Executing benchmarks for datasets: {}", matchedDatasets);
+
+        // Seed the result list with whatever the checkpoint strategy reports from earlier runs.
+        List allResults = new ArrayList<>(checkpointStrategy.getPreviousResults());
+
+        for (String datasetName : matchedDatasets) {
+            if (checkpointStrategy.shouldSkipDataset(datasetName)) {
+                logger.info("Skipping already completed dataset: {}", datasetName);
+                continue;
+            }
+
+            logger.info("Loading dataset: {}", datasetName);
+            try {
+                DataSet dataset = dataSetSource.apply(datasetName)
+                        .orElseThrow(() -> new RuntimeException("Unknown dataset: " + datasetName));
+
+                BenchFrameConfig datasetConfig = getConfigForDataset(datasetName);
+                List datasetResults = executeBenchmark(dataset, datasetConfig);
+
+                allResults.addAll(datasetResults);
+                // Record completion only after the benchmark for this dataset returned normally.
+                checkpointStrategy.recordCompletion(datasetName, datasetResults);
+
+                logger.info("Completed benchmark for dataset: {}", datasetName);
+            } catch (Exception e) {
+                // Log with dataset context, then wrap; the original exception is preserved as the cause.
+                logger.error("Failed to process dataset: {}", datasetName, e);
+                throw new RuntimeException("Benchmark failed for dataset: " + datasetName, e);
+            }
+        }
+
+        resultHandler.handleResults(allResults);
+        logger.info("Benchmark execution complete");
+    }
+
+    /**
+     * Gets the configuration for a specific dataset.
Uses configFunction if provided (for per-dataset config), + * otherwise uses the single shared config. + * + * @param datasetName the dataset name + * @return configuration for the dataset + */ + private BenchFrameConfig getConfigForDataset(String datasetName) { + if (configFunction != null) { + return configFunction.apply(datasetName); + } else { + // Use shared config, but set the dataset name + return config.toBuilder() + .withDatasetName(datasetName) + .build(); + } + } + + /** + * Executes the benchmark for a single dataset with the provided configuration. This method delegates + * to {@link Grid} for the actual benchmark execution. + * + * @param dataset the dataset to benchmark + * @param config the configuration specifying grid parameters and benchmark settings + * @return list of {@link BenchResult} objects if result collection is enabled, empty list otherwise + * @throws IOException if benchmark execution encounters I/O errors + */ + private List executeBenchmark(DataSet dataset, BenchFrameConfig config) throws IOException { + if (collectResults) { + return Grid.runAllAndCollectResults( + dataset, + config.getMGrid(), + config.getEfConstructionGrid(), + config.getNeighborOverflowGrid(), + config.getAddHierarchyGrid(), + config.getFeatureSets(), + config.getBuildCompressors(), + config.getSearchCompressors(), + config.getTopKOverqueryGrid(), + config.getUsePruningGrid() + ); + } else { + Grid.runAll( + dataset, + config.getMGrid(), + config.getEfConstructionGrid(), + config.getNeighborOverflowGrid(), + config.getAddHierarchyGrid(), + config.getRefineFinalGraphGrid(), + config.getFeatureSets(), + config.getBuildCompressors(), + config.getSearchCompressors(), + config.getTopKOverqueryGrid(), + config.getUsePruningGrid(), + config.getBenchmarkSpec() + ); + return List.of(); + } + } + + /** + * Builds a regex pattern from command-line arguments. Multiple patterns are OR'd together. + * Arguments can contain space-separated patterns that are split and combined. 
+ *

+ * Examples: + *

    + *
  • Empty args: matches everything (".*")
  • + *
  • {"dataset1"}: matches "dataset1"
  • + *
  • {"dataset1", "dataset2"}: matches "dataset1" OR "dataset2"
  • + *
  • {"dataset1 dataset2"}: matches "dataset1" OR "dataset2" (space-split)
  • + *
+ * + * @param args command-line arguments containing dataset name patterns + * @return compiled regex pattern for dataset filtering + */ + private static Pattern buildPattern(String[] args) { + var regex = args.length == 0 ? ".*" + : Arrays.stream(args) + .flatMap(s -> Arrays.stream(s.split("\\s"))) + .map(s -> "(?:" + s + ")") + .collect(Collectors.joining("|")); + return Pattern.compile(regex); + } + + /** + * Filters dataset names by regex pattern using partial matching (find, not full match). + * + * @param datasets the list of dataset names to filter + * @param pattern the regex pattern to match against + * @return list of dataset names where the pattern was found + */ + private static List filterDatasets(List datasets, Pattern pattern) { + return datasets.stream() + .filter(name -> pattern.matcher(name).find()) + .collect(Collectors.toList()); + } + + /** + * Creates a BenchFrame configured like the original Bench.java with hardcoded grid parameters. + * This factory method provides compatibility with the legacy Bench class behavior. + *

+ * Configuration includes: + *

    + *
  • Datasets loaded from {@link DatasetCollection}
  • + *
  • Hardcoded default grid parameters (M=32, efConstruction=100, etc.)
  • + *
  • Console-only output (no file writing)
  • + *
  • No checkpointing
  • + *
+     *
+     * @return a BenchFrame instance configured with hardcoded defaults
+     * @throws UncheckedIOException if the dataset collection cannot be loaded
+     */
+    public static BenchFrame likeBench() {
+        try {
+            return new Builder()
+                    .withDatasetNames(DatasetCollection.load().getAll())
+                    .withConfig(BenchFrameConfig.createBenchDefaults())
+                    .withDataSetSource(DataSetSource.DEFAULT)
+                    .withResultHandler(ResultHandler.consoleOnly())
+                    .build();
+        } catch (IOException e) {
+            // Keep the factory free of checked exceptions; the IOException stays attached as the cause.
+            throw new UncheckedIOException("Failed to load dataset collection", e);
+        }
+    }
+
+    /**
+     * Creates a BenchFrame configured like the original BenchYAML.java with YAML-based configuration.
+     * This factory method provides compatibility with the legacy BenchYAML class behavior.
+     *

+ * Configuration includes: + *

    + *
  • Datasets loaded from {@link DatasetCollection}
  • + *
  • Parameters loaded from YAML configuration files per dataset
  • + *
  • Console-only output (no file writing)
  • + *
  • No checkpointing
  • + *
+     *
+     * @return a BenchFrame instance configured to load parameters from YAML
+     * @throws UncheckedIOException if the dataset collection cannot be loaded
+     */
+    public static BenchFrame likeBenchYAML() {
+        try {
+            return new Builder()
+                    .withDatasetNames(DatasetCollection.load().getAll())
+                    // The YAML config is looked up lazily, once per dataset, when the frame asks for it.
+                    .withConfigFunction(datasetName -> {
+                        try {
+                            MultiConfig multiConfig = MultiConfig.getDefaultConfig(datasetName);
+                            return BenchFrameConfig.fromMultiConfig(multiConfig);
+                        } catch (FileNotFoundException e) {
+                            // A lambda used as a Function cannot throw checked exceptions, so wrap it.
+                            throw new RuntimeException("Failed to load YAML config for dataset: " + datasetName, e);
+                        }
+                    })
+                    .withDataSetSource(DataSetSource.DEFAULT)
+                    .withResultHandler(ResultHandler.consoleOnly())
+                    .build();
+        } catch (IOException e) {
+            // Keep the factory free of checked exceptions; the IOException stays attached as the cause.
+            throw new UncheckedIOException("Failed to load dataset collection", e);
+        }
+    }
+
+    /**
+     * Creates a BenchFrame configured like the original AutoBenchYAML.java for CI/CD scenarios.
+     * This factory method provides compatibility with the legacy AutoBenchYAML class behavior
+     * with additional support for checkpointing and file-based output.
+     *

+ * Configuration includes: + *

    + *
  • Hardcoded dataset list for CI/CD: cap-1M, cap-6M, cohere-english-v3-1M, + * cohere-english-v3-10M, dpr-1M, dpr-10M
  • + *
  • Parameters loaded from autoDefault YAML configuration
  • + *
  • File-based output: CSV summary and JSON details
  • + *
  • File-based checkpointing to support resumption after failures
  • + *
  • Result collection enabled
  • + *
  • Configurable diagnostic level
  • + *
+ * + * @param outputPath base path for output files (.csv, .json, .checkpoint.json) + * @param diagnosticLevel diagnostic level controlling Grid output verbosity + * (0=none, 1=basic, 2=detailed, 3=verbose) + * @return a BenchFrame instance configured for CI/CD with checkpointing + * @see ResultHandler#toFiles(String) + * @see CheckpointStrategy#fileBasedCheckpointing(String) + */ + public static BenchFrame likeAutoBenchYAML(String outputPath, int diagnosticLevel) { + // Hardcoded list for CI/CD (matches original AutoBenchYAML) + List datasets = Arrays.asList( + "cap-1M", "cap-6M", + "cohere-english-v3-1M", "cohere-english-v3-10M", + "dpr-1M", "dpr-10M" + ); + + try { + MultiConfig multiConfig = MultiConfig.getDefaultConfig("autoDefault"); + BenchFrameConfig baseConfig = BenchFrameConfig.fromMultiConfig(multiConfig) + .toBuilder() + .collectResults(true) + .build(); + + return new Builder() + .withDatasetNames(datasets) + .withConfig(baseConfig) + .withDataSetSource(DataSetSource.DEFAULT) + .withResultHandler(ResultHandler.toFiles(outputPath)) + .withCheckpointStrategy(CheckpointStrategy.fileBasedCheckpointing(outputPath)) + .withDiagnosticLevel(diagnosticLevel) + .build(); + } catch (FileNotFoundException e) { + throw new RuntimeException("Failed to load autoDefault YAML config", e); + } + } + + /** + * Builder for constructing BenchFrame instances with fluent API. Provides fine-grained control + * over all aspects of benchmark configuration including datasets, configuration, + * result handling, and checkpointing. + *

+ * Default values: + *

    + *
  • datasetNames: empty list
  • + *
  • config: null (must be set via withConfig or withConfigFunction)
  • + *
  • configFunction: null
  • + *
  • dataSetSource: {@link DataSetSource#DEFAULT}
  • + *
  • resultHandler: {@link ResultHandler#consoleOnly()}
  • + *
  • checkpointStrategy: {@link CheckpointStrategy#none()}
  • + *
  • collectResults: false
  • + *
  • diagnosticLevel: 0
  • + *
+     */
+    public static class Builder {
+        private List datasetNames = List.of();
+        // Exactly one of config / configFunction is expected to be non-null; each setter clears the other.
+        private BenchFrameConfig config = null;
+        private Function configFunction = null;
+        private DataSetSource dataSetSource = DataSetSource.DEFAULT;
+        private ResultHandler resultHandler = ResultHandler.consoleOnly();
+        private CheckpointStrategy checkpointStrategy = CheckpointStrategy.none();
+        private boolean collectResults = false;
+        private int diagnosticLevel = 0;
+
+        /**
+         * Sets the list of dataset names to benchmark. The provided list is copied to prevent external modification.
+         *
+         * @param datasetNames the list of dataset names to benchmark; must not be null
+         * @return this builder for method chaining
+         */
+        public Builder withDatasetNames(List datasetNames) {
+            this.datasetNames = new ArrayList<>(datasetNames);
+            return this;
+        }
+
+        /**
+         * Sets a single configuration to use for all datasets.
+         * Mutually exclusive with {@link #withConfigFunction}: calling this clears any config function
+         * set earlier.
+         *
+         * @param config the configuration to use for all datasets
+         * @return this builder for method chaining
+         */
+        public Builder withConfig(BenchFrameConfig config) {
+            this.config = config;
+            this.configFunction = null;
+            return this;
+        }
+
+        /**
+         * Sets a function to generate configuration per dataset (e.g., for YAML-based config).
+         * Mutually exclusive with {@link #withConfig}: calling this clears any shared config set earlier.
+         *
+         * @param configFunction function mapping dataset name to configuration
+         * @return this builder for method chaining
+         */
+        public Builder withConfigFunction(Function configFunction) {
+            this.configFunction = configFunction;
+            this.config = null;
+            return this;
+        }
+
+        /**
+         * Sets the DataSetSource for loading datasets by name.
+         *
+         * @param source the dataset source to use
+         * @return this builder for method chaining
+         */
+        public Builder withDataSetSource(DataSetSource source) {
+            this.dataSetSource = source;
+            return this;
+        }
+
+        /**
+         * Sets the result handler strategy for processing benchmark results.
+         *
+         * @param handler the result handler strategy to use
+         * @return this builder for method chaining
+         * @see ResultHandler#consoleOnly()
+         * @see ResultHandler#toFiles(String)
+         */
+        public Builder withResultHandler(ResultHandler handler) {
+            this.resultHandler = handler;
+            return this;
+        }
+
+        /**
+         * Sets the checkpoint strategy for tracking and resuming benchmark progress.
+         *
+         * @param strategy the checkpoint strategy to use
+         * @return this builder for method chaining
+         * @see CheckpointStrategy#none()
+         * @see CheckpointStrategy#fileBasedCheckpointing(String)
+         */
+        public Builder withCheckpointStrategy(CheckpointStrategy strategy) {
+            this.checkpointStrategy = strategy;
+            return this;
+        }
+
+        /**
+         * Enables or disables result collection. When enabled, benchmark results are collected and returned
+         * from the execution. This is required for file output and checkpointing functionality.
+         *
+         * @param collect true to collect results, false to discard them
+         * @return this builder for method chaining
+         */
+        public Builder collectResults(boolean collect) {
+            this.collectResults = collect;
+            return this;
+        }
+
+        /**
+         * Sets the diagnostic level for Grid execution output.
+         *
+         * @param level diagnostic level: 0=none, 1=basic, 2=detailed, 3=verbose
+         * @return this builder for method chaining
+         */
+        public Builder withDiagnosticLevel(int level) {
+            this.diagnosticLevel = level;
+            return this;
+        }
+
+        /**
+         * Builds and returns a configured BenchFrame instance. No validation is performed here, so a
+         * builder on which neither {@link #withConfig} nor {@link #withConfigFunction} was called
+         * produces a frame with no configuration.
+         *
+         * @return a new BenchFrame instance with the configured settings
+         */
+        public BenchFrame build() {
+            return new BenchFrame(this);
+        }
+    }
+
+    /**
+     * Main entry point for command-line execution. Delegates to {@link BenchFrameCLI} for
+     * command-line parsing and subcommand handling.
+     * The process is terminated via {@link System#exit(int)} with the exit code returned by picocli.
+     *
+     * @param args command-line arguments
+     * @see BenchFrameCLI
+     */
+    public static void main(String[] args) {
+        int exitCode = new CommandLine(new BenchFrameCLI()).execute(args);
+        System.exit(exitCode);
+    }
+}
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchFrameCLI.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchFrameCLI.java
new file mode 100644
index 000000000..7c8c9512e
--- /dev/null
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchFrameCLI.java
@@ -0,0 +1,227 @@
+package io.github.jbellis.jvector.benchframe;
+
+import picocli.CommandLine;
+
+import java.io.IOException;
+import java.util.concurrent.Callable;
+
+/**
+ * Command-line interface for BenchFrame using PicoCLI. Provides subcommands for all
+ * benchmark modes from the original benchmark classes (Bench, BenchYAML, AutoBenchYAML)
+ * plus dataset management via integration with nbvectors CLI.
+ *

+ * This CLI serves as the primary entry point for command-line benchmark execution and + * delegates to {@link BenchFrame} for actual benchmark orchestration. + * + *

Available Subcommands

+ *
    + *
  • {@code bench} - Run with hardcoded grid parameters (Bench.java style)
  • + *
  • {@code benchyaml} - Run with YAML-based configuration (BenchYAML.java style)
  • + *
  • {@code autobenchyaml} - Run in CI/CD mode with checkpointing (AutoBenchYAML.java style)
  • + *
  • {@code datasets} - List and manage vector datasets (delegates to nbvectors)
  • + *
  • {@code nbvectors} - Access full nbvectors CLI functionality
  • + *
+ * + *

Usage Examples

+ *
+ * # Show help
+ * java -jar benchframe.jar --help
+ *
+ * # Run Bench-style on specific datasets
+ * java -jar benchframe.jar bench "dataset1|dataset2"
+ *
+ * # Run YAML-style on all datasets
+ * java -jar benchframe.jar benchyaml
+ *
+ * # Run CI/CD mode with output files
+ * java -jar benchframe.jar autobenchyaml -o results/benchmark
+ *
+ * # List available datasets
+ * java -jar benchframe.jar datasets
+ *
+ * # Access nbvectors CLI
+ * java -jar benchframe.jar nbvectors --help
+ * 
+ *
+ * @see BenchFrame
+ * @see BenchCommand
+ * @see BenchYAMLCommand
+ * @see AutoBenchYAMLCommand
+ * @see DatasetsCommand
+ * @see NBVectorsCommand
+ */
+@CommandLine.Command(
+        name = "benchframe",
+        mixinStandardHelpOptions = true,
+        version = "1.0",
+        description = "Unified benchmark framework for JVector graph indexes",
+        subcommands = {
+                BenchFrameCLI.BenchCommand.class,
+                BenchFrameCLI.BenchYAMLCommand.class,
+                BenchFrameCLI.AutoBenchYAMLCommand.class,
+                BenchFrameCLI.DatasetsCommand.class,
+                BenchFrameCLI.NBVectorsCommand.class
+        }
+)
+public class BenchFrameCLI implements Callable {
+
+    /**
+     * Called when no subcommand is specified. Prints the usage text to standard output.
+     *
+     * @return exit code 0
+     */
+    @Override
+    public Integer call() {
+        // If no subcommand, show help
+        CommandLine.usage(this, System.out);
+        return 0;
+    }
+
+    /**
+     * Subcommand for running Bench-style benchmarks with hardcoded grid parameters.
+     * Provides compatibility with the original Bench.java behavior.
+     *

+     * Uses fixed default parameters (M=32, efConstruction=100, etc.) and loads
+     * datasets from the DatasetCollection.
+     */
+    @CommandLine.Command(
+            name = "bench",
+            description = "Run benchmarks with hardcoded grid parameters (original Bench.java style)"
+    )
+    static class BenchCommand implements Callable {
+        // Positional arguments; passed unchanged to BenchFrame.execute(String[]) as name patterns.
+        @CommandLine.Parameters(
+                arity = "0..*",
+                description = "Dataset name patterns (regex). If not specified, matches all datasets."
+        )
+        private String[] datasets = new String[0];
+
+        /**
+         * Prints the JVM's maximum heap size, then runs the Bench-style frame on the given patterns.
+         *
+         * @return exit code 0 when execution returns normally
+         * @throws IOException if propagated from {@link BenchFrame#execute(String[])}
+         */
+        @Override
+        public Integer call() throws IOException {
+            System.out.println("Heap space available is " + Runtime.getRuntime().maxMemory());
+            BenchFrame.likeBench().execute(datasets);
+            return 0;
+        }
+    }
+
+    /**
+     * Subcommand for running BenchYAML-style benchmarks with YAML-based configuration.
+     * Provides compatibility with the original BenchYAML.java behavior.
+     *

+     * Loads benchmark parameters from YAML files per dataset, allowing different
+     * configurations for different datasets.
+     */
+    @CommandLine.Command(
+            name = "benchyaml",
+            description = "Run benchmarks with YAML-based configuration (original BenchYAML.java style)"
+    )
+    static class BenchYAMLCommand implements Callable {
+        // NOTE(review): the help text mentions YAML config files, but the values are passed straight to
+        // BenchFrame.execute(String[]), which only treats them as name patterns - confirm the wording.
+        @CommandLine.Parameters(
+                arity = "0..*",
+                description = "Dataset name patterns (regex) or YAML config files. If not specified, matches all datasets."
+        )
+        private String[] datasets = new String[0];
+
+        /**
+         * Prints the JVM's maximum heap size, then runs the YAML-configured frame on the given patterns.
+         *
+         * @return exit code 0 when execution returns normally
+         * @throws IOException if propagated from {@link BenchFrame#execute(String[])}
+         */
+        @Override
+        public Integer call() throws IOException {
+            System.out.println("Heap space available is " + Runtime.getRuntime().maxMemory());
+            BenchFrame.likeBenchYAML().execute(datasets);
+            return 0;
+        }
+    }
+
+    /**
+     * Subcommand for running AutoBench-style benchmarks in CI/CD mode with checkpointing.
+     * Provides compatibility with the original AutoBenchYAML.java behavior.
+     *

+ * Features: + *

    + *
  • File-based checkpointing for resumption after failures
  • + *
  • CSV summary and JSON detail output
  • + *
  • Hardcoded dataset list for consistent CI/CD runs
  • + *
  • Configurable diagnostic output level
  • + *
+     */
+    @CommandLine.Command(
+            name = "autobenchyaml",
+            description = "Run benchmarks for CI/CD with checkpointing and file output (original AutoBenchYAML.java style)"
+    )
+    static class AutoBenchYAMLCommand implements Callable {
+        // Positional arguments; passed unchanged to BenchFrame.execute(String[]) as name patterns.
+        @CommandLine.Parameters(
+                arity = "0..*",
+                description = "Dataset name patterns (regex). If not specified, matches all datasets."
+        )
+        private String[] datasets = new String[0];
+
+        // Passed to BenchFrame.likeAutoBenchYAML, which uses it for both the result files and the checkpoint.
+        @CommandLine.Option(
+                names = {"-o", "--output"},
+                required = true,
+                description = "Base path for output files (.csv, .json, .checkpoint.json)"
+        )
+        private String outputPath;
+
+        @CommandLine.Option(
+                names = {"-d", "--diag"},
+                description = "Diagnostic level: 0=none, 1=basic, 2=detailed, 3=verbose (default: ${DEFAULT-VALUE})",
+                defaultValue = "0"
+        )
+        private int diagnosticLevel;
+
+        /**
+         * Prints the JVM's maximum heap size, then runs the CI/CD-style frame on the given patterns.
+         *
+         * @return exit code 0 when execution returns normally
+         * @throws IOException if propagated from {@link BenchFrame#execute(String[])}
+         */
+        @Override
+        public Integer call() throws IOException {
+            System.out.println("Heap space available is " + Runtime.getRuntime().maxMemory());
+            BenchFrame.likeAutoBenchYAML(outputPath, diagnosticLevel).execute(datasets);
+            return 0;
+        }
+    }
+
+    /**
+     * Subcommand that delegates to the datatools-nbvectors datasets command.
+     * Provides access to dataset listing and management functionality.
+     */
+    @CommandLine.Command(
+            name = "datasets",
+            description = "List and manage vector datasets (delegates to nbvectors datasets command)"
+    )
+    static class DatasetsCommand implements Callable {
+        @CommandLine.Parameters(
+                arity = "0..*",
+                description = "Arguments to pass to the nbvectors datasets command"
+        )
+        private String[] args = new String[0];
+
+        /**
+         * Prepends the literal {@code "datasets"} to the received arguments and hands the resulting
+         * array to {@code CommandBundler.main}.
+         *
+         * @return exit code 0 if {@code CommandBundler.main} returns normally
+         * @throws Exception if propagated from {@code CommandBundler.main}
+         */
+        @Override
+        public Integer call() throws Exception {
+            // Delegate to CommandBundler with datasets subcommand
+            String[] nbvectorArgs = new String[args.length + 1];
+            nbvectorArgs[0] = "datasets";
+            System.arraycopy(args, 0, nbvectorArgs, 1, args.length);
+
+            io.nosqlbench.commands.CommandBundler.main(nbvectorArgs);
+            return 0;
+        }
+    }
+
+    /**
+     * Subcommand that delegates to the datatools-nbvectors main CLI.
+     * Provides access to the full nbvectors command-line functionality.
+     */
+    @CommandLine.Command(
+            name = "nbvectors",
+            description = "Access full nbvectors CLI functionality (delegates to CommandBundler)"
+    )
+    static class NBVectorsCommand implements Callable {
+        @CommandLine.Parameters(
+                arity = "0..*",
+                description = "Arguments to pass to the nbvectors CLI"
+        )
+        private String[] args = new String[0];
+
+        /**
+         * Passes the received arguments unchanged to {@code CommandBundler.main}.
+         *
+         * @return exit code 0 if {@code CommandBundler.main} returns normally
+         * @throws Exception if propagated from {@code CommandBundler.main}
+         */
+        @Override
+        public Integer call() throws Exception {
+            // Delegate to CommandBundler
+            io.nosqlbench.commands.CommandBundler.main(args);
+            return 0;
+        }
+    }
+
+}
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchFrameConfig.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchFrameConfig.java
new file mode 100644
index 000000000..a3991d5b9
--- /dev/null
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchFrameConfig.java
@@ -0,0 +1,490 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package io.github.jbellis.jvector.benchframe; + +import io.github.jbellis.jvector.example.util.CompressorParameters; +import io.github.jbellis.jvector.example.util.CompressorParameters.PQParameters; +import io.github.jbellis.jvector.example.util.DataSet; +import io.github.jbellis.jvector.example.yaml.MultiConfig; +import io.github.jbellis.jvector.graph.disk.feature.FeatureId; +import io.github.jbellis.jvector.vector.VectorSimilarityFunction; + +import java.util.*; +import java.util.function.Function; + +import static io.github.jbellis.jvector.quantization.KMeansPlusPlusClusterer.UNWEIGHTED; + +/** + * Typesafe configuration class for benchmark execution. Provides a unified, immutable configuration + * model that encapsulates all parameters needed to run a benchmark including graph construction + * parameters, search parameters, and feature sets. + *

+ * This class can be used programmatically through its {@link Builder} or constructed from + * YAML-based {@link MultiConfig} using {@link #fromMultiConfig(MultiConfig)}. + *

+ * All collections returned by getter methods are unmodifiable to maintain immutability. + * + *

Usage Examples

+ *
{@code
+ * // Create from YAML MultiConfig
+ * MultiConfig yaml = MultiConfig.getDefaultConfig("dataset-name");
+ * BenchFrameConfig config = BenchFrameConfig.fromMultiConfig(yaml);
+ *
+ * // Create with Builder
+ * BenchFrameConfig config = new BenchFrameConfig.Builder()
+ *     .withDatasetName("my-dataset")
+ *     .withMGrid(List.of(16, 32, 64))
+ *     .withEfConstructionGrid(List.of(100, 200))
+ *     .build();
+ *
+ * // Use default Bench-style configuration
+ * BenchFrameConfig defaults = BenchFrameConfig.createBenchDefaults();
+ * }
+ * + * @see BenchFrameConfigSource + * @see MultiConfig + */ +public class BenchFrameConfig { + // Dataset identification + private final String datasetName; + + // Graph construction parameters + private final List mGrid; + private final List efConstructionGrid; + private final List neighborOverflowGrid; + private final List addHierarchyGrid; + private final List refineFinalGraphGrid; + private final List> featureSets; + private final List> buildCompressors; + + // Search parameters + private final List> searchCompressors; + private final Map> topKOverqueryGrid; + private final List usePruningGrid; + + // Benchmark selection + private final Map> benchmarkSpec; + + // Result collection mode + private final boolean collectResults; + + private BenchFrameConfig(Builder builder) { + this.datasetName = builder.datasetName; + this.mGrid = Collections.unmodifiableList(builder.mGrid); + this.efConstructionGrid = Collections.unmodifiableList(builder.efConstructionGrid); + this.neighborOverflowGrid = Collections.unmodifiableList(builder.neighborOverflowGrid); + this.addHierarchyGrid = Collections.unmodifiableList(builder.addHierarchyGrid); + this.refineFinalGraphGrid = Collections.unmodifiableList(builder.refineFinalGraphGrid); + this.featureSets = Collections.unmodifiableList(builder.featureSets); + this.buildCompressors = Collections.unmodifiableList(builder.buildCompressors); + this.searchCompressors = Collections.unmodifiableList(builder.searchCompressors); + this.topKOverqueryGrid = Collections.unmodifiableMap(builder.topKOverqueryGrid); + this.usePruningGrid = Collections.unmodifiableList(builder.usePruningGrid); + this.benchmarkSpec = builder.benchmarkSpec == null ? null : Collections.unmodifiableMap(builder.benchmarkSpec); + this.collectResults = builder.collectResults; + } + + /** + * Returns the dataset name associated with this configuration. 
+ * + * @return the dataset name, may be null if not specified + */ + public String getDatasetName() { return datasetName; } + + /** + * Returns the grid of M (max connections per node) values to test. + * + * @return unmodifiable list of M values + */ + public List getMGrid() { return mGrid; } + + /** + * Returns the grid of efConstruction values to test during graph construction. + * + * @return unmodifiable list of efConstruction values + */ + public List getEfConstructionGrid() { return efConstructionGrid; } + + /** + * Returns the grid of neighbor overflow multipliers to test. This controls how many + * candidate neighbors are considered relative to M during graph construction. + * + * @return unmodifiable list of neighbor overflow multipliers + */ + public List getNeighborOverflowGrid() { return neighborOverflowGrid; } + + /** + * Returns the grid of add hierarchy boolean values indicating whether to use hierarchical + * graph construction. + * + * @return unmodifiable list of boolean values + */ + public List getAddHierarchyGrid() { return addHierarchyGrid; } + + /** + * Returns the grid of refine final graph boolean values indicating whether to perform + * final graph refinement after construction. + * + * @return unmodifiable list of boolean values + */ + public List getRefineFinalGraphGrid() { return refineFinalGraphGrid; } + + /** + * Returns the feature sets to test. Each set contains {@link FeatureId}s that enable + * specific features like inline vectors or NVQ vectors. + * + * @return unmodifiable list of feature sets + */ + public List> getFeatureSets() { return featureSets; } + + /** + * Returns the compressor functions to use during graph construction. Each function takes + * a {@link DataSet} and returns appropriate {@link CompressorParameters}. 
+ * + * @return unmodifiable list of compressor parameter functions + */ + public List> getBuildCompressors() { return buildCompressors; } + + /** + * Returns the compressor functions to use during search. Each function takes + * a {@link DataSet} and returns appropriate {@link CompressorParameters}. + * + * @return unmodifiable list of compressor parameter functions + */ + public List> getSearchCompressors() { return searchCompressors; } + + /** + * Returns the grid of topK overquery multipliers mapped by K value. For example, + * a map entry of (10, [1.0, 2.0, 5.0]) means for top-10 queries, test overquery + * factors of 1.0x, 2.0x, and 5.0x. + * + * @return unmodifiable map of K values to overquery multipliers + */ + public Map> getTopKOverqueryGrid() { return topKOverqueryGrid; } + + /** + * Returns the grid of boolean values indicating whether to use search pruning. + * + * @return unmodifiable list of boolean values + */ + public List getUsePruningGrid() { return usePruningGrid; } + + /** + * Returns the benchmark specification mapping benchmark types to their configurations. + * A null value indicates all default benchmarks should be run. + * + * @return unmodifiable map of benchmark specifications, or null for default benchmarks + */ + public Map> getBenchmarkSpec() { return benchmarkSpec; } + + /** + * Returns whether results should be collected and returned from benchmark execution. + * + * @return true if results should be collected, false otherwise + */ + public boolean shouldCollectResults() { return collectResults; } + + /** + * Creates a new {@link Builder} initialized with this configuration's values. + * This is useful for creating modified copies of existing configurations. 
+ * + * @return a new Builder with this configuration's values + */ + public Builder toBuilder() { + return new Builder() + .withDatasetName(datasetName) + .withMGrid(mGrid) + .withEfConstructionGrid(efConstructionGrid) + .withNeighborOverflowGrid(neighborOverflowGrid) + .withAddHierarchyGrid(addHierarchyGrid) + .withRefineFinalGraphGrid(refineFinalGraphGrid) + .withFeatureSets(featureSets) + .withBuildCompressors(buildCompressors) + .withSearchCompressors(searchCompressors) + .withTopKOverqueryGrid(topKOverqueryGrid) + .withUsePruningGrid(usePruningGrid) + .withBenchmarkSpec(benchmarkSpec) + .collectResults(collectResults); + } + + /** + * Creates a BenchFrameConfig from a YAML-based {@link MultiConfig}. This factory method + * provides compatibility with the existing YAML configuration system. + * + * @param config the MultiConfig to convert + * @return a new BenchFrameConfig with values from the MultiConfig + */ + public static BenchFrameConfig fromMultiConfig(MultiConfig config) { + return new Builder() + .withDatasetName(config.dataset) + .withMGrid(config.construction.outDegree) + .withEfConstructionGrid(config.construction.efConstruction) + .withNeighborOverflowGrid(config.construction.neighborOverflow) + .withAddHierarchyGrid(config.construction.addHierarchy) + .withRefineFinalGraphGrid(config.construction.refineFinalGraph) + .withFeatureSets(config.construction.getFeatureSets()) + .withBuildCompressors(config.construction.getCompressorParameters()) + .withSearchCompressors(config.search.getCompressorParameters()) + .withTopKOverqueryGrid(config.search.topKOverquery) + .withUsePruningGrid(config.search.useSearchPruning) + .withBenchmarkSpec(config.search.benchmarks) + .build(); + } + + /** + * Creates a default configuration matching the original Bench.java's hardcoded parameters. + * This provides a baseline configuration suitable for most benchmark scenarios. + *

+ * Default values include: + *

    + *
  • M: 32
  • + *
  • efConstruction: 100
  • + *
  • neighborOverflow: 1.2
  • + *
  • addHierarchy: true
  • + *
  • refineFinalGraph: true
  • + *
  • usePruning: true
  • + *
  • topK overquery: 10 -> [1.0, 2.0, 5.0, 10.0], 100 -> [1.0, 2.0]
  • + *
  • Feature sets: NVQ_VECTORS and INLINE_VECTORS
  • + *
  • Compressors: PQ and none for build; none and PQ for search
  • + *
+ * + * @return a new BenchFrameConfig with default Bench.java values + */ + public static BenchFrameConfig createBenchDefaults() { + return new Builder() + .withMGrid(List.of(32)) + .withEfConstructionGrid(List.of(100)) + .withNeighborOverflowGrid(List.of(1.2f)) + .withAddHierarchyGrid(List.of(true)) + .withRefineFinalGraphGrid(List.of(true)) + .withUsePruningGrid(List.of(true)) + .withTopKOverqueryGrid(Map.of( + 10, List.of(1.0, 2.0, 5.0, 10.0), + 100, List.of(1.0, 2.0) + )) + .withFeatureSets(Arrays.asList( + EnumSet.of(FeatureId.NVQ_VECTORS), + EnumSet.of(FeatureId.INLINE_VECTORS) + )) + .withBuildCompressors(Arrays.asList( + ds -> new PQParameters(ds.getDimension() / 8, + 256, + ds.getSimilarityFunction() == VectorSimilarityFunction.EUCLIDEAN, + UNWEIGHTED), + __ -> CompressorParameters.NONE + )) + .withSearchCompressors(Arrays.asList( + __ -> CompressorParameters.NONE, + ds -> new PQParameters(ds.getDimension() / 8, + 256, + ds.getSimilarityFunction() == VectorSimilarityFunction.EUCLIDEAN, + UNWEIGHTED) + )) + .build(); + } + + /** + * Builder for fluent BenchFrameConfig construction. All builder methods return the builder + * instance for method chaining. Collections provided to builder methods are defensively + * copied to prevent external modification. + *

+ * Default values provide sensible single-value grids: + *

    + *
  • mGrid: [32]
  • + *
  • efConstructionGrid: [100]
  • + *
  • neighborOverflowGrid: [1.2]
  • + *
  • addHierarchyGrid: [true]
  • + *
  • refineFinalGraphGrid: [true]
  • + *
  • featureSets: [INLINE_VECTORS]
  • + *
  • buildCompressors: [NONE]
  • + *
  • searchCompressors: [NONE]
  • + *
  • topKOverqueryGrid: {10: [1.0]}
  • + *
  • usePruningGrid: [true]
  • + *
  • benchmarkSpec: null (use default benchmarks)
  • + *
  • collectResults: false
  • + *
+ */ + public static class Builder { + private String datasetName; + private List mGrid = List.of(32); + private List efConstructionGrid = List.of(100); + private List neighborOverflowGrid = List.of(1.2f); + private List addHierarchyGrid = List.of(true); + private List refineFinalGraphGrid = List.of(true); + private List> featureSets = List.of(EnumSet.of(FeatureId.INLINE_VECTORS)); + private List> buildCompressors = + List.of(__ -> CompressorParameters.NONE); + private List> searchCompressors = + List.of(__ -> CompressorParameters.NONE); + private Map> topKOverqueryGrid = Map.of(10, List.of(1.0)); + private List usePruningGrid = List.of(true); + private Map> benchmarkSpec = null; // null means use default benchmarks + private boolean collectResults = false; + + /** + * Sets the dataset name. + * + * @param datasetName the dataset name to associate with this configuration + * @return this builder for method chaining + */ + public Builder withDatasetName(String datasetName) { + this.datasetName = datasetName; + return this; + } + + /** + * Sets the grid of M (max connections per node) values to test. + * + * @param mGrid list of M values, defensively copied + * @return this builder for method chaining + */ + public Builder withMGrid(List mGrid) { + this.mGrid = new ArrayList<>(mGrid); + return this; + } + + /** + * Sets the grid of efConstruction values to test during graph construction. + * + * @param efConstructionGrid list of efConstruction values, defensively copied + * @return this builder for method chaining + */ + public Builder withEfConstructionGrid(List efConstructionGrid) { + this.efConstructionGrid = new ArrayList<>(efConstructionGrid); + return this; + } + + /** + * Sets the grid of neighbor overflow multipliers to test. 
+ * + * @param neighborOverflowGrid list of overflow multipliers, defensively copied + * @return this builder for method chaining + */ + public Builder withNeighborOverflowGrid(List neighborOverflowGrid) { + this.neighborOverflowGrid = new ArrayList<>(neighborOverflowGrid); + return this; + } + + /** + * Sets the grid of add hierarchy boolean values. + * + * @param addHierarchyGrid list of boolean values, defensively copied + * @return this builder for method chaining + */ + public Builder withAddHierarchyGrid(List addHierarchyGrid) { + this.addHierarchyGrid = new ArrayList<>(addHierarchyGrid); + return this; + } + + /** + * Sets the grid of refine final graph boolean values. + * + * @param refineFinalGraphGrid list of boolean values, defensively copied + * @return this builder for method chaining + */ + public Builder withRefineFinalGraphGrid(List refineFinalGraphGrid) { + this.refineFinalGraphGrid = new ArrayList<>(refineFinalGraphGrid); + return this; + } + + /** + * Sets the feature sets to test. + * + * @param featureSets list of feature sets, defensively copied + * @return this builder for method chaining + */ + public Builder withFeatureSets(List> featureSets) { + this.featureSets = new ArrayList<>(featureSets); + return this; + } + + /** + * Sets the compressor functions to use during graph construction. + * + * @param buildCompressors list of compressor parameter functions, defensively copied + * @return this builder for method chaining + */ + public Builder withBuildCompressors(List> buildCompressors) { + this.buildCompressors = new ArrayList<>(buildCompressors); + return this; + } + + /** + * Sets the compressor functions to use during search. 
+ * + * @param searchCompressors list of compressor parameter functions, defensively copied + * @return this builder for method chaining + */ + public Builder withSearchCompressors(List> searchCompressors) { + this.searchCompressors = new ArrayList<>(searchCompressors); + return this; + } + + /** + * Sets the grid of topK overquery multipliers. + * + * @param topKOverqueryGrid map of K values to overquery multipliers, defensively copied + * @return this builder for method chaining + */ + public Builder withTopKOverqueryGrid(Map> topKOverqueryGrid) { + this.topKOverqueryGrid = new HashMap<>(topKOverqueryGrid); + return this; + } + + /** + * Sets the grid of use pruning boolean values. + * + * @param usePruningGrid list of boolean values, defensively copied + * @return this builder for method chaining + */ + public Builder withUsePruningGrid(List usePruningGrid) { + this.usePruningGrid = new ArrayList<>(usePruningGrid); + return this; + } + + /** + * Sets the benchmark specification. A null value indicates default benchmarks should be used. + * + * @param benchmarkSpec map of benchmark specifications, defensively copied if not null + * @return this builder for method chaining + */ + public Builder withBenchmarkSpec(Map> benchmarkSpec) { + this.benchmarkSpec = benchmarkSpec == null ? null : new HashMap<>(benchmarkSpec); + return this; + } + + /** + * Sets whether to collect results. + * + * @param collectResults true to collect results, false otherwise + * @return this builder for method chaining + */ + public Builder collectResults(boolean collectResults) { + this.collectResults = collectResults; + return this; + } + + /** + * Builds and returns a configured BenchFrameConfig instance with immutable collections. 
+ * + * @return a new BenchFrameConfig with the configured values + */ + public BenchFrameConfig build() { + return new BenchFrameConfig(this); + } + } +} diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchResult.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchResult.java new file mode 100644 index 000000000..6abc0352f --- /dev/null +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchResult.java @@ -0,0 +1,84 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.github.jbellis.jvector.benchframe; + +import java.util.Map; + +/** + * Result model for a single benchmark execution. Encapsulates the dataset identifier, + * configuration parameters, and performance metrics from a benchmark run. + *

+ * This class is designed for serialization to JSON and CSV formats through {@link ResultHandler} + * implementations. All fields are public for compatibility with Jackson and other serialization + * libraries. + *

+ * Typical parameter keys include: + *

    + *
  • {@code M} - max connections per node
  • + *
  • {@code efConstruction} - construction-time search depth
  • + *
  • {@code buildCompressor} - compression used during construction
  • + *
  • {@code searchCompressor} - compression used during search
  • + *
  • {@code featureSet} - enabled feature flags
  • + *
+ *

+ * Typical metric keys include: + *

    + *
  • {@code recall} - search accuracy (0.0 to 1.0)
  • + *
  • {@code qps} - queries per second
  • + *
  • {@code latency} - average query latency in milliseconds
  • + *
  • {@code buildTimeMs} - index construction time in milliseconds
  • + *
  • {@code indexSizeBytes} - on-disk index size in bytes
  • + *
+ * + * @see ResultHandler + * @see BenchFrame + */ +public class BenchResult { + /** + * The name of the dataset this result is for. + */ + public String dataset; + + /** + * Map of configuration parameters used for this benchmark run. + * Keys are parameter names, values are parameter values (typically String, Integer, Boolean, etc.). + */ + public Map parameters; + + /** + * Map of performance metrics measured during this benchmark run. + * Keys are metric names, values are metric values (typically Double, Long, Integer, etc.). + */ + public Map metrics; + + /** + * Default constructor for deserialization. + */ + public BenchResult() {} + + /** + * Constructs a BenchResult with the specified dataset, parameters, and metrics. + * + * @param dataset the dataset name + * @param parameters map of configuration parameters + * @param metrics map of performance metrics + */ + public BenchResult(String dataset, Map parameters, Map metrics) { + this.dataset = dataset; + this.parameters = parameters; + this.metrics = metrics; + } +} diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/CheckpointStrategy.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/CheckpointStrategy.java new file mode 100644 index 000000000..f99f722da --- /dev/null +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/CheckpointStrategy.java @@ -0,0 +1,177 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.github.jbellis.jvector.benchframe; + +import io.github.jbellis.jvector.example.util.CheckpointManager; + +import java.util.Collections; +import java.util.List; + +/** + * Strategy interface for managing benchmark checkpointing. Implements the Strategy pattern + * to enable resumable benchmark execution after failures or interruptions. + *

+ * Checkpointing is particularly valuable for long-running benchmarks in CI/CD environments + * where resource limits or transient failures may interrupt execution. By tracking which + * datasets have been completed, benchmarks can resume from where they left off. + *

+ * Two implementations are provided: + *

    + *
  • {@link NoCheckpointing} - no-op implementation for simple scenarios
  • + *
  • {@link FileCheckpointing} - persistent file-based checkpointing using JSON
  • + *
+ * + *

Usage Example

+ *
{@code
+ * // No checkpointing (default)
+ * CheckpointStrategy strategy = CheckpointStrategy.none();
+ *
+ * // File-based checkpointing
+ * CheckpointStrategy strategy = CheckpointStrategy.fileBasedCheckpointing("results/checkpoint");
+ *
+ * // Custom implementation
+ * CheckpointStrategy strategy = new CheckpointStrategy() {
+ *     @Override
+ *     public boolean shouldSkipDataset(String datasetName) {
+ *         // Check database or cache
+ *         return completedDatasets.contains(datasetName);
+ *     }
+ *
+ *     @Override
+ *     public void recordCompletion(String datasetName, List<BenchResult> results) {
+ *         // Update database or cache
+ *         completedDatasets.add(datasetName);
+ *     }
+ *
+ *     @Override
+ *     public List<BenchResult> getPreviousResults() {
+ *         // Load from database or cache
+ *         return loadPreviousResults();
+ *     }
+ * };
+ * }
+ * + * @see BenchFrame.Builder#withCheckpointStrategy(CheckpointStrategy) + * @see BenchResult + */ +public interface CheckpointStrategy { + /** + * Checks if a dataset should be skipped because it has already been completed. + * This is called before attempting to benchmark each dataset. + * + * @param datasetName the name of the dataset to check + * @return true if the dataset has already been completed and should be skipped, false otherwise + */ + boolean shouldSkipDataset(String datasetName); + + /** + * Records the completion of a dataset with its results. This is called after successfully + * benchmarking a dataset. Implementations should persist this information to enable resumption. + * + * @param datasetName the name of the completed dataset + * @param results the benchmark results for this dataset + */ + void recordCompletion(String datasetName, List results); + + /** + * Retrieves any previously completed results from earlier runs. These results are included + * in the final output to provide a complete view across multiple executions. + * + * @return list of results from previous runs, or empty list if none exist + */ + List getPreviousResults(); + + /** + * Creates a no-op checkpoint strategy that does not track or resume progress. + * This is the default for simple benchmark scenarios. + * + * @return a checkpoint strategy that performs no checkpointing + */ + static CheckpointStrategy none() { + return new NoCheckpointing(); + } + + /** + * Creates a file-based checkpoint strategy that persists progress to JSON files. + * Creates files at {@code outputPath.checkpoint.json} containing completed dataset + * names and their results. 
+ * + * @param outputPath base path for checkpoint file (e.g., "results/benchmark") + * @return a checkpoint strategy using file-based persistence + * @see FileCheckpointing + */ + static CheckpointStrategy fileBasedCheckpointing(String outputPath) { + return new FileCheckpointing(outputPath); + } + + /** + * No-op implementation that performs no checkpointing. All datasets are processed + * on every run without tracking completion state. + */ + class NoCheckpointing implements CheckpointStrategy { + @Override + public boolean shouldSkipDataset(String datasetName) { + return false; + } + + @Override + public void recordCompletion(String datasetName, List results) { + // Do nothing + } + + @Override + public List getPreviousResults() { + return Collections.emptyList(); + } + } + + /** + * File-based implementation that uses {@link CheckpointManager} for persistent checkpointing. + * Stores checkpoint state in a JSON file at {@code outputPath.checkpoint.json}. + *

+ * The checkpoint file contains: + *

    + *
  • List of completed dataset names
  • + *
  • All benchmark results from completed datasets
  • + *
  • Timestamp of last update
  • + *
+ *

+ * On initialization, loads any existing checkpoint file to resume from previous runs. + */ + class FileCheckpointing implements CheckpointStrategy { + private final CheckpointManager manager; + + public FileCheckpointing(String outputPath) { + this.manager = new CheckpointManager(outputPath); + } + + @Override + public boolean shouldSkipDataset(String datasetName) { + return manager.isDatasetCompleted(datasetName); + } + + @Override + public void recordCompletion(String datasetName, List results) { + manager.markDatasetCompleted(datasetName, results); + } + + @Override + public List getPreviousResults() { + return manager.getCompletedResults(); + } + } +} diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/ResultHandler.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/ResultHandler.java new file mode 100644 index 000000000..da972b984 --- /dev/null +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/ResultHandler.java @@ -0,0 +1,201 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.github.jbellis.jvector.benchframe; + +import com.fasterxml.jackson.databind.ObjectMapper; +import io.github.jbellis.jvector.example.util.BenchmarkSummarizer; +import io.github.jbellis.jvector.example.util.BenchmarkSummarizer.SummaryStats; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.util.List; +import java.util.Map; + +/** + * Strategy interface for handling benchmark results after execution completes. + * Implements the Strategy pattern to decouple result handling from benchmark execution. + *

+ * This functional interface supports various output modes including: + *

    + *
  • Console-only output (Grid handles printing)
  • + *
  • File-based output (CSV summary and JSON details)
  • + *
  • Combined output to multiple destinations
  • + *
  • Custom implementations for specialized scenarios
  • + *
+ * + *

Usage Examples

+ *
{@code
+ * // Console only (default)
+ * ResultHandler handler = ResultHandler.consoleOnly();
+ *
+ * // Write to files
+ * ResultHandler handler = ResultHandler.toFiles("results/benchmark");
+ *
+ * // Combine multiple handlers
+ * ResultHandler handler = ResultHandler.combining(
+ *     ResultHandler.consoleOnly(),
+ *     ResultHandler.toFiles("results/benchmark")
+ * );
+ *
+ * // Custom implementation
+ * ResultHandler handler = results -> {
+ *     // Send to monitoring system
+ *     monitoringService.recordBenchmarks(results);
+ *     // Upload to cloud storage
+ *     cloudStorage.upload("benchmarks", results);
+ * };
+ * }
+ * + * @see BenchResult + * @see BenchFrame.Builder#withResultHandler(ResultHandler) + */ +@FunctionalInterface +public interface ResultHandler { + /** + * Handles the benchmark results after execution completes. Implementations may write + * to files, send to external systems, or perform other processing. + * + * @param results list of benchmark results to handle + * @throws IOException if output or I/O operations fail + */ + void handleResults(List results) throws IOException; + + /** + * Creates a no-op result handler that does nothing with results. Console output + * is already handled by {@link Grid} during benchmark execution. + * This matches the behavior of the original Bench.java and BenchYAML.java. + * + * @return a result handler that performs no additional output + */ + static ResultHandler consoleOnly() { + return results -> { + // Grid already printed results to console, nothing to do + }; + } + + /** + * Creates a result handler that writes results to CSV summary and JSON detail files. + * This matches the behavior of AutoBenchYAML.java. + *

+ * Files created: + *

    + *
  • {@code outputBasePath.csv} - CSV summary with aggregate statistics per dataset
  • + *
  • {@code outputBasePath.json} - JSON file with complete detailed results
  • + *
+ *

+ * The CSV file contains columns: dataset, QPS, QPS StdDev, Mean Latency, Recall@10, + * Index Construction Time. + * + * @param outputBasePath base path for output files (without extension) + * @return a result handler that writes to CSV and JSON files + * @see FileOutputHandler + */ + static ResultHandler toFiles(String outputBasePath) { + return new FileOutputHandler(outputBasePath); + } + + /** + * Implementation that writes benchmark results to CSV summary and JSON details files. + * Uses {@link BenchmarkSummarizer} to calculate aggregate statistics across multiple + * benchmark runs. + */ + class FileOutputHandler implements ResultHandler { + private static final Logger logger = LoggerFactory.getLogger(FileOutputHandler.class); + private final String outputBasePath; + + public FileOutputHandler(String outputBasePath) { + this.outputBasePath = outputBasePath; + } + + @Override + public void handleResults(List results) throws IOException { + if (results.isEmpty()) { + logger.warn("No results to write"); + return; + } + + // Calculate summary statistics + SummaryStats stats = BenchmarkSummarizer.summarize(results); + logger.info("Benchmark summary: {}", stats.toString()); + + // Write detailed results to JSON + File detailsFile = new File(outputBasePath + ".json"); + ObjectMapper mapper = new ObjectMapper(); + mapper.writerWithDefaultPrettyPrinter().writeValue(detailsFile, results); + logger.info("Detailed results written to {}", detailsFile.getAbsolutePath()); + + // Write summary to CSV + File csvFile = new File(outputBasePath + ".csv"); + writeCsvSummary(results, csvFile); + logger.info("Summary results written to {}", csvFile.getAbsolutePath()); + + // Verify files were created + if (csvFile.exists()) { + logger.info("CSV file size: {} bytes", csvFile.length()); + } else { + logger.error("Failed to create CSV file at {}", csvFile.getAbsolutePath()); + } + + if (detailsFile.exists()) { + logger.info("JSON file size: {} bytes", detailsFile.length()); + } else 
{ + logger.error("Failed to create JSON file at {}", detailsFile.getAbsolutePath()); + } + } + + private void writeCsvSummary(List results, File outputFile) throws IOException { + // Get summary statistics by dataset + Map statsByDataset = BenchmarkSummarizer.summarizeByDataset(results); + + try (FileWriter writer = new FileWriter(outputFile)) { + // Write CSV header + writer.write("dataset,QPS,QPS StdDev,Mean Latency,Recall@10,Index Construction Time\n"); + + // Write one row per dataset with average metrics + for (Map.Entry entry : statsByDataset.entrySet()) { + String dataset = entry.getKey(); + SummaryStats datasetStats = entry.getValue(); + + writer.write(dataset + ","); + writer.write(datasetStats.getAvgQps() + ","); + writer.write(datasetStats.getQpsStdDev() + ","); + writer.write(datasetStats.getAvgLatency() + ","); + writer.write(datasetStats.getAvgRecall() + ","); + writer.write(datasetStats.getIndexConstruction() + "\n"); + } + } + } + } + + /** + * Creates a result handler that delegates to multiple handlers in sequence. + * If any handler throws an exception, subsequent handlers are not called. + * + * @param handlers the handlers to combine + * @return a result handler that invokes all provided handlers + */ + static ResultHandler combining(ResultHandler... 
handlers) { + return results -> { + for (ResultHandler handler : handlers) { + handler.handleResults(results); + } + }; + } +} diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/TestDataViewWrapper.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/TestDataViewWrapper.java similarity index 98% rename from jvector-examples/src/main/java/io/github/jbellis/jvector/example/TestDataViewWrapper.java rename to jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/TestDataViewWrapper.java index 8ee51aca1..117fe9eda 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/TestDataViewWrapper.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/TestDataViewWrapper.java @@ -14,7 +14,7 @@ * limitations under the License. */ -package io.github.jbellis.jvector.example; +package io.github.jbellis.jvector.benchframe; import io.github.jbellis.jvector.example.util.DataSet; import io.github.jbellis.jvector.example.util.FloatVectorsWrapper; diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/package-info.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/package-info.java new file mode 100644 index 000000000..e303e3ae5 --- /dev/null +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/package-info.java @@ -0,0 +1,176 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Unified benchmark framework for JVector graph indexes. This package consolidates the functionality + * from the legacy benchmark classes (Bench, BenchYAML, AutoBenchYAML, Bench2D) into a modular, + * composable architecture using the Strategy pattern. + * + *

Usage

+ * To run the benchmark commands you are already used to, call one of these factory methods: + *
    + *
  • {@link io.github.jbellis.jvector.benchframe.BenchFrame#likeBench()}.execute(...)
  • + *
  • {@link io.github.jbellis.jvector.benchframe.BenchFrame#likeBenchYAML()}.execute(...)
  • + *
  • {@link io.github.jbellis.jvector.benchframe.BenchFrame#likeAutoBenchYAML(java.lang.String, int)}.execute(...)
  • + *
+ * + * The rest of this documentation is aimed at developers working on BenchFrame itself. + * + *

Package Overview

+ * The benchframe package provides a flexible framework for benchmarking JVector's approximate + * nearest neighbor search implementations. It supports multiple execution modes from simple + * interactive testing to complex CI/CD scenarios with checkpointing and automated result collection. + * + *

Core Components

+ * + *

Main Orchestrator

+ *
    + *
  • {@link io.github.jbellis.jvector.benchframe.BenchFrame} - Main orchestrator that coordinates + * benchmark execution using pluggable strategies
  • + *
  • {@link io.github.jbellis.jvector.benchframe.BenchFrameCLI} - Command-line interface providing + * subcommands for different benchmark modes
  • + *
+ * + *

Configuration

+ *
    + *
  • {@link io.github.jbellis.jvector.benchframe.BenchFrameConfig} - Immutable configuration class + * encapsulating all benchmark parameters
  • + *
  • {@link io.github.jbellis.jvector.benchframe.BenchFrameConfigSource} - Strategy interface for + * loading configurations from different sources (hardcoded, YAML, etc.)
  • + *
+ * + *

Result Handling

+ *
    + *
  • {@link io.github.jbellis.jvector.benchframe.BenchResult} - Result model encapsulating dataset, + * parameters, and metrics
  • + *
  • {@link io.github.jbellis.jvector.benchframe.ResultHandler} - Strategy interface for handling + * results (console, files, etc.)
  • + *
+ * + *

Checkpointing

+ *
    + *
  • {@link io.github.jbellis.jvector.benchframe.CheckpointStrategy} - Strategy interface for + * managing resumable benchmark execution
  • + *
+ * + *

Usage Patterns

+ * + *

Command-Line Usage

+ * The simplest way to use the framework is through the CLI: + *
+ * # Run with hardcoded parameters (Bench-style)
+ * java -jar benchframe.jar bench "dataset1|dataset2"
+ *
+ * # Run with YAML configuration (BenchYAML-style)
+ * java -jar benchframe.jar yaml
+ *
+ * # Run in CI/CD mode with checkpointing (AutoBenchYAML-style)
+ * java -jar benchframe.jar auto -o results/benchmark -d 2
+ *
+ * # Run with synthetic 2D data (Bench2D-style)
+ * java -jar benchframe.jar 2d -n 1000000 -q 10000
+ * 
+ * + *

Programmatic Usage - Factory Methods

+ * Factory methods provide pre-configured instances matching legacy behavior: + *
{@code
+ * // Bench-style: hardcoded defaults
+ * BenchFrame frame = BenchFrame.likeBench();
+ * frame.execute(new String[]{"dataset-name"});
+ *
+ * // BenchYAML-style: YAML configuration
+ * BenchFrame frame = BenchFrame.likeBenchYAML();
+ * frame.execute(new String[]{"dataset-name"});
+ *
+ * // AutoBenchYAML-style: CI/CD with checkpointing
+ * BenchFrame frame = BenchFrame.likeAutoBenchYAML("results/benchmark", 2);
+ * frame.execute(new String[]{"dataset-name"});
+ * }
+ * + *

Programmatic Usage - Custom Configuration

+ * The Builder API provides fine-grained control over all aspects: + *
{@code
+ * BenchFrame frame = new BenchFrame.Builder()
+ *     .withDatasetNames(List.of("dataset1", "dataset2"))
+ *     .withConfigurationProvider(BenchFrameConfigSource.fromYAML())
+ *     .withDataSetSource(DataSetSource.DEFAULT)
+ *     .withResultHandler(ResultHandler.toFiles("results/benchmark"))
+ *     .withCheckpointStrategy(CheckpointStrategy.fileBasedCheckpointing("results/checkpoint"))
+ *     .collectResults(true)
+ *     .withDiagnosticLevel(2)
+ *     .build();
+ *
+ * frame.execute(new String[]{".*"});
+ * }
+ * + *

Extension Points

+ * The framework is designed for extension through its strategy interfaces: + * + *

Custom Configuration Source

+ *
{@code
+ * BenchFrameConfigSource customSource = datasetName -> {
+ *     // Load from database, REST API, etc.
+ *     return new BenchFrameConfig.Builder()
+ *         .withDatasetName(datasetName)
+ *         .withMGrid(List.of(16, 32, 64))
+ *         .build();
+ * };
+ * }
+ * + *

Custom Result Handler

+ *
{@code
+ * ResultHandler customHandler = results -> {
+ *     // Send to monitoring system
+ *     monitoringSystem.record(results);
+ *     // Upload to cloud storage
+ *     cloudStorage.upload("benchmarks", results);
+ * };
+ * }
+ * + *

Custom Checkpoint Strategy

+ *
{@code
+ * CheckpointStrategy customStrategy = new CheckpointStrategy() {
+ *     public boolean shouldSkipDataset(String name) {
+ *         return database.isCompleted(name);
+ *     }
+ *     public void recordCompletion(String name, List<BenchResult> results) {
+ *         database.markCompleted(name, results);
+ *     }
+ *     public List<BenchResult> getPreviousResults() {
+ *         return database.loadPreviousResults();
+ *     }
+ * };
+ * }
+ * + *

Architecture Benefits

+ *
    + *
  • Modularity: Clean separation of concerns through strategy interfaces
  • + *
  • Composability: Mix and match strategies for different scenarios
  • + *
  • Testability: Easy to test components in isolation with mock strategies
  • + *
  • Extensibility: Add new strategies without modifying existing code
  • + *
  • Backward Compatibility: Factory methods preserve legacy behavior
  • + *
+ * + *

Thread Safety

+ * The framework components are generally not thread-safe and are designed for single-threaded + * benchmark execution. {@link io.github.jbellis.jvector.benchframe.BenchFrameConfig} instances + * are immutable and thread-safe once constructed. + * + * @see io.github.jbellis.jvector.benchframe.BenchFrame + * @see io.github.jbellis.jvector.benchframe.BenchFrameCLI + * @see io.github.jbellis.jvector.example.Grid + */ +package io.github.jbellis.jvector.benchframe; diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/AutoBenchYAML.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/AutoBenchYAML.java index 479ce331a..47582e227 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/AutoBenchYAML.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/AutoBenchYAML.java @@ -16,227 +16,90 @@ package io.github.jbellis.jvector.example; -import com.fasterxml.jackson.databind.ObjectMapper; -import io.github.jbellis.jvector.example.util.*; -import io.github.jbellis.jvector.example.util.BenchmarkSummarizer.SummaryStats; -import io.github.jbellis.jvector.example.yaml.ConstructionParameters; -import io.github.jbellis.jvector.example.yaml.MultiConfig; -import io.github.jbellis.jvector.example.yaml.SearchParameters; -import io.github.jbellis.jvector.graph.disk.feature.FeatureId; - -import io.nosqlbench.nbdatatools.api.concurrent.ProgressIndicator; -import io.nosqlbench.vectordata.discovery.TestDataSources; -import io.nosqlbench.vectordata.discovery.TestDataView; -import io.nosqlbench.vectordata.downloader.Catalog; -import io.nosqlbench.vectordata.downloader.DatasetEntry; -import org.jetbrains.annotations.NotNull; +import io.github.jbellis.jvector.benchframe.BenchFrame; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.File; -import java.io.FileWriter; import java.io.IOException; -import java.util.*; -import java.util.concurrent.CompletableFuture; -import 
java.util.regex.Pattern; -import java.util.stream.Collectors; +import java.util.Arrays; /** * Automated benchmark runner for GitHub Actions workflow. * This class is specifically designed to handle the --output argument * for regression testing in the run-bench.yml workflow. - * + * * The benchmark runner supports checkpointing to allow resuming from failures. * It creates a checkpoint file (outputPath + ".checkpoint.json") that records * which datasets have been fully processed. If the benchmark is restarted, * it will skip datasets that have already been processed, allowing it to * continue from where it left off rather than starting over from the beginning. + * + * This class has been refactored to use BenchFrame for modularity and DRY principles. + * All shared functionality is now in reusable modules. */ +@Deprecated public class AutoBenchYAML { private static final Logger logger = LoggerFactory.getLogger(AutoBenchYAML.class); - /** - * Returns a list of all dataset names. - * This replaces the need to load datasets.yml which may not be available in all environments. 
- */ - private static List getAllDatasetNames() { - List allDatasets = new ArrayList<>(); - allDatasets.add("cap-1M"); - allDatasets.add("cap-6M"); - allDatasets.add("cohere-english-v3-1M"); - allDatasets.add("cohere-english-v3-10M"); - allDatasets.add("dpr-1M"); - allDatasets.add("dpr-10M"); - - return allDatasets; - } - public static void main(String[] args) throws IOException { - // Check for --output argument (required for this class) - String outputPath = null; - for (int i = 0; i < args.length - 1; i++) { - if (args[i].equals("--output")) outputPath = args[i+1]; - } - + // Parse command-line arguments + String outputPath = extractArgument(args, "--output"); if (outputPath == null) { logger.error("Error: --output argument is required for AutoBenchYAML"); System.exit(1); } - logger.info("Heap space available is {}", Runtime.getRuntime().maxMemory()); - - // Initialize checkpoint manager - CheckpointManager checkpointManager = new CheckpointManager(outputPath); - logger.info("Initialized checkpoint manager. 
Already completed datasets: {}", checkpointManager.getCompletedDatasets()); - - // Filter out --output, --config and their arguments from the args - String finalOutputPath = outputPath; - String configPath = null; - int diagnostic_level = 0; - for (int i = 0; i < args.length - 1; i++) { - if (args[i].equals("--config")) configPath = args[i+1]; - if (args[i].equals("--diag")) diagnostic_level = Integer.parseInt(args[i+1]); - } - if (diagnostic_level > 0) { - Grid.setDiagnosticLevel(diagnostic_level); - } - String finalConfigPath = configPath; - String[] filteredArgs = Arrays.stream(args) - .filter(arg -> !arg.equals("--output") && !arg.equals(finalOutputPath) && - !arg.equals("--config") && !arg.equals(finalConfigPath)) - .toArray(String[]::new); + int diagnosticLevel = extractIntArgument(args, "--diag", 0); + String[] filteredArgs = filterArguments(args, "--output", outputPath, "--diag", String.valueOf(diagnosticLevel)); - // Log the filtered arguments for debugging + logger.info("Heap space available is {}", Runtime.getRuntime().maxMemory()); logger.info("Filtered arguments: {}", Arrays.toString(filteredArgs)); - // generate a regex that matches any regex in filteredArgs, or if filteredArgs is empty/null, match everything - var regex = filteredArgs.length == 0 ? 
".*" : Arrays.stream(filteredArgs).flatMap(s -> Arrays.stream(s.split("\\s"))).map(s -> "(?:" + s + ")").collect(Collectors.joining("|")); - logger.info("Generated regex pattern: {}", regex); - - // compile regex and do substring matching using find - var pattern = Pattern.compile(regex); - - var datasetNames = getAllDatasetNames().stream().filter(dn -> pattern.matcher(dn).find()).collect(Collectors.toList()); - - logger.info("Executing the following datasets: {}", datasetNames); - List results = new ArrayList<>(); - // Add results from checkpoint if present - results.addAll(checkpointManager.getCompletedResults()); - - Catalog testDataCatalog = new TestDataSources().configure() - .addOptionalCatalogs("~/.config/vectordata/catalogs.yaml") - .catalog(); - - DataSetSource datasetSource = DataSetSource.DEFAULT - .and(loadStreamingDataSource(testDataCatalog)); - - // Process datasets from regex patterns - if (!datasetNames.isEmpty()) { - for (var datasetName : datasetNames) { - // Skip already completed datasets - if (checkpointManager.isDatasetCompleted(datasetName)) { - logger.info("Skipping already completed dataset: {}", datasetName); - continue; - } - - logger.info("Loading dataset: {}", datasetName); - try { - DataSet ds = datasetSource.apply(datasetName) - .orElseThrow(() -> new RuntimeException("Unknown dataset: " + datasetName));; - logger.info("Dataset loaded: {} with {} vectors", datasetName, ds.getBaseVectors().size()); - - String normalizedDatasetName = datasetName; - if (normalizedDatasetName.endsWith(".hdf5")) { - normalizedDatasetName = normalizedDatasetName.substring(0, normalizedDatasetName.length() - ".hdf5".length()); - } - - MultiConfig config = MultiConfig.getDefaultConfig("autoDefault"); - config.dataset = normalizedDatasetName; - logger.info("Using configuration: {}", config); - - List datasetResults = Grid.runAllAndCollectResults(ds, - config.construction.outDegree, - config.construction.efConstruction, - config.construction.neighborOverflow, - 
config.construction.addHierarchy, - config.construction.getFeatureSets(), - config.construction.getCompressorParameters(), - config.search.getCompressorParameters(), - config.search.topKOverquery, - config.search.useSearchPruning); - results.addAll(datasetResults); + // Execute benchmark using convenience method + BenchFrame.likeAutoBenchYAML(outputPath, diagnosticLevel).execute(filteredArgs); + } - logger.info("Benchmark completed for dataset: {}", datasetName); - // Mark dataset as completed and update checkpoint, passing results - checkpointManager.markDatasetCompleted(datasetName, datasetResults); - } catch (Exception e) { - logger.error("Exception while processing dataset {}", datasetName, e); - } + /** + * Extract a string argument value from command-line args + */ + private static String extractArgument(String[] args, String flag) { + for (int i = 0; i < args.length - 1; i++) { + if (args[i].equals(flag)) { + return args[i + 1]; } } + return null; + } - // Calculate summary statistics + /** + * Extract an integer argument value from command-line args + */ + private static int extractIntArgument(String[] args, String flag, int defaultValue) { + String value = extractArgument(args, flag); + if (value == null) { + return defaultValue; + } try { - SummaryStats stats = BenchmarkSummarizer.summarize(results); - logger.info("Benchmark summary: {}", stats.toString()); - - // Write results to csv file and details to json - File detailsFile = new File(outputPath + ".json"); - ObjectMapper mapper = new ObjectMapper(); - mapper.writerWithDefaultPrettyPrinter().writeValue(detailsFile, results); - - File outputFile = new File(outputPath + ".csv"); - - // Get summary statistics by dataset - Map statsByDataset = BenchmarkSummarizer.summarizeByDataset(results); - - // Write CSV data - try (FileWriter writer = new FileWriter(outputFile)) { - // Write CSV header - writer.write("dataset,QPS,QPS StdDev,Mean Latency,Recall@10,Index Construction Time\n"); - - // Write one row per 
dataset with average metrics - for (Map.Entry entry : statsByDataset.entrySet()) { - String dataset = entry.getKey(); - SummaryStats datasetStats = entry.getValue(); - - writer.write(dataset + ","); - writer.write(datasetStats.getAvgQps() + ","); - writer.write(datasetStats.getQpsStdDev() + ","); - writer.write(datasetStats.getAvgLatency() + ","); - writer.write(datasetStats.getAvgRecall() + ","); - writer.write(datasetStats.getIndexConstruction() + "\n"); - } - } - - logger.info("Benchmark results written to {} (file exists: {})", outputPath, outputFile.exists()); - // Double check that the file was created and log its size - if (outputFile.exists()) { - logger.info("Output file size: {} bytes", outputFile.length()); - } else { - logger.error("Failed to create output file at {}", outputPath); - } - } catch (Exception e) { - logger.error("Exception during final processing", e); + return Integer.parseInt(value); + } catch (NumberFormatException e) { + logger.warn("Invalid integer value for {}: {}", flag, value); + return defaultValue; } } - @NotNull - public static DataSetSource loadStreamingDataSource(Catalog catalog) { - return name -> { - Optional dsentryOption = catalog.matchOne(name); - if (dsentryOption.isEmpty()) { return Optional.empty(); } - DatasetEntry dsentry = dsentryOption.orElseThrow(() -> new RuntimeException("Unknown dataset: " + name)); - TestDataView tdv = dsentry.select().profile(name); - System.out.println("prebuffering dataset (assumed performance oriented testing)"); - CompletableFuture statusFuture = tdv.getBaseVectors().orElseThrow().prebuffer(); - if (statusFuture instanceof ProgressIndicator) { - ((ProgressIndicator)statusFuture).monitorProgress(1000); - } - - TestDataViewWrapper tdw = new TestDataViewWrapper(tdv); - System.out.println("Loaded " + tdw.getName() + " from streaming source, with base vectors prebuffered"); - return Optional.of(tdw); - }; + /** + * Filter out specific arguments and their values from the args array + */ + 
private static String[] filterArguments(String[] args, String... toFilter) { + return Arrays.stream(args) + .filter(arg -> { + for (String filter : toFilter) { + if (arg.equals(filter)) { + return false; + } + } + return true; + }) + .toArray(String[]::new); } } diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java index 67fdba468..6b675acab 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java @@ -16,116 +16,22 @@ package io.github.jbellis.jvector.example; -import io.github.jbellis.jvector.example.util.CompressorParameters; -import io.github.jbellis.jvector.example.util.CompressorParameters.PQParameters; -import io.github.jbellis.jvector.example.util.DataSet; -import io.github.jbellis.jvector.example.util.DataSetSource; -import io.github.jbellis.jvector.example.yaml.DatasetCollection; -import io.github.jbellis.jvector.graph.disk.feature.FeatureId; -import io.github.jbellis.jvector.vector.VectorSimilarityFunction; -import io.nosqlbench.nbdatatools.api.concurrent.ProgressIndicator; -import io.nosqlbench.vectordata.discovery.TestDataSources; -import io.nosqlbench.vectordata.discovery.TestDataView; -import io.nosqlbench.vectordata.downloader.Catalog; -import io.nosqlbench.vectordata.downloader.DatasetEntry; -import io.nosqlbench.vectordata.spec.datasets.types.DatasetView; -import org.jetbrains.annotations.NotNull; +import io.github.jbellis.jvector.benchframe.BenchFrame; import java.io.IOException; -import java.util.*; -import java.util.concurrent.CompletableFuture; -import java.util.function.Function; -import java.util.regex.Pattern; -import java.util.stream.Collectors; - -import static io.github.jbellis.jvector.quantization.KMeansPlusPlusClusterer.UNWEIGHTED; /** - * Tests GraphIndexes against vectors from various datasets + * Tests 
GraphIndexes against vectors from various datasets using hardcoded grid parameters. + * + * This class has been refactored to use BenchFrame for modularity and DRY principles. + * All shared functionality is now in reusable modules. + * + * @deprecated Use {@link BenchFrame#likeBench()} directly instead. This class will be removed in a future release. */ +@Deprecated(forRemoval = true) public class Bench { public static void main(String[] args) throws IOException { System.out.println("Heap space available is " + Runtime.getRuntime().maxMemory()); - - var mGrid = List.of(32); // List.of(16, 24, 32, 48, 64, 96, 128); - var efConstructionGrid = List.of(100); // List.of(60, 80, 100, 120, 160, 200, 400, 600, 800); - var topKGrid = Map.of( - 10, // topK - List.of(1.0, 2.0, 5.0, 10.0), // oq - 100, // topK - List.of(1.0, 2.0) // oq - ); // rerankK = oq * topK - var neighborOverflowGrid = List.of(1.2f); // List.of(1.2f, 2.0f); - var addHierarchyGrid = List.of(true); // List.of(false, true); - var refineFinalGraphGrid = List.of(true); // List.of(false, true); - var usePruningGrid = List.of(true); // List.of(false, true); - List> buildCompression = Arrays.asList( - ds -> new PQParameters(ds.getDimension() / 8, - 256, - ds.getSimilarityFunction() == VectorSimilarityFunction.EUCLIDEAN, - UNWEIGHTED), - __ -> CompressorParameters.NONE - ); - List> searchCompression = Arrays.asList( - __ -> CompressorParameters.NONE, - // ds -> new CompressorParameters.BQParameters(), - ds -> new PQParameters(ds.getDimension() / 8, - 256, - ds.getSimilarityFunction() == VectorSimilarityFunction.EUCLIDEAN, - UNWEIGHTED) - ); - List> featureSets = Arrays.asList( - EnumSet.of(FeatureId.NVQ_VECTORS), -// EnumSet.of(FeatureId.NVQ_VECTORS, FeatureId.FUSED_ADC), - EnumSet.of(FeatureId.INLINE_VECTORS) - ); - - // args is list of regexes, possibly needing to be split by whitespace. 
- // generate a regex that matches any regex in args, or if args is empty/null, match everything - var regex = args.length == 0 ? ".*" : Arrays.stream(args).flatMap(s -> Arrays.stream(s.split("\\s"))).map(s -> "(?:" + s + ")").collect(Collectors.joining("|")); - // compile regex and do substring matching using find - var pattern = Pattern.compile(regex); - - execute(pattern, buildCompression, featureSets, searchCompression, mGrid, efConstructionGrid, neighborOverflowGrid, addHierarchyGrid, refineFinalGraphGrid, topKGrid, usePruningGrid); - } - - private static void execute(Pattern pattern, List> buildCompression, List> featureSets, List> compressionGrid, List mGrid, List efConstructionGrid, List neighborOverflowGrid, List addHierarchyGrid, List refineFinalGraphGrid, Map> topKGrid, List usePruningGrid) throws IOException { - - TestDataSources testDataSources = new TestDataSources().configure().addOptionalCatalogs("~/.config/jvector/catalogs.yaml"); - Catalog testDataCatalog = testDataSources.catalog(); - DataSetSource dsSource = DataSetSource.DEFAULT.and(loadStreamingDataSource(testDataCatalog)); - - var datasetCollection = DatasetCollection.load(); - var datasetNames = datasetCollection.getAll().stream().filter(dn -> pattern.matcher(dn).find()).collect(Collectors.toList()); - - System.out.println("Executing the following datasets: " + datasetNames); - - for (var datasetName : datasetNames) { - DataSet ds = - dsSource.apply(datasetName).orElseThrow(() -> new RuntimeException("Unknown dataset: " + datasetName)); - Grid.runAll(ds, mGrid, efConstructionGrid, neighborOverflowGrid, addHierarchyGrid, refineFinalGraphGrid, featureSets, buildCompression, compressionGrid, topKGrid, usePruningGrid); - } - } - - @NotNull - private static DataSetSource loadStreamingDataSource(Catalog catalog) { - return name -> { - Optional dsentryOption = catalog.matchOne(name); - if (dsentryOption.isEmpty()) { return Optional.empty(); } - DatasetEntry dsentry = dsentryOption.orElseThrow(() -> 
new RuntimeException("Unknown dataset: " + name)); - TestDataView tdv = dsentry.select().profile(name); - System.out.println("prebuffering dataset (assumed performance oriented testing)"); - CompletableFuture statusFuture = tdv.getBaseVectors().orElseThrow().prebuffer(); - if (statusFuture instanceof ProgressIndicator) { - ((ProgressIndicator)statusFuture).monitorProgress(1000); - } -// tdv.getQueryVectors().orElseThrow().prebuffer(); -// tdv.getNeighborIndices().orElseThrow().prebuffer(); -// tdv.getNeighborDistances().map(DatasetView::prebuffer); - - TestDataViewWrapper tdw = new TestDataViewWrapper(tdv); - System.out.println("Loaded " + tdw.getName() + " from streaming source."); - return Optional.of(tdw); - }; + BenchFrame.likeBench().execute(args); } } diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench2D.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench2D.java deleted file mode 100644 index dc639f5ea..000000000 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench2D.java +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright DataStax, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package io.github.jbellis.jvector.example; - -import io.github.jbellis.jvector.example.util.*; -import io.github.jbellis.jvector.example.util.CompressorParameters.PQParameters; -import io.github.jbellis.jvector.graph.disk.feature.FeatureId; - -import java.io.IOException; -import java.util.Arrays; -import java.util.EnumSet; -import java.util.List; -import java.util.Map; -import java.util.function.Function; - -import static io.github.jbellis.jvector.quantization.KMeansPlusPlusClusterer.UNWEIGHTED; - -/** - * Tests GraphIndexes against vectors from a 2D dataset - */ -public class Bench2D { - public static void main(String[] args) throws IOException { - System.out.println("Heap space available is " + Runtime.getRuntime().maxMemory()); - - var mGrid = List.of(32); // List.of(16, 24, 32, 48, 64, 96, 128); - var efConstructionGrid = List.of(100); // List.of(60, 80, 100, 120, 160, 200, 400, 600, 800); - var topKGrid = Map.of( - 10, // topK - List.of(1.0, 2.0, 5.0, 10.0, 20.0) // oq - ); // rerankK = oq * topK - var neighborOverflowGrid = List.of(1.2f); // List.of(1.2f, 2.0f); - var addHierarchyGrid = List.of(true); // List.of(false, true); - var refineFinalGraphGrid = List.of(true); // List.of(false, true); - var usePruningGrid = List.of(false); // List.of(false, true); - List> buildCompression = Arrays.asList(__ -> CompressorParameters.NONE); - List> searchCompression = Arrays.asList( - __ -> CompressorParameters.NONE, - ds -> new PQParameters(ds.getDimension(), 256, true, UNWEIGHTED) - ); - List> featureSets = Arrays.asList( - EnumSet.of(FeatureId.NVQ_VECTORS), - EnumSet.of(FeatureId.INLINE_VECTORS) - ); - - // 2D grid, built and calculated at runtime - var grid2d = DataSetCreator.create2DGrid(4_000_000, 10_000, 100); - - Grid.runAll(grid2d, mGrid, efConstructionGrid, neighborOverflowGrid, addHierarchyGrid, refineFinalGraphGrid, - featureSets, buildCompression, searchCompression, topKGrid, usePruningGrid); - } -} diff --git 
a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchResult.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchResult.java deleted file mode 100644 index 5eeeff736..000000000 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchResult.java +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright DataStax, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package io.github.jbellis.jvector.example; - -import java.util.Map; - -public class BenchResult { - public String dataset; - public Map parameters; - public Map metrics; - - public BenchResult() {} - public BenchResult(String dataset, Map parameters, Map metrics) { - this.dataset = dataset; - this.parameters = parameters; - this.metrics = metrics; - } -} diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java index d44f916ee..ab2c5991b 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java @@ -16,90 +16,22 @@ package io.github.jbellis.jvector.example; -import io.github.jbellis.jvector.example.util.DataSet; -import io.github.jbellis.jvector.example.util.DataSetLoader; -import io.github.jbellis.jvector.example.util.DataSetSource; -import io.github.jbellis.jvector.example.yaml.DatasetCollection; -import 
io.github.jbellis.jvector.example.yaml.MultiConfig; -import io.nosqlbench.vectordata.discovery.TestDataSources; -import io.nosqlbench.vectordata.downloader.Catalog; +import io.github.jbellis.jvector.benchframe.BenchFrame; import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.regex.Pattern; -import java.util.stream.Collectors; /** - * Tests GraphIndexes against vectors from various datasets + * Tests GraphIndexes against vectors from various datasets using YAML-based configuration. + * + * This class has been refactored to use BenchFrame for modularity and DRY principles. + * All shared functionality is now in reusable modules. + * + * @deprecated Use {@link BenchFrame#likeBenchYAML()} directly instead. This class will be removed in a future release. */ +@Deprecated(forRemoval = true) public class BenchYAML { public static void main(String[] args) throws IOException { - // args is one of: - // - a list of regexes, possibly needing to be split by whitespace. - // - a list of YAML files - System.out.println("Heap space available is " + Runtime.getRuntime().maxMemory()); - - // generate a regex that matches any regex in args, or if args is empty/null, match everything - var regex = args.length == 0 ? 
".*" : Arrays.stream(args).flatMap(s -> Arrays.stream(s.split("\\s"))).map(s -> "(?:" + s + ")").collect(Collectors.joining("|")); - // compile regex and do substring matching using find - var pattern = Pattern.compile(regex); - - var datasetCollection = DatasetCollection.load(); - Catalog testDataCatalog = new TestDataSources().configure() - .addOptionalCatalogs("~/.config/vectordata/catalogs.yaml") - .catalog(); - DataSetSource datasetSource = DataSetSource.DEFAULT - .and(AutoBenchYAML.loadStreamingDataSource(testDataCatalog)); - - var datasetNames = datasetCollection.getAll().stream().filter(dn -> pattern.matcher(dn).find()).collect(Collectors.toList()); - - List allConfigs = new ArrayList<>(); - - if (!datasetNames.isEmpty()) { - System.out.println("Executing the following datasets: " + datasetNames); - - for (var datasetName : datasetNames) { - String finalDatasetName = datasetName; - - DataSet ds = datasetSource.apply(datasetName) - .orElseThrow(() -> new IllegalArgumentException( - "Unknown dataset: " + finalDatasetName)); - // DataSet ds = DataSetLoader.loadDataSet(datasetName); - - if (datasetName.endsWith(".hdf5")) { - datasetName = datasetName.substring(0, datasetName.length() - ".hdf5".length()); - } - MultiConfig config = MultiConfig.getDefaultConfig(datasetName); - allConfigs.add(config); - } - } - - // get the list of YAML files from args - List configNames = Arrays.stream(args).filter(s -> s.endsWith(".yml")).collect(Collectors.toList()); - - if (!configNames.isEmpty()) { - for (var configName : configNames) { - MultiConfig config = MultiConfig.getDefaultConfig(configName); - allConfigs.add(config); - } - } - - // Execute tests for all the mapped datasets and configs - - for (var config : allConfigs) { - final String datasetName = config.dataset; - - DataSet ds = datasetSource.apply(datasetName) - .orElseThrow(() -> new IllegalArgumentException( - "Unknown dataset: " + datasetName)); - - Grid.runAll(ds, config.construction.outDegree, 
config.construction.efConstruction, - config.construction.neighborOverflow, config.construction.addHierarchy, config.construction.refineFinalGraph, - config.construction.getFeatureSets(), config.construction.getCompressorParameters(), - config.search.getCompressorParameters(), config.search.topKOverquery, config.search.useSearchPruning); - } + BenchFrame.likeBenchYAML().execute(args); } } diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java index c7095668f..c3e756cb2 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java @@ -17,6 +17,7 @@ package io.github.jbellis.jvector.example; import io.github.jbellis.jvector.disk.ReaderSupplierFactory; +import io.github.jbellis.jvector.benchframe.BenchResult; import io.github.jbellis.jvector.example.benchmarks.AccuracyBenchmark; import io.github.jbellis.jvector.example.benchmarks.BenchmarkTablePrinter; import io.github.jbellis.jvector.example.benchmarks.CountBenchmark; @@ -87,7 +88,7 @@ public class Grid { private static int diagnostic_level; - static void runAll(DataSet ds, + public static void runAll(DataSet ds, List mGrid, List efConstructionGrid, List neighborOverflowGrid, diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/BenchmarkSummarizer.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/BenchmarkSummarizer.java index dba6064ab..88e406551 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/BenchmarkSummarizer.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/BenchmarkSummarizer.java @@ -15,7 +15,7 @@ */ package io.github.jbellis.jvector.example.util; -import io.github.jbellis.jvector.example.BenchResult; +import io.github.jbellis.jvector.benchframe.BenchResult; import 
java.util.List; import java.util.Map; diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/CheckpointManager.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/CheckpointManager.java index 4145100b2..d09347c5b 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/CheckpointManager.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/CheckpointManager.java @@ -16,6 +16,7 @@ package io.github.jbellis.jvector.example.util; import com.fasterxml.jackson.databind.ObjectMapper; +import io.github.jbellis.jvector.benchframe.BenchResult; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -35,7 +36,7 @@ public class CheckpointManager { private final String checkpointPath; private final ObjectMapper mapper; private final Set completedDatasets; - private final List completedResults; + private final List completedResults; /** * Creates a new CheckpointManager for the given output path. @@ -88,7 +89,7 @@ public boolean isDatasetCompleted(String datasetName) { * @param datasetName The name of the dataset * @param resultsForDataset The results for the dataset */ - public void markDatasetCompleted(String datasetName, List resultsForDataset) { + public void markDatasetCompleted(String datasetName, List resultsForDataset) { completedDatasets.add(datasetName); if (resultsForDataset != null) { completedResults.addAll(resultsForDataset); @@ -123,7 +124,7 @@ public Set getCompletedDatasets() { /** * Returns the list of completed BenchResults. 
*/ - public List getCompletedResults() { + public List getCompletedResults() { return new ArrayList<>(completedResults); } @@ -132,13 +133,13 @@ public List getCompletedResults() */ private static class CheckpointData { private List completedDatasets; - private List completedResults; + private List completedResults; public CheckpointData() { // Default constructor for Jackson } - public CheckpointData(List completedDatasets, List completedResults) { + public CheckpointData(List completedDatasets, List completedResults) { this.completedDatasets = completedDatasets; this.completedResults = completedResults; } @@ -151,11 +152,11 @@ public void setCompletedDatasets(List completedDatasets) { this.completedDatasets = completedDatasets; } - public List getCompletedResults() { + public List getCompletedResults() { return completedResults; } - public void setCompletedResults(List completedResults) { + public void setCompletedResults(List completedResults) { this.completedResults = completedResults; } } diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetLoader.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetLoader.java index 7b6eb4849..2761e903f 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetLoader.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetLoader.java @@ -16,10 +16,16 @@ package io.github.jbellis.jvector.example.util; -import io.jhdf.api.Dataset; +import io.github.jbellis.jvector.benchframe.TestDataViewWrapper; +import io.nosqlbench.nbdatatools.api.concurrent.ProgressIndicator; +import io.nosqlbench.vectordata.discovery.TestDataSources; +import io.nosqlbench.vectordata.discovery.TestDataView; +import io.nosqlbench.vectordata.downloader.Catalog; +import io.nosqlbench.vectordata.downloader.DatasetEntry; import java.io.IOException; import java.util.Optional; +import java.util.concurrent.CompletableFuture; import 
java.util.function.Function; public class DataSetLoader implements DataSetSource { @@ -61,4 +67,86 @@ public Optional apply(String name) { return Optional.empty(); } }; + + /** + * VectorData downloader that loads datasets from the vectordata catalog system. + * Supports optional additional catalogs via VECTORDATA_CATALOGS environment variable. + * + * Environment variable format: + * VECTORDATA_CATALOGS=~/.config/custom1/catalogs.yaml,~/.config/custom2/catalogs.yaml + */ + public static final DataSetSource vectorDataDownloader = new DataSetSource() { + private final Catalog catalog = initializeCatalog(); + + private Catalog initializeCatalog() { + TestDataSources sources = new TestDataSources().configure(); + + // Add additional catalogs from environment variable + String envCatalogs = System.getenv("VECTORDATA_CATALOGS"); + if (envCatalogs != null && !envCatalogs.trim().isEmpty()) { + String[] catalogPaths = envCatalogs.split(","); + for (String catalogPath : catalogPaths) { + String trimmedPath = catalogPath.trim(); + if (!trimmedPath.isEmpty()) { + System.out.println("Adding optional catalog from VECTORDATA_CATALOGS: " + trimmedPath); + sources.addOptionalCatalogs(trimmedPath); + } + } + } + + return sources.catalog(); + } + + @Override + public Optional apply(String name) { + name = name.contains(":") ? name : name + ":default"; + + TestDataView tdv = catalog.profile(name); + System.out.println("prebuffering dataset '" + name + "' (assumed performance oriented testing)"); + + CompletableFuture statusFuture = tdv.getBaseVectors().orElseThrow().prebuffer(); + if (statusFuture instanceof ProgressIndicator) { + ((ProgressIndicator) statusFuture).monitorProgress(1000); + } + + TestDataViewWrapper tdw = new TestDataViewWrapper(tdv); + System.out.println("Loaded " + tdw.getName() + " from streaming source"); + return Optional.of(tdw); + } + }; + + /** + * Creates a VectorDataDownloader with a specific catalog path. 
+ * Use this when you need a custom catalog location programmatically. + * For most use cases, prefer using the VECTORDATA_CATALOGS environment variable instead. + * + * @param catalogPath path to the catalog YAML file (e.g., "~/.config/vectordata/catalogs.yaml") + * @return a DataSetSource that can load from the specified catalog + */ + public static DataSetSource createVectorDataDownloader(String catalogPath) { + Catalog catalog = new TestDataSources() + .configure() + .addOptionalCatalogs(catalogPath) + .catalog(); + + return name -> { + Optional dsentryOption = catalog.matchOne(name); + if (dsentryOption.isEmpty()) { + return Optional.empty(); + } + + DatasetEntry dsentry = dsentryOption.get(); + TestDataView tdv = dsentry.select().profile(name); + + System.out.println("prebuffering dataset (assumed performance oriented testing)"); + CompletableFuture statusFuture = tdv.getBaseVectors().orElseThrow().prebuffer(); + if (statusFuture instanceof ProgressIndicator) { + ((ProgressIndicator) statusFuture).monitorProgress(1000); + } + + TestDataViewWrapper tdw = new TestDataViewWrapper(tdv); + System.out.println("Loaded " + tdw.getName() + " from streaming source"); + return Optional.of(tdw); + }; + } } diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetSource.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetSource.java index 9743e66ad..b8e2042c8 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetSource.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetSource.java @@ -20,7 +20,8 @@ import java.util.function.Function; public interface DataSetSource extends Function> { - public DataSetSource DEFAULT = new DataSetLoader(DataSetLoader.HDF5Loader, DataSetLoader.FVecsDownloader); + public DataSetSource DEFAULT = new DataSetLoader(DataSetLoader.HDF5Loader, DataSetLoader.FVecsDownloader, + DataSetLoader.vectorDataDownloader); 
public default DataSetSource and(DataSetSource... loaders) { return new DataSetSource() { diff --git a/jvector-examples/src/test/java/io/github/jbellis/jvector/example/util/BenchmarkSummarizerTest.java b/jvector-examples/src/test/java/io/github/jbellis/jvector/example/util/BenchmarkSummarizerTest.java index e52ce78b7..f71a2c64f 100644 --- a/jvector-examples/src/test/java/io/github/jbellis/jvector/example/util/BenchmarkSummarizerTest.java +++ b/jvector-examples/src/test/java/io/github/jbellis/jvector/example/util/BenchmarkSummarizerTest.java @@ -15,7 +15,7 @@ */ package io.github.jbellis.jvector.example.util; -import io.github.jbellis.jvector.example.BenchResult; +import io.github.jbellis.jvector.benchframe.BenchResult; import io.github.jbellis.jvector.example.util.BenchmarkSummarizer.SummaryStats; import org.junit.jupiter.api.Test; diff --git a/jvector-examples/src/test/java/io/github/jbellis/jvector/example/util/SummarizerTest.java b/jvector-examples/src/test/java/io/github/jbellis/jvector/example/util/SummarizerTest.java index 3dbf7f403..163840193 100644 --- a/jvector-examples/src/test/java/io/github/jbellis/jvector/example/util/SummarizerTest.java +++ b/jvector-examples/src/test/java/io/github/jbellis/jvector/example/util/SummarizerTest.java @@ -15,7 +15,7 @@ */ package io.github.jbellis.jvector.example.util; -import io.github.jbellis.jvector.example.BenchResult; +import io.github.jbellis.jvector.benchframe.BenchResult; import io.github.jbellis.jvector.example.util.BenchmarkSummarizer.SummaryStats; import java.util.ArrayList; From 94f7510f0bec4de25e88ae8f83bc1a390feda56b Mon Sep 17 00:00:00 2001 From: Jonathan Shook Date: Fri, 10 Oct 2025 15:16:11 -0500 Subject: [PATCH 24/29] gha workflow update for run-bench.yml --- .github/workflows/run-bench.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/run-bench.yml b/.github/workflows/run-bench.yml index a6dad0d90..610cd3ef1 100644 --- a/.github/workflows/run-bench.yml +++ 
b/.github/workflows/run-bench.yml @@ -129,7 +129,7 @@ jobs: env: DATASET_HASH: ${{ secrets.DATASETS_KEYPATH }} run: | - # Check if jvector-examples directory and AutoBenchYAML class exist + # Check if jvector-examples directory exists if [ ! -d "jvector-examples" ]; then echo "Warning: jvector-examples directory not found in branch ${{ matrix.branch }}. Skipping benchmark." exit 0 @@ -176,12 +176,12 @@ jobs: java ${{ matrix.jdk >= 20 && '--enable-native-access=ALL-UNNAMED --add-modules=jdk.incubator.vector' || '' }} \ ${{ matrix.jdk >= 22 && '-Djvector.experimental.enable_native_vectorization=true' || '' }} \ -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp/heap_dump/ -Xmx${HALF_MEM_GB}g \ - -cp jvector-examples/target/jvector-examples-*-jar-with-dependencies.jar io.github.jbellis.jvector.example.AutoBenchYAML --output ${SAFE_BRANCH}-bench-results dpr-1M + -cp jvector-examples/target/jvector-examples-*-jar-with-dependencies.jar io.github.jbellis.jvector.benchframe.BenchFrameCLI autobenchyaml --output ${SAFE_BRANCH}-bench-results dpr-1M else java ${{ matrix.jdk >= 20 && '--enable-native-access=ALL-UNNAMED --add-modules=jdk.incubator.vector' || '' }} \ ${{ matrix.jdk >= 22 && '-Djvector.experimental.enable_native_vectorization=true' || '' }} \ -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp/heap_dump/ -Xmx${HALF_MEM_GB}g \ - -cp jvector-examples/target/jvector-examples-*-jar-with-dependencies.jar io.github.jbellis.jvector.example.AutoBenchYAML --output ${SAFE_BRANCH}-bench-results${BENCH_SUFFIX:+ }${BENCH_ARG} + -cp jvector-examples/target/jvector-examples-*-jar-with-dependencies.jar io.github.jbellis.jvector.benchframe.BenchFrameCLI autobenchyaml --output ${SAFE_BRANCH}-bench-results${BENCH_SUFFIX:+ }${BENCH_ARG} fi # Move the results to the benchmark_results directory From 731769ddb50fc7207bc1473cdf4ad5821e4137c2 Mon Sep 17 00:00:00 2001 From: Jonathan Shook Date: Fri, 10 Oct 2025 16:11:03 -0500 Subject: [PATCH 25/29] BenchFrame consolidation, javadoc 
updates --- benchmarks-jmh/pom.xml | 15 +++ jvector-examples/pom.xml | 17 ++- .../jvector/benchframe/BenchFrame.java | 2 +- .../jvector/benchframe/BenchFrameConfig.java | 2 +- .../jvector/benchframe/ResultHandler.java | 2 +- .../jvector/benchframe/package-info.java | 110 +++++++++++++----- jvector-native/pom.xml | 11 ++ jvector-twenty/pom.xml | 11 ++ 8 files changed, 137 insertions(+), 33 deletions(-) diff --git a/benchmarks-jmh/pom.xml b/benchmarks-jmh/pom.xml index c82ee2707..78654edcc 100644 --- a/benchmarks-jmh/pom.xml +++ b/benchmarks-jmh/pom.xml @@ -94,6 +94,21 @@ + + org.apache.maven.plugins + maven-javadoc-plugin + + + --add-modules=jdk.incubator.vector + + 22 + false + true + + io.github.jbellis:* + + + \ No newline at end of file diff --git a/jvector-examples/pom.xml b/jvector-examples/pom.xml index ad01ae381..7c6fee70e 100644 --- a/jvector-examples/pom.xml +++ b/jvector-examples/pom.xml @@ -52,8 +52,21 @@
- - + + org.apache.maven.plugins + maven-javadoc-plugin + + + --add-modules=jdk.incubator.vector + + 22 + false + true + + io.github.jbellis:* + + + diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchFrame.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchFrame.java index 6dd956be5..693816acd 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchFrame.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchFrame.java @@ -184,7 +184,7 @@ public void execute(DataSet dataset) throws IOException { *
    *
  • Checking if dataset should be skipped (already completed in checkpoint)
  • *
  • Loading the dataset from the configured {@link DataSetSource}
  • - *
  • Loading configuration from the configured {@link BenchFrameConfigSource}
  • + *
  • Loading configuration (either shared config or per-dataset function)
  • *
  • Executing the benchmark
  • *
  • Recording completion in checkpoint if enabled
  • *
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchFrameConfig.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchFrameConfig.java index a3991d5b9..edfc556d6 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchFrameConfig.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchFrameConfig.java @@ -55,8 +55,8 @@ * BenchFrameConfig defaults = BenchFrameConfig.createBenchDefaults(); * } * - * @see BenchFrameConfigSource * @see MultiConfig + * @see BenchFrame */ public class BenchFrameConfig { // Dataset identification diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/ResultHandler.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/ResultHandler.java index da972b984..050681763 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/ResultHandler.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/ResultHandler.java @@ -79,7 +79,7 @@ public interface ResultHandler { /** * Creates a no-op result handler that does nothing with results. Console output - * is already handled by {@link Grid} during benchmark execution. + * is already handled by Grid during benchmark execution. * This matches the behavior of the original Bench.java and BenchYAML.java. * * @return a result handler that performs no additional output diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/package-info.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/package-info.java index e303e3ae5..d73b951e5 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/package-info.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/package-info.java @@ -16,18 +16,47 @@ /** * Unified benchmark framework for JVector graph indexes. 
This package consolidates the functionality - * from the legacy benchmark classes (Bench, BenchYAML, AutoBenchYAML, Bench2D) into a modular, - * composable architecture using the Strategy pattern. + * from the legacy benchmark classes (Bench, BenchYAML, AutoBenchYAML) into a modular, + * composable architecture using closures and strategy interfaces. * - *

Usage

- * If you are just wanting to run the bench commands you are used to, then you can do it this way: - *
    - *
  • {@link io.github.jbellis.jvector.benchframe.BenchFrame#likeBench()}.execute(...)
  • - *
  • {@link io.github.jbellis.jvector.benchframe.BenchFrame#likeBenchYAML()}.execute(...)
  • - *
  • {@link io.github.jbellis.jvector.benchframe.BenchFrame#likeAutoBenchYAML(java.lang.String, int)}.execute(...)
  • - *
+ *

Quick Start

* - * The rest of the docs here are more for development on the BenchFrame itself. + *

Command-Line Interface

+ * The recommended way to run benchmarks is via the CLI: + *
+ * # Run with hardcoded parameters (original Bench.java)
+ * benchframe bench dataset-name
+ *
+ * # Run with YAML configuration (original BenchYAML.java)
+ * benchframe benchyaml dataset-name
+ *
+ * # Run CI/CD mode with checkpointing (original AutoBenchYAML.java)
+ * benchframe autobenchyaml -o results/output dataset-name
+ *
+ * # List available datasets
+ * benchframe datasets
+ *
+ * # Access full nbvectors functionality
+ * benchframe nbvectors --help
+ * 
+ * + *

Programmatic Usage

+ * For library usage, use the convenience factory methods: + *
{@code
+ * // Hardcoded defaults (Bench-style)
+ * BenchFrame.likeBench().execute(args);
+ *
+ * // YAML configuration (BenchYAML-style)
+ * BenchFrame.likeBenchYAML().execute(args);
+ *
+ * // CI/CD with checkpointing (AutoBenchYAML-style)
+ * BenchFrame.likeAutoBenchYAML(outputPath, diagnosticLevel).execute(args);
+ * }
+ * + *
+ *

Developer Documentation

+ * + * The sections below provide detailed information for developers working on the BenchFrame itself. * *

Package Overview

* The benchframe package provides a flexible framework for benchmarking JVector's approximate @@ -47,9 +76,8 @@ *

Configuration

*
    *
  • {@link io.github.jbellis.jvector.benchframe.BenchFrameConfig} - Immutable configuration class - * encapsulating all benchmark parameters
  • - *
  • {@link io.github.jbellis.jvector.benchframe.BenchFrameConfigSource} - Strategy interface for - * loading configurations from different sources (hardcoded, YAML, etc.)
  • + * encapsulating all benchmark parameters. Can be used as a single shared config or via a + * Function for per-dataset configuration (e.g., YAML) *
* *

Result Handling

@@ -68,22 +96,42 @@ * *

Usage Patterns

* - *

Command-Line Usage

- * The simplest way to use the framework is through the CLI: + *

Available CLI Subcommands

+ *
    + *
  • bench - Run with hardcoded grid parameters (original Bench.java)
  • + *
  • benchyaml - Run with YAML-based configuration (original BenchYAML.java)
  • + *
  • autobenchyaml - CI/CD mode with checkpointing and file output (original AutoBenchYAML.java)
  • + *
  • datasets - List and manage vector datasets (delegates to nbvectors)
  • + *
  • nbvectors - Access full nbvectors CLI functionality
  • + *
+ * + *

CLI Examples

*
- * # Run with hardcoded parameters (Bench-style)
- * java -jar benchframe.jar bench "dataset1|dataset2"
+ * # Run with hardcoded parameters
+ * benchframe bench dataset-name
+ *
+ * # Run with YAML configuration
+ * benchframe benchyaml dataset-name
  *
- * # Run with YAML configuration (BenchYAML-style)
- * java -jar benchframe.jar yaml
+ * # Run CI/CD mode with checkpointing (--output required)
+ * benchframe autobenchyaml -o results/output dataset-name
+ * benchframe autobenchyaml -o results/output -d 2 cap-1M
  *
- * # Run in CI/CD mode with checkpointing (AutoBenchYAML-style)
- * java -jar benchframe.jar auto -o results/benchmark -d 2
+ * # List available datasets
+ * benchframe datasets
+ * benchframe datasets search cohere
  *
- * # Run with synthetic 2D data (Bench2D-style)
- * java -jar benchframe.jar 2d -n 1000000 -q 10000
+ * # Access nbvectors functionality
+ * benchframe nbvectors --help
+ * benchframe nbvectors catalogs list
  * 
* + *

Environment Variables

+ *
    + *
  • VECTORDATA_CATALOGS - Comma-separated list of additional catalog YAML files + * to load (e.g., "~/.config/custom/catalogs.yaml,~/work/catalogs.yaml")
  • + *
+ * *

Programmatic Usage - Factory Methods

* Factory methods provide pre-configured instances matching legacy behavior: *
{@code
@@ -103,25 +151,31 @@
  * 

Programmatic Usage - Custom Configuration

* The Builder API provides fine-grained control over all aspects: *
{@code
+ * // With a single shared config
  * BenchFrame frame = new BenchFrame.Builder()
  *     .withDatasetNames(List.of("dataset1", "dataset2"))
- *     .withConfigurationProvider(BenchFrameConfigSource.fromYAML())
+ *     .withConfig(BenchFrameConfig.createBenchDefaults())
  *     .withDataSetSource(DataSetSource.DEFAULT)
  *     .withResultHandler(ResultHandler.toFiles("results/benchmark"))
  *     .withCheckpointStrategy(CheckpointStrategy.fileBasedCheckpointing("results/checkpoint"))
- *     .collectResults(true)
  *     .withDiagnosticLevel(2)
  *     .build();
  *
+ * // With per-dataset config function (like YAML)
+ * BenchFrame frame = new BenchFrame.Builder()
+ *     .withDatasetNames(List.of("dataset1", "dataset2"))
+ *     .withConfigFunction(name -> loadCustomConfig(name))
+ *     .build();
+ *
  * frame.execute(new String[]{".*"});
  * }
* *

Extension Points

- * The framework is designed for extension through its strategy interfaces: + * The framework is designed for extension through closures and strategy interfaces: * - *

Custom Configuration Source

+ *

Custom Configuration Function

*
{@code
- * BenchFrameConfigSource customSource = datasetName -> {
+ * Function<String, BenchFrameConfig> customConfigFn = datasetName -> {
  *     // Load from database, REST API, etc.
  *     return new BenchFrameConfig.Builder()
  *         .withDatasetName(datasetName)
diff --git a/jvector-native/pom.xml b/jvector-native/pom.xml
index daf84fe6a..130e19d48 100644
--- a/jvector-native/pom.xml
+++ b/jvector-native/pom.xml
@@ -49,6 +49,17 @@
                     
                 
             
+            
+                org.apache.maven.plugins
+                maven-javadoc-plugin
+                
+                    
+                        --add-modules=jdk.incubator.vector
+                    
+                    22
+                    false
+                
+            
         
     
     
diff --git a/jvector-twenty/pom.xml b/jvector-twenty/pom.xml
index ae6aa659b..53f81ecb9 100644
--- a/jvector-twenty/pom.xml
+++ b/jvector-twenty/pom.xml
@@ -39,6 +39,17 @@
                     
                 
             
+            
+                org.apache.maven.plugins
+                maven-javadoc-plugin
+                
+                    
+                        --add-modules=jdk.incubator.vector
+                    
+                    22
+                    false
+                
+            
 
         
 

From 8a714f5373c5071b40031237f3370109c49a1a65 Mon Sep 17 00:00:00 2001
From: Jonathan Shook 
Date: Fri, 10 Oct 2025 16:12:12 -0500
Subject: [PATCH 26/29] remove stale testing classes

---
 .../jvector/example/testrig/BenchHarness.java | 119 ------------------
 .../example/testrig/commands/Bench_CMD.java   | 104 ---------------
 .../example/testrig/commands/Run_CMD.java     | 103 ---------------
 .../example/testrig/commands/TestRig_CMD.java |  40 ------
 4 files changed, 366 deletions(-)
 delete mode 100644 jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/BenchHarness.java
 delete mode 100644 jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/Bench_CMD.java
 delete mode 100644 jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/Run_CMD.java
 delete mode 100644 jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/TestRig_CMD.java

diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/BenchHarness.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/BenchHarness.java
deleted file mode 100644
index 1fb786969..000000000
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/BenchHarness.java
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Copyright DataStax, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package io.github.jbellis.jvector.example.testrig;
-
-
-import io.nosqlbench.nbdatatools.api.concurrent.ProgressIndicator;
-import io.nosqlbench.vectordata.discovery.TestDataView;
-import io.nosqlbench.vectordata.downloader.DatasetEntry;
-import io.nosqlbench.vectordata.spec.datasets.types.BaseVectors;
-
-import java.util.Arrays;
-import java.util.concurrent.*;
-
-public class BenchHarness implements Runnable {
-
-  private final DatasetEntry datasetEntry;
-  private final String profile;
-  private final int concurrency;
-  private final ExecutorService virtualThreadExecutor;
-  private final Semaphore semaphore;
-
-  public BenchHarness(
-      io.nosqlbench.vectordata.downloader.DatasetEntry datasetEntry,
-      String profile
-  )
-  {
-    this(datasetEntry, profile, 1);
-  }
-
-  public BenchHarness(
-      io.nosqlbench.vectordata.downloader.DatasetEntry datasetEntry,
-      String profile,
-      int concurrency
-  )
-  {
-    this.datasetEntry = datasetEntry;
-    this.profile = profile;
-    this.concurrency = concurrency;
-    this.virtualThreadExecutor = Executors.newCachedThreadPool();
-    this.semaphore = new Semaphore(concurrency);
-  }
-
-  @Override
-  public void run() {
-    TestDataView testDataView = datasetEntry.select().profile(profile);
-    smokeTestDataLoad(testDataView);
-  }
-
-  private void smokeTestDataLoad(TestDataView testDataView) {
-      BaseVectors bv = testDataView.getBaseVectors().orElseThrow();
-
-      System.out.println("Prebuffering...");
-      CompletableFuture prebuffer = bv.prebuffer();
-      if (prebuffer instanceof ProgressIndicator) {
-          ((ProgressIndicator)prebuffer).monitorProgress(1000);
-      }
-      prebuffer.join();
-      System.out.println("Prebuffered");
-
-      float[] v1 = bv.get(1);
-      System.out.println(Arrays.toString(v1));
-
-    float[] vend = bv.get(bv.getCount() - 1);
-    System.out.println(Arrays.toString(vend));
-
-    /// Create tasks for processing vectors concurrently
-    CompletableFuture[] futures = new CompletableFuture[100];
-
-    for (int i = 0; i < 100; i++) {
-      final int index = i;
-      futures[i] = CompletableFuture.runAsync(() -> {
-        try {
-          semaphore.acquire();
-      try {
-            /// This will be a stepping through the space of vectors
-            int idx = (int) ((float)index / 100 * bv.getCount());
-            float[] v = bv.get(idx);
-            System.out.println(Arrays.toString(v));
-          } finally {
-            semaphore.release();
-          }
-        } catch (InterruptedException e) {
-          Thread.currentThread().interrupt();
-          throw new RuntimeException(e);
-        }
-      }, virtualThreadExecutor);
-    }
-
-    /// Wait for all tasks to complete
-    CompletableFuture.allOf(futures).join();
-
-    /// Shutdown the executor
-    virtualThreadExecutor.shutdown();
-    try {
-      if (!virtualThreadExecutor.awaitTermination(60, TimeUnit.SECONDS)) {
-        virtualThreadExecutor.shutdownNow();
-      }
-    } catch (InterruptedException e) {
-      virtualThreadExecutor.shutdownNow();
-      Thread.currentThread().interrupt();
-    }
-  }
-
-
-}
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/Bench_CMD.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/Bench_CMD.java
deleted file mode 100644
index 2665f46a6..000000000
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/Bench_CMD.java
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Copyright DataStax, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package io.github.jbellis.jvector.example.testrig.commands;
-
-import io.github.jbellis.jvector.example.testrig.BenchHarness;
-import io.nosqlbench.vectordata.VectorTestData;
-import io.nosqlbench.vectordata.discovery.TestDataSources;
-import io.nosqlbench.vectordata.downloader.Catalog;
-import io.nosqlbench.vectordata.downloader.DatasetEntry;
-import picocli.CommandLine;
-
-import java.net.URL;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.concurrent.Callable;
-import java.util.stream.Collectors;
-
-@CommandLine.Command(name = "bench", description = "run example benchmarks")
-public class Bench_CMD implements Callable {
-
-  public static class ExpanderExample implements Iterable {
-    @Override
-    public java.util.Iterator iterator() {
-      return VectorTestData.catalogs().catalog().datasets().stream().map(DatasetEntry::name)
-          .collect(Collectors.toList()).iterator();
-    }
-  }
-
-  @CommandLine.Option(names = {"--catalog"},
-      description = "A directory, remote url, or other catalog container")
-  List catalogs = new ArrayList<>();
-
-    @CommandLine.Option(names = {"--optional-catalog"},
-            description = "A configuration file to use",
-            split = ",",
-            defaultValue = "~/.config/jvector/catalogs.yaml,~/.config/vectordata/catalogs.yaml")
-    List optionalCatalogs;
-
-  @CommandLine.Option(names = {"-d", "--dataset"},
-      description = "Dataset to use",
-      completionCandidates = ExpanderExample.class)
-  private List dsnames;
-
-  @CommandLine.Option(names = {"-p", "--profile"},
-      description = "Profile to use",
-      defaultValue = "default")
-  private String profile = "default";
-
-  @CommandLine.Option(names = {"-c", "--concurrency"},
-      description = "Number of concurrent threads",
-      defaultValue = "1")
-  private int concurrency = 1;
-
-  public static void main(String[] args) {
-    Bench_CMD command = new Bench_CMD();
-    CommandLine commandLine = new CommandLine(command).setCaseInsensitiveEnumValuesAllowed(true)
-        .setOptionsCaseInsensitive(true);
-    int exitCode = commandLine.execute(args);
-    System.exit(exitCode);
-  }
-
-  @Override
-  public Integer call() throws Exception {
-    System.out.println("Test Rig run with datasets: " + dsnames);
-    for (String dsname : dsnames) {
-      String[] nameparts = dsname.split(":+", 2);
-      String _name = null, _profile = this.profile;
-      switch (nameparts.length) {
-        case 2:
-          _profile = nameparts[1];
-        case 1:
-          _name = nameparts[0];
-          break;
-      }
-
-      System.out.println("Using testdata source " + _name);
-      System.out.println("Using profile " + _profile);
-
-      Catalog catalog = new TestDataSources().addOptionalCatalogs(optionalCatalogs).catalog();
-      DatasetEntry ds = catalog.
-              findExact(_name).orElseThrow();
-      //          VectorTestData.catalogs().catalog().findExact(_name).orElseThrow();
-      BenchHarness harness = new BenchHarness(ds, _profile, concurrency);
-
-      harness.run();
-
-    }
-    return 0;
-  }
-}
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/Run_CMD.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/Run_CMD.java
deleted file mode 100644
index 7597da8f6..000000000
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/Run_CMD.java
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright DataStax, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package io.github.jbellis.jvector.example.testrig.commands;
-
-import io.github.jbellis.jvector.example.testrig.BenchHarness;
-import io.nosqlbench.vectordata.VectorTestData;
-import io.nosqlbench.vectordata.discovery.TestDataSources;
-import io.nosqlbench.vectordata.downloader.Catalog;
-import io.nosqlbench.vectordata.downloader.DatasetEntry;
-import picocli.CommandLine;
-
-import java.net.URL;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.concurrent.Callable;
-import java.util.stream.Collectors;
-
-@CommandLine.Command(name = "run", description = "Run a testrig command")
-public class Run_CMD implements Callable {
-
-  public static class ExpanderExample implements Iterable {
-    @Override
-    public java.util.Iterator iterator() {
-      return VectorTestData.catalogs().catalog().datasets().stream().map(DatasetEntry::name)
-          .collect(Collectors.toList()).iterator();
-    }
-  }
-
-  @CommandLine.Option(names = {"--catalog"},
-      description = "A directory, remote url, or other catalog container")
-  List catalogs = new ArrayList<>();
-
-    @CommandLine.Option(names = {"--optional-catalog"},
-            description = "A configuration file to use",
-            split = ",",
-            defaultValue = "~/.config/jvector/catalogs.yaml,~/.config/vectordata/catalogs.yaml")
-    List optionalCatalogs;
-
-  @CommandLine.Option(names = {"-d", "--dataset"},
-      description = "Dataset to use",
-      completionCandidates = ExpanderExample.class)
-  private List dsnames;
-
-  @CommandLine.Option(names = {"-p", "--profile"},
-      description = "Profile to use",
-      defaultValue = "default")
-  private String profile = "default";
-
-  @CommandLine.Option(names = {"-c", "--concurrency"},
-      description = "Number of concurrent threads",
-      defaultValue = "1")
-  private int concurrency = 1;
-
-  public static void main(String[] args) {
-    Run_CMD command = new Run_CMD();
-    CommandLine commandLine = new CommandLine(command).setCaseInsensitiveEnumValuesAllowed(true)
-        .setOptionsCaseInsensitive(true);
-    int exitCode = commandLine.execute(args);
-    System.exit(exitCode);
-  }
-
-  @Override
-  public Integer call() throws Exception {
-    System.out.println("Test Rig run with datasets: " + dsnames);
-    for (String dsname : dsnames) {
-      String[] nameparts = dsname.split(":+", 2);
-      String _name = null, _profile = this.profile;
-      switch (nameparts.length) {
-        case 2:
-          _profile = nameparts[1];
-        case 1:
-          _name = nameparts[0];
-          break;
-      }
-
-      System.out.println("Using testdata source " + _name);
-      System.out.println("Using profile " + _profile);
-
-      Catalog catalog = new TestDataSources().addOptionalCatalogs(optionalCatalogs).catalog();
-      DatasetEntry ds = catalog.findExact(_name).orElseThrow();
-      //          VectorTestData.catalogs().catalog().findExact(_name).orElseThrow();
-      BenchHarness harness = new BenchHarness(ds, _profile, concurrency);
-
-      harness.run();
-
-    }
-    return 0;
-  }
-}
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/TestRig_CMD.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/TestRig_CMD.java
deleted file mode 100644
index 29e440cf5..000000000
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/testrig/commands/TestRig_CMD.java
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright DataStax, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package io.github.jbellis.jvector.example.testrig.commands;
-
-import io.nosqlbench.command.datasets.CMD_datasets;
-import picocli.AutoComplete;
-import picocli.CommandLine;
-
-@CommandLine.Command(name = "testrig",
-    header = "JVector Test Rig",
-    mixinStandardHelpOptions = true,
-    description = "JVector Test Rig",
-    subcommands = {CommandLine.HelpCommand.class, AutoComplete.GenerateCompletion.class,
-                   Run_CMD.class, CMD_datasets.class})
-public class TestRig_CMD {
-
-  public static void main(String[] args) {
-    @SuppressWarnings("InstantiationOfUtilityClass") TestRig_CMD command = new TestRig_CMD();
-             CommandLine commandLine =
-             new CommandLine(command).setCaseInsensitiveEnumValuesAllowed(true)
-        .setOptionsCaseInsensitive(true);
-    int exitCode = commandLine.execute(args);
-    System.exit(exitCode);
-  }
-
-}

From 88396e351e2baaa14d4d36dae68d3489bb82f4c8 Mon Sep 17 00:00:00 2001
From: Jonathan Shook 
Date: Fri, 10 Oct 2025 16:15:29 -0500
Subject: [PATCH 27/29] add missing license header

---
 .../jvector/benchframe/BenchFrameCLI.java        | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchFrameCLI.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchFrameCLI.java
index 7c8c9512e..dabba83ac 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchFrameCLI.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchFrameCLI.java
@@ -1,3 +1,19 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 package io.github.jbellis.jvector.benchframe;
 
 import picocli.CommandLine;

From 393dc2c3347c40571cedfabc97ab1e24e725720c Mon Sep 17 00:00:00 2001
From: Jonathan Shook 
Date: Mon, 13 Oct 2025 11:24:42 -0500
Subject: [PATCH 28/29] add missing method

---
 .../jbellis/jvector/benchframe/BenchFrameCLI.java      | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchFrameCLI.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchFrameCLI.java
index dabba83ac..f1c620139 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchFrameCLI.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchFrameCLI.java
@@ -240,4 +240,14 @@ public Integer call() throws Exception {
         }
     }
 
+    /**
+     * Main entry point: runs a new {@code BenchFrameCLI} through picocli and exits the JVM with the returned code.
+     *
+     * @param args command-line arguments, handed unchanged to {@code CommandLine.execute}
+     */
+    public static void main(String[] args) {
+        int exitCode = new CommandLine(new BenchFrameCLI()).execute(args);
+        System.exit(exitCode);
+    }
+
 }

From 8462bd3ab1699eeedbfcf9e220105262c72cd95f Mon Sep 17 00:00:00 2001
From: Jonathan Shook 
Date: Mon, 13 Oct 2025 11:53:22 -0500
Subject: [PATCH 29/29] optionally download dataset

---
 .../jvector/example/HelloVectorWorld.java     |  3 +-
 .../jvector/example/util/DataSetLoader.java   | 39 +++++++++++++++++--
 .../jvector/example/util/DownloadHelper.java  | 12 +++---
 3 files changed, 44 insertions(+), 10 deletions(-)

diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/HelloVectorWorld.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/HelloVectorWorld.java
index a09d1a0e7..a53e682dd 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/HelloVectorWorld.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/HelloVectorWorld.java
@@ -31,7 +31,8 @@ public static void main(String[] args) throws IOException {
 
         String datasetName = "ada002-100k";
 
-        var mfd = DownloadHelper.maybeDownloadFvecs(datasetName);
+        var mfd = DownloadHelper.maybeDownloadFvecs(datasetName)
+                .orElseThrow(() -> new IllegalArgumentException("Unknown dataset: " + datasetName));
         DataSet ds = mfd.load();
 
         MultiConfig config = MultiConfig.getConfig(datasetName);
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetLoader.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetLoader.java
index 2761e903f..75b764bbb 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetLoader.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetLoader.java
@@ -26,11 +26,10 @@
 import java.io.IOException;
 import java.util.Optional;
 import java.util.concurrent.CompletableFuture;
-import java.util.function.Function;
 
 public class DataSetLoader implements DataSetSource {
 
-  private final Function>[] loaders;
+  private final DataSetSource[] loaders;
 
   public DataSetLoader(DataSetSource... loaders) {
     this.loaders = loaders;
@@ -38,15 +37,30 @@ public DataSetLoader(DataSetSource... loaders) {
 
   @Override
   public Optional apply(String name) {
+    for (DataSetSource loader : loaders) { // sources are tried in the order they were passed to the constructor
+      Optional result = loader.apply(name);
+      if (result.isPresent()) {
+        return result; // first source that yields a dataset wins; remaining sources are not consulted
+      }
+    }
     return Optional.empty();
   }
 
+  @Override
+  public String toString() {
+    return "DataSetLoader{loaders=" + loaders.length + "}"; // reports only how many sources are configured, not which
+  }
+
   public final static DataSetSource FVecsDownloader = new DataSetSource() {
     @Override
     public Optional apply(String name) {
-      var mfd = DownloadHelper.maybeDownloadFvecs(name);
+      Optional mfdOpt = DownloadHelper.maybeDownloadFvecs(name);
+      if (mfdOpt.isEmpty()) {
+        return Optional.empty();
+      }
+
       try {
-        var ds = mfd.load();
+        var ds = mfdOpt.get().load();
         return Optional.of(ds);
       } catch (IOException e) {
         System.err.println("error while trying to load dataset: " + e + ", this error handling "
@@ -54,6 +68,11 @@ public Optional apply(String name) {
         return Optional.empty();
       }
     }
+
+    @Override
+    public String toString() {
+      return "FVecsDownloader"; // same text as the name of the constant this source is assigned to
+    }
   };
 
   public final static DataSetSource HDF5Loader = new DataSetSource() {
@@ -66,6 +85,11 @@ public Optional apply(String name) {
       }
       return Optional.empty();
     }
+
+    @Override
+    public String toString() {
+      return "HDF5Loader"; // same text as the HDF5Loader constant declared in this class
+    }
   };
 
   /**
@@ -113,6 +137,13 @@ public Optional apply(String name) {
       System.out.println("Loaded " + tdw.getName() + " from streaming source");
       return Optional.of(tdw);
     }
+
+    @Override
+    public String toString() {
+      String envCatalogs = System.getenv("VECTORDATA_CATALOGS"); // null when the variable is not set
+      return "VectorDataDownloader{defaultCatalog=~/.config/vectordata/catalogs.yaml" + // literal label; the path is not resolved or checked here
+             (envCatalogs != null ? ", additionalCatalogs=" + envCatalogs : "") + "}"; // env value is appended verbatim, only when present
+    }
   };
 
   /**
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DownloadHelper.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DownloadHelper.java
index 8725a6f65..052388d3d 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DownloadHelper.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DownloadHelper.java
@@ -36,6 +36,7 @@
 import java.nio.file.Paths;
 import java.nio.file.StandardCopyOption;
 import java.util.List;
+import java.util.Optional;
 import java.util.Set;
 
 public class DownloadHelper {
@@ -55,11 +56,11 @@ private static S3AsyncClientBuilder s3AsyncClientBuilder() {
                 .credentialsProvider(AnonymousCredentialsProvider.create());
     }
 
-    public static MultiFileDatasource maybeDownloadFvecs(String name) {
+    public static Optional maybeDownloadFvecs(String name) {
         String bucket = infraDatasets.contains(name) ? infraBucketName : bucketName;
         var mfd = MultiFileDatasource.byName.get(name);
         if (mfd == null) {
-            throw new IllegalArgumentException("Unknown dataset: " + name);
+            return Optional.empty();
         }
         // TODO how to detect and recover from incomplete downloads?
 
@@ -68,6 +69,7 @@ public static MultiFileDatasource maybeDownloadFvecs(String name) {
             Files.createDirectories(Paths.get(fvecDir).resolve(mfd.directory()));
         } catch (IOException e) {
             System.err.println("Failed to create directory: " + e.getMessage());
+            return Optional.empty();
         }
 
         try (S3AsyncClient s3Client = s3AsyncClientBuilder().build()) {
@@ -104,11 +106,11 @@ public static MultiFileDatasource maybeDownloadFvecs(String name) {
             }
             tm.close();
         } catch (Exception e) {
-            System.out.println("Error downloading data from S3: " + e.getMessage());
-            System.exit(1);
+            System.err.println("Error downloading data from S3: " + e.getMessage());
+            return Optional.empty();
         }
 
-        return mfd;
+        return Optional.of(mfd);
     }
 
     public static void maybeDownloadHdf5(String datasetName) {