Skip to content
116 changes: 116 additions & 0 deletions cpp/src/arrow/adapters/orc/adapter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <algorithm>
#include <list>
#include <memory>
#include <optional>
#include <sstream>
#include <string>
#include <vector>
Expand All @@ -30,9 +31,11 @@

#include "arrow/adapters/orc/util.h"
#include "arrow/builder.h"
#include "arrow/compute/expression.h"
#include "arrow/io/interfaces.h"
#include "arrow/memory_pool.h"
#include "arrow/record_batch.h"
#include "arrow/scalar.h"
#include "arrow/status.h"
#include "arrow/table.h"
#include "arrow/table_builder.h"
Expand Down Expand Up @@ -100,6 +103,119 @@ constexpr uint64_t kOrcNaturalWriteSize = 128 * 1024;

using internal::checked_cast;

// Min/max summary of a single column, as read from ORC stripe statistics.
struct MinMaxStats {
  int64_t min;    // smallest value reported for the column
  int64_t max;    // largest value reported for the column
  bool has_null;  // whether the statistics report at least one null

  // Stores the three values exactly as given; no validation (such as
  // min <= max) is performed here.
  MinMaxStats(int64_t min_val, int64_t max_val, bool null_flag)
      : min{min_val}, max{max_val}, has_null{null_flag} {}
};

// Extract stripe-level min/max statistics for a single ORC column.
//
// \param stripe_stats   statistics of one stripe; may be null
// \param orc_column_id  ORC column index to look up within stripe_stats
// \param field_type     Arrow type of the field; only INT64 is handled in this
//                       initial implementation
// \return the column's minimum, maximum and null flag, or std::nullopt when
//         the statistics are missing, incomplete, inconsistent, of an
//         unsupported type, or when orc_column_id is out of range.
std::optional<MinMaxStats> ExtractStripeStatistics(
    const std::unique_ptr<liborc::StripeStatistics>& stripe_stats,
    uint32_t orc_column_id,
    const std::shared_ptr<DataType>& field_type) {
  if (!stripe_stats) {
    return std::nullopt;  // No statistics available
  }

  // Only INT64 support in this initial implementation. A null type is treated
  // as unsupported rather than dereferenced.
  if (!field_type || field_type->id() != Type::INT64) {
    return std::nullopt;  // Unsupported type
  }

  // Reject ids past the end of the statistics: the lookup below is not relied
  // upon to validate the index itself.
  if (orc_column_id >= stripe_stats->getNumberOfColumns()) {
    return std::nullopt;  // Column id out of range
  }

  const liborc::ColumnStatistics* col_stats =
      stripe_stats->getColumnStatistics(orc_column_id);
  if (!col_stats) {
    return std::nullopt;  // Column statistics missing
  }

  // The generic statistics object must actually be the integer flavour;
  // dynamic_cast yields nullptr otherwise.
  const auto* int_stats =
      dynamic_cast<const liborc::IntegerColumnStatistics*>(col_stats);
  if (!int_stats) {
    return std::nullopt;  // Wrong statistics type
  }

  // Both bounds are required to describe a range.
  if (!int_stats->hasMinimum() || !int_stats->hasMaximum()) {
    return std::nullopt;  // Statistics incomplete
  }

  const int64_t min_value = int_stats->getMinimum();
  const int64_t max_value = int_stats->getMaximum();

  // Sanity check: an inverted range means the statistics cannot be trusted.
  if (min_value > max_value) {
    return std::nullopt;  // Invalid statistics
  }

  return MinMaxStats(min_value, max_value, col_stats->hasNull());
}

// Turn a column's stripe statistics into an Arrow Expression "guarantee".
//
// The result describes which values MAY occur in the stripe:
//   has_null == false:  (field >= min AND field <= max)
//   has_null == true:   (field >= min AND field <= max) OR is_null(field)
//
// The intended consumer is Arrow's SimplifyWithGuarantee(): given a stripe
// with min=0, max=100 and the predicate "field > 200", simplification is
// expected to produce literal(false), meaning the stripe can be skipped.
//
// \param field_ref   reference to the column the statistics belong to
// \param field_type  Arrow type of the column; not consulted by this function
// \param min_value   lower bound (inclusive)
// \param max_value   upper bound (inclusive)
// \param has_null    whether the stripe may contain nulls for this column
//
// NOTE(review): verify that compute::literal() can be called with a
// `const Scalar&` lvalue (as opposed to a std::shared_ptr<Scalar>) -- TODO
// confirm against the Datum constructors.
compute::Expression BuildMinMaxExpression(
    const FieldRef& field_ref,
    const std::shared_ptr<DataType>& field_type,
    const Scalar& min_value,
    const Scalar& max_value,
    bool has_null) {
  const auto column = compute::field_ref(field_ref);

  // Inclusive range check: min <= column <= max.
  auto lower_bound = compute::greater_equal(column, compute::literal(min_value));
  auto upper_bound = compute::less_equal(column, compute::literal(max_value));
  auto within_bounds =
      compute::and_(std::move(lower_bound), std::move(upper_bound));

  if (!has_null) {
    return within_bounds;
  }

  // Nulls fall outside any range comparison, so admit them explicitly;
  // otherwise a predicate that matches nulls could wrongly prune the stripe.
  return compute::or_(std::move(within_bounds), compute::is_null(column));
}

// Overload accepting a MinMaxStats: wraps the int64 bounds in Int64Scalar
// objects and forwards to the Scalar-based BuildMinMaxExpression, passing
// stats.has_null through unchanged.
compute::Expression BuildMinMaxExpression(
    const FieldRef& field_ref,
    const std::shared_ptr<DataType>& field_type,
    const MinMaxStats& stats) {
  // The scalars are heap-allocated and owned by shared_ptr for the duration
  // of the forwarded call.
  const std::shared_ptr<Int64Scalar> lower =
      std::make_shared<Int64Scalar>(stats.min);
  const std::shared_ptr<Int64Scalar> upper =
      std::make_shared<Int64Scalar>(stats.max);

  return BuildMinMaxExpression(field_ref, field_type, *lower, *upper,
                               stats.has_null);
}

class ArrowInputFile : public liborc::InputStream {
public:
explicit ArrowInputFile(const std::shared_ptr<io::RandomAccessFile>& file)
Expand Down
Loading