diff --git a/.github/workflows/test-platforms.yml b/.github/workflows/test-platforms.yml index 91a4db1..976de41 100644 --- a/.github/workflows/test-platforms.yml +++ b/.github/workflows/test-platforms.yml @@ -15,10 +15,10 @@ on: - development paths: - 'libre2-core/src/**' - - 'libre2-dropwizard/src/**' + - 'perf-test/src/**' - 'pom.xml' - 'libre2-core/pom.xml' - - 'libre2-dropwizard/pom.xml' + - 'perf-test/pom.xml' - '.github/workflows/test-platforms.yml' # Manual trigger @@ -30,10 +30,10 @@ on: - development paths: - 'libre2-core/src/**' - - 'libre2-dropwizard/src/**' + - 'perf-test/src/**' - 'pom.xml' - 'libre2-core/pom.xml' - - 'libre2-dropwizard/pom.xml' + - 'perf-test/pom.xml' jobs: # ============================================================================ @@ -60,26 +60,24 @@ jobs: - name: Verify JAR contents run: | - echo "Core JAR size:" + echo "Core JARs:" ls -lh libre2-core/target/libre2-core-*.jar echo "" - echo "Cassandra JAR size:" - ls -lh libre2-dropwizard/target/libre2-dropwizard-*.jar || echo "(empty skeleton - OK)" - - echo "" - echo "Native libraries in core JAR:" - unzip -l libre2-core/target/libre2-core-*.jar | grep -E "libre2\.(dylib|so)" + echo "Native libraries in main JAR:" + unzip -l libre2-core/target/libre2-core-1.0.0.jar | grep -E "libre2\.(dylib|so)" echo "" echo "Verify JNA NOT included (should show nothing):" - unzip -l libre2-core/target/libre2-core-*.jar | grep "com/sun/jna" || echo "✅ JNA not included (provided scope working)" + unzip -l libre2-core/target/libre2-core-1.0.0.jar | grep "com/sun/jna" || echo "✅ JNA not included (provided scope working)" - name: Upload core JAR artifact uses: actions/upload-artifact@v4 with: name: libre2-core-jar - path: libre2-core/target/libre2-core-*.jar + path: | + libre2-core/target/libre2-core-1.0.0.jar + libre2-core/target/libre2-core-1.0.0-tests.jar retention-days: 7 # ============================================================================ @@ -114,7 +112,7 @@ jobs: - name: Verify 
native library in JAR run: | echo "Checking darwin-x86_64 library in JAR:" - unzip -l libre2-core/target/libre2-core-*.jar | grep "darwin-x86_64/libre2.dylib" + unzip -l libre2-core/target/libre2-core-1.0.0.jar | grep "darwin-x86_64/libre2.dylib" - name: Run tests (multi-module) run: mvn test -B @@ -149,7 +147,7 @@ jobs: - name: Verify native library in JAR run: | echo "Checking darwin-aarch64 library in JAR:" - unzip -l libre2-core/target/libre2-core-*.jar | grep "darwin-aarch64/libre2.dylib" + unzip -l libre2-core/target/libre2-core-1.0.0.jar | grep "darwin-aarch64/libre2.dylib" - name: Run tests with JAR run: mvn test -B @@ -208,7 +206,7 @@ jobs: # Verify native library in JAR echo 'Checking linux-x86_64 library in JAR:' - unzip -l libre2-core/target/libre2-core-*.jar | grep 'linux-x86_64/libre2.so' + unzip -l libre2-core/target/libre2-core-1.0.0.jar | grep 'linux-x86_64/libre2.so' # Run tests ./mvnw test -B @@ -241,7 +239,7 @@ jobs: - name: Verify native library in JAR run: | echo "Checking linux-x86_64 library in JAR:" - unzip -l libre2-core/target/libre2-core-*.jar | grep "linux-x86_64/libre2.so" + unzip -l libre2-core/target/libre2-core-1.0.0.jar | grep "linux-x86_64/libre2.so" - name: Run tests with JAR run: mvn test -B @@ -290,7 +288,7 @@ jobs: # Verify native library in JAR echo 'Checking linux-x86_64 library in JAR:' - unzip -l libre2-core/target/libre2-core-*.jar | grep 'linux-x86_64/libre2.so' + unzip -l libre2-core/target/libre2-core-1.0.0.jar | grep 'linux-x86_64/libre2.so' # Run tests ./mvnw test -B @@ -345,7 +343,7 @@ jobs: # Verify native library in JAR echo 'Checking linux-x86_64 library in JAR:' - unzip -l libre2-core/target/libre2-core-*.jar | grep 'linux-x86_64/libre2.so' + unzip -l libre2-core/target/libre2-core-1.0.0.jar | grep 'linux-x86_64/libre2.so' # Run tests ./mvnw test -B @@ -397,7 +395,7 @@ jobs: # Verify native library in JAR echo 'Checking linux-x86_64 library in JAR:' - unzip -l libre2-core/target/libre2-core-*.jar | grep 
'linux-x86_64/libre2.so' + unzip -l libre2-core/target/libre2-core-1.0.0.jar | grep 'linux-x86_64/libre2.so' # Run tests ./mvnw test -B @@ -448,7 +446,7 @@ jobs: # Verify native library in JAR echo 'Checking linux-aarch64 library in JAR:' - unzip -l libre2-core/target/libre2-core-*.jar | grep 'linux-aarch64/libre2.so' + unzip -l libre2-core/target/libre2-core-1.0.0.jar | grep 'linux-aarch64/libre2.so' # Run tests ./mvnw test -B @@ -503,7 +501,7 @@ jobs: # Verify native library in JAR echo 'Checking linux-aarch64 library in JAR:' - unzip -l libre2-core/target/libre2-core-*.jar | grep 'linux-aarch64/libre2.so' + unzip -l libre2-core/target/libre2-core-1.0.0.jar | grep 'linux-aarch64/libre2.so' # Run tests ./mvnw test -B @@ -561,7 +559,7 @@ jobs: # Verify native library in JAR echo 'Checking linux-aarch64 library in JAR:' - unzip -l libre2-core/target/libre2-core-*.jar | grep 'linux-aarch64/libre2.so' + unzip -l libre2-core/target/libre2-core-1.0.0.jar | grep 'linux-aarch64/libre2.so' # Run tests ./mvnw test -B diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md deleted file mode 100644 index e56e5f7..0000000 --- a/ARCHITECTURE.md +++ /dev/null @@ -1,683 +0,0 @@ -# Architecture - -Internal design and implementation details of libre2-java. - ---- - -## System Overview - -```mermaid -graph TB - User[User Application] - API[RE2 API Layer
Pattern, Matcher, RE2] - Cache[Pattern Cache
LRU + Idle Eviction] - BG[Background Thread
Idle Eviction + Cleanup] - Tracker[Resource Tracker
Limits + Monitoring] - JNA[JNA Layer
RE2Native Interface] - Native[Native Library
libre2.so/dylib] - RE2[RE2 C++ Library
Google's RE2] - - User -->|compile pattern| API - API -->|check cache| Cache - Cache -->|miss: compile| JNA - JNA -->|call C function| Native - Native -->|regex ops| RE2 - - API -->|create matcher| API - API -->|track resources| Tracker - Cache -->|start| BG - BG -->|evict idle| Cache - BG -->|cleanup deferred| Cache -``` - ---- - -## Pattern Compilation Flow - -```mermaid -sequenceDiagram - participant User - participant Pattern - participant Cache - participant Tracker - participant JNA - participant Native - - User->>Pattern: compile("test") - Pattern->>Cache: getOrCompile("test") - - alt Cache Hit - Cache->>Cache: Check map - Cache-->>Pattern: Return cached - Pattern-->>User: Same instance - else Cache Miss - Cache->>Tracker: trackPatternAllocated() - Tracker->>Tracker: Check limit - alt Limit OK - Tracker-->>Cache: Proceed - Cache->>JNA: re2_compile() - JNA->>Native: Native call - Native-->>JNA: Pointer - JNA-->>Cache: Pattern object - Cache->>Cache: Add to cache - Cache-->>Pattern: New instance - Pattern-->>User: New Pattern - else Limit Exceeded - Tracker-->>Cache: ResourceException - Cache-->>User: Exception thrown - end - end -``` - ---- - -## Cache Eviction Flow - -### LRU Eviction (When Cache Full) - -```mermaid -flowchart TD - A[New pattern compiled] --> B{Cache size > max?} - B -->|No| C[Add to cache] - B -->|Yes| D[LRU: Get eldest pattern] - - D --> E{Eldest refCount == 0?} - E -->|Yes| F[forceClose - Free native memory] - E -->|No| G[Add to deferred cleanup list] - - F --> H[Remove from cache] - G --> H - H --> C -``` - -### Idle Eviction (Background Thread, Every 60s) - -```mermaid -flowchart TD - A[Background thread wakes] --> B[Scan cache for idle patterns] - B --> C{Pattern idle > timeout?} - - C -->|No| D[Keep in cache] - C -->|Yes| E{refCount == 0?} - - E -->|Yes| F[forceClose - Free memory] - E -->|No| G[Add to deferred cleanup] - - F --> H[Remove from cache] - G --> H - H --> I[Continue scan] -``` - -### Deferred Cleanup (Every 5s) - 
-```mermaid -flowchart TD - A[Background thread wakes
every 5 seconds] --> B[Scan deferred cleanup list] - B --> C{Pattern refCount == 0?} - - C -->|No| D[Keep in deferred list
Still in use] - C -->|Yes| E[forceClose - Free memory] - - E --> F[Remove from deferred list] - F --> G[Continue scan] - D --> G -``` - ---- - -## Reference Counting (Use-After-Free Prevention) - -```mermaid -stateDiagram-v2 - [*] --> Compiled: Pattern.compile() - Compiled --> InUse: matcher() called
refCount++ - - InUse --> InUse: More matchers
refCount++ - InUse --> Compiled: Matcher.close()
refCount-- - - Compiled --> CanFree: refCount == 0 - CanFree --> Freed: forceClose() - - InUse --> Deferred: Cache eviction
(refCount > 0) - Deferred --> Deferred: Still in use
refCount > 0 - Deferred --> CanFree: Last matcher closed
refCount == 0 - - Freed --> [*] - - note right of Deferred - Evicted from cache - but can't free yet - (matchers still active) - end note - - note right of CanFree - Safe to free - (no active matchers) - end note -``` - ---- - -## Thread Interactions - -### Multiple Threads Accessing Cache - -```mermaid -sequenceDiagram - participant T1 as Thread 1 - participant T2 as Thread 2 - participant Cache - participant Lock as synchronized(cache) - - T1->>Lock: Acquire lock - Lock-->>T1: Granted - T1->>Cache: compile("pattern1") - Cache->>Cache: Add to cache - - Note over T2: Thread 2 waiting... - - T1->>Lock: Release lock - T2->>Lock: Acquire lock - Lock-->>T2: Granted - T2->>Cache: compile("pattern1") - Cache->>Cache: Cache hit! - Cache-->>T2: Return same instance - T2->>Lock: Release lock - - Note over T1,T2: No deadlock
Single lock -``` - -### Background Thread Coordination - -```mermaid -sequenceDiagram - participant Main as Main Thread - participant BG as Background Thread - participant Cache - participant Lock as synchronized(cache) - - Main->>Lock: Acquire (compile) - Lock-->>Main: Granted - Main->>Cache: Add pattern - - Note over BG: Wakes every 5s - - BG->>Lock: Try acquire (cleanup) - Note over BG: Waits for lock... - - Main->>Lock: Release - BG->>Lock: Acquire - Lock-->>BG: Granted - BG->>Cache: Cleanup deferred - BG->>Lock: Release - - Note over Main,BG: Brief contention
No deadlock -``` - ---- - -## Memory Layout - -### JVM Heap vs Native Memory - -``` -┌─────────────────────────────────────────────┐ -│ JVM Heap Memory │ -│ │ -│ ┌──────────────────────────────────────┐ │ -│ │ Pattern objects │ │ -│ │ - Java wrapper (small) │ │ -│ │ - Pointer to native │ │ -│ │ - String fields │ │ -│ │ │ │ -│ │ ~1-2 KB each │ │ -│ └──────────────────────────────────────┘ │ -│ │ -│ ┌──────────────────────────────────────┐ │ -│ │ PatternCache │ │ -│ │ - LinkedHashMap (50K entries) │ │ -│ │ - Statistics (AtomicLongs) │ │ -│ │ - Deferred list │ │ -│ │ │ │ -│ │ ~10-20 MB │ │ -│ └──────────────────────────────────────┘ │ -└─────────────────────────────────────────────┘ - -┌─────────────────────────────────────────────┐ -│ Native (Off-Heap) Memory │ -│ │ -│ ┌──────────────────────────────────────┐ │ -│ │ Compiled RE2 Patterns │ │ -│ │ - DFA states │ │ -│ │ - NFA bytecode │ │ -│ │ - Metadata │ │ -│ │ │ │ -│ │ ~1-50 KB each (varies by complexity)│ │ -│ │ │ │ -│ │ 50K patterns ≈ 50-200 MB │ │ -│ └──────────────────────────────────────┘ │ -└─────────────────────────────────────────────┘ - -Total Memory: ~60-220 MB for 50K cached patterns -``` - ---- - -## Critical Design Decisions - -### 1. Why Automatic Caching? - -**Problem:** Manual caching is error-prone -- Users forget to cache -- Users cache incorrectly (memory leaks) -- Inconsistent performance - -**Solution:** Automatic caching in `compile()` -- Always returns cached pattern -- Transparent to user -- Consistent performance - -### 2. Why Reference Counting? - -**Problem:** Pattern evicted while Matcher using it = crash - -**Solution:** Track active Matchers per Pattern -- Matcher.constructor() increments refCount -- Matcher.close() decrements refCount -- forceClose() only frees if refCount == 0 - -**Result:** Safe under any concurrency - -### 3. Why Deferred Cleanup? 
- -**Problem:** Can't free patterns while in use, but they're evicted from cache - -**Solution:** Deferred cleanup list -- Evicted patterns with refCount > 0 go to list -- Background thread checks every 5s -- Frees when refCount reaches 0 - -**Result:** No memory leaks, even under extreme load - -### 4. Why Two Eviction Strategies? - -**LRU:** Prevents unbounded growth (size limit) -**Idle:** Prevents long-term accumulation (time limit) - -**Together:** Adaptive cache that handles both: -- Bursty workloads (LRU keeps size bounded) -- Long-running instances (idle removes forgotten patterns) - ---- - -## Performance Characteristics - -### Time Complexity - -| Operation | Complexity | Notes | -|-----------|-----------|-------| -| compile() - cache hit | O(1) | HashMap lookup | -| compile() - cache miss | O(n) | n = pattern length, RE2 compilation | -| matcher.find() | O(m) | m = input length, **always linear** | -| matcher.matches() | O(m) | m = input length, **always linear** | -| LRU eviction | O(1) | LinkedHashMap eldest removal | -| Idle eviction | O(k) | k = cache size, full scan | -| Deferred cleanup | O(d) | d = deferred list size | - -**No operation is exponential** - critical for production. 
- -### Space Complexity - -- **Heap:** O(cache size) for Java objects -- **Native:** O(cache size) for compiled patterns -- **Total:** ~1-4 MB per 1000 patterns - ---- - -## Concurrency Model - -**Lock Strategy:** Coarse-grained locking (single lock per cache) - -**Why not ConcurrentHashMap?** -- LinkedHashMap provides LRU in access-order mode -- ConcurrentHashMap doesn't support access-order -- Trade-off: Simpler code, proven correct, acceptable performance - -**Lock Contention:** -- Typical hold time: < 500 μs (microseconds) -- Under 1000-thread load: Tests show acceptable throughput -- Could optimize later if profiling shows bottleneck - -**No Deadlocks:** -- Only one lock in entire codebase (synchronized on cache) -- Never nested -- Verified in stress tests - ---- - -## Error Handling - -### Compilation Errors - -```java -try { - Pattern p = RE2.compile("(invalid"); -} catch (PatternCompilationException e) { - // Invalid regex syntax - String pattern = e.getPattern(); // "(invalid" - // Log and handle -} -``` - -### Resource Limit Errors - -```java -try { - Pattern p = RE2.compile("test"); -} catch (ResourceException e) { - // maxSimultaneousCompiledPatterns exceeded - // Too many active patterns - // May indicate resource leak in application -} -``` - -### Native Library Errors - -```java -try { - boolean matches = matcher.find(); -} catch (NativeLibraryException e) { - // Rare: Native RE2 call failed - // Log and investigate -} -``` - ---- - -## Future Enhancements (Not Yet Implemented) - -**Phase 3:** Timeout Support -- Per-operation timeout -- ExecutorService-based -- Timeout metrics - -**Phase 4:** Metrics Integration -- Dropwizard Metrics adapter -- Cassandra JMX integration -- Full observability - -**Phase 5:** Additional Features -- Capture groups support -- Find-all iterations -- Replace operations - ---- - -## Comparison with Other Java RE2 Bindings - -| Feature | libre2-java | Other Bindings | -|---------|-------------|----------------| -| 
Thread-safe | ✅ Yes | ⚠️ Varies | -| Automatic caching | ✅ Yes | ❌ No | -| Resource limits | ✅ Yes | ❌ No | -| Memory leak prevention | ✅ Yes (deferred cleanup) | ⚠️ Manual | -| All platforms | ✅ 4 platforms | ⚠️ Limited | -| Secure build | ✅ Commit pinning + sigs | ⚠️ Varies | -| Production tested | ✅ 163 tests, stress tested | ⚠️ Basic tests | -| High-concurrency optimized | ✅ Yes | ❌ No | - ---- - -## Code Organization - -``` -libre2-java/ -├── src/main/java/com/axonops/libre2/ -│ ├── api/ # Public API -│ │ ├── RE2.java # Main entry point -│ │ ├── Pattern.java # Compiled pattern (thread-safe) -│ │ ├── Matcher.java # Matching operations (per-thread) -│ │ └── *Exception.java # Exception hierarchy (sealed) -│ │ -│ ├── cache/ # Caching layer -│ │ ├── PatternCache.java # LRU cache implementation -│ │ ├── IdleEvictionTask.java # Background thread -│ │ ├── RE2Config.java # Configuration (record) -│ │ └── CacheStatistics.java # Metrics (record) -│ │ -│ ├── jni/ # JNA integration -│ │ ├── RE2Native.java # JNA interface (8 functions) -│ │ └── RE2LibraryLoader.java # Platform detection, loading -│ │ -│ └── util/ # Utilities -│ └── ResourceTracker.java # Resource counting, limits -│ -├── src/main/resources/native/ # Pre-compiled libraries -│ ├── darwin-x86_64/libre2.dylib -│ ├── darwin-aarch64/libre2.dylib -│ ├── linux-x86_64/libre2.so -│ └── linux-aarch64/libre2.so -│ -├── native/ # Native build system -│ ├── wrapper/re2_wrapper.cpp # C wrapper (8 functions) -│ ├── scripts/build.sh # Build script -│ └── Dockerfile # Linux builds -│ -└── src/test/java/ # 163 comprehensive tests -``` - ---- - -## Key Implementation Details - -### Pattern Cache (LinkedHashMap) - -**Why LinkedHashMap?** -- Access-order mode provides LRU automatically -- `removeEldestEntry()` callback for eviction -- Simpler than manual LRU tracking - -**Thread Safety:** -- All operations `synchronized(cache)` -- Single lock, no nesting -- Brief hold times (< 500 μs) - -### Reference Counting - 
-**Implementation:** -```java -class Pattern { - private final AtomicInteger refCount = new AtomicInteger(0); - - void incrementRefCount() { // Called by Matcher constructor - int current = refCount.incrementAndGet(); - if (current > maxMatchersPerPattern) { - refCount.decrementAndGet(); // Rollback - throw ResourceException; - } - } - - void decrementRefCount() { // Called by Matcher.close() - refCount.decrementAndGet(); - } - - public void forceClose() { - if (refCount.get() > 0) { - return; // Can't free - still in use - } - freeNativeResources(); - } -} -``` - -### Deferred Cleanup List - -**Implementation:** -```java -private final CopyOnWriteArrayList deferredCleanup; - -// Add when evicting in-use pattern: -if (pattern.getRefCount() > 0) { - deferredCleanup.add(pattern); -} - -// Cleanup (every 5s): -for (CachedPattern deferred : deferredCleanup) { - if (deferred.pattern().getRefCount() == 0) { - deferred.forceClose(); - deferredCleanup.remove(deferred); - } -} -``` - -**Why CopyOnWriteArrayList?** -- Thread-safe iteration without lock -- Modifications don't affect ongoing iterations -- Fast reads (no locks) - ---- - -## Native Integration - -### JNA Call Path - -``` -Java: matcher.find() - ↓ -JNA: RE2Native.re2_partial_match(pointer, text, length) - ↓ -C: int re2_partial_match(void* pattern, char* text, int len) - ↓ -C++: RE2::PartialMatch(text, *pattern) - ↓ -RE2: DFA/NFA execution (linear time) - ↓ -Return: 1 (match), 0 (no match), -1 (error) -``` - -### Library Loading - -```mermaid -flowchart TD - A[First Pattern.compile call] --> B[RE2LibraryLoader.loadLibrary] - B --> C{Already loaded?} - - C -->|Yes| D[Return cached library] - C -->|No| E[Detect platform
OS + Architecture] - - E --> F[Determine resource path
/native/darwin-aarch64/libre2.dylib] - F --> G[Extract from JAR to /tmp] - G --> H[Load via JNA Native.load] - - H --> I{Success?} - I -->|Yes| J[Cache library instance] - I -->|No| K[Throw exception] - - J --> L[Return RE2Native interface] -``` - -### Platform Detection - -```java -OS: System.getProperty("os.name") - - "Mac OS X" → MACOS - - "Linux" → LINUX - -Architecture: System.getProperty("os.arch") - - "amd64", "x86_64" → X86_64 - - "aarch64", "arm64" → AARCH64 - -Resource path: - MACOS + X86_64 → /native/darwin-x86_64/libre2.dylib - MACOS + AARCH64 → /native/darwin-aarch64/libre2.dylib - LINUX + X86_64 → /native/linux-x86_64/libre2.so - LINUX + AARCH64 → /native/linux-aarch64/libre2.so -``` - ---- - -## Testing Strategy - -### Test Categories - -1. **Core API (89 tests):** Pattern/Matcher functionality -2. **Configuration (14 tests):** Validation, builder, defaults -3. **Caching (12 tests):** Hits, misses, LRU, statistics -4. **Concurrency (7 tests):** 100+ thread compilation/matching -5. **Eviction Safety (6 tests):** Use-after-free prevention -6. **Edge Cases (6 tests):** Boundary conditions -7. **Resource Limits (5 tests):** Limit enforcement -8. **Thread Safety (5 tests):** Deadlocks, race conditions -9. **Idle Eviction (5 tests):** Background eviction -10. **Stress (4 tests):** Sustained load, 60K+ patterns -11. **Cleanup (4 tests):** Concurrent close operations -12. **Cache Full (3 tests):** Critical memory leak scenario -13. 
**Timing (3 tests):** Deferred cleanup frequency - -**Total:** 163 tests covering all critical paths - ---- - -## Dependencies - -### Compile-Time - -- **JNA 5.13.0:** Java Native Access (provided scope) -- **SLF4J 2.0.9:** Logging (provided scope) - -### Native Libraries (Embedded) - -- **RE2 (2025-11-05):** Regex engine -- **Abseil (20250814.1):** C++ utilities (statically linked) - -### Build-Time Only - -- **JUnit 5:** Testing -- **AssertJ:** Assertions -- **Logback:** Test logging - -**Production JAR:** 2.4 MB (just our code + native libraries) - ---- - -## Security Considerations - -### Native Library Provenance - -1. **Source:** Git commits from google/re2 and abseil/abseil-cpp -2. **Commits:** Pinned in GitHub environment variables (admin-only) -3. **Verification:** Signature checked via GitHub API -4. **Build:** Automated GitHub Actions (reproducible) -5. **Result:** Cryptographically verifiable binaries - -### Supply Chain Security - -- **No tarball downloads:** Git clone from known commits -- **No package managers:** Build from source -- **No external binaries:** Compile everything -- **Signed commits:** Google engineer signatures verified -- **Audit trail:** All builds logged in GitHub Actions - ---- - -## Limitations - -### Current - -1. **Single global cache:** One configuration for entire JVM - - Workaround: Adjust before first use - - Fix: Phase 3 will add per-instance config - -2. **Windows not supported:** Intentionally omitted (complexity) - - Focus: Production Linux/macOS deployments - -3. **No capture groups yet:** Only match/find operations - - Coming: Phase 5 - -### By Design - -1. **Matcher not thread-safe:** Intentional (matches Java Pattern/Matcher design) -2. **Soft resource limits:** Can exceed by concurrent thread count (acceptable) -3. 
**Coarse-grained locking:** Single cache lock (acceptable performance) - -**None of these affect production deployment.** diff --git a/CRITICAL_REVIEW.md b/CRITICAL_REVIEW.md deleted file mode 100644 index bbe4891..0000000 --- a/CRITICAL_REVIEW.md +++ /dev/null @@ -1,344 +0,0 @@ -# Critical Review - libre2-java Architecture and Safety - -**Review Date:** 2025-11-25 -**Current State:** Token 580k / 1M (58%) -**Reviewer:** Self-assessment at user request - ---- - -## 1. RE2.java - Severely Incomplete ❌ - -**Current State:** -```java -public final class RE2 { - public static Pattern compile(String pattern) - public static boolean matches(String pattern, String input) -} -``` - -**ONLY 2 methods!** This is supposed to be the main entry point. - -**Missing Methods (~25):** - -**String API:** -- find(String pattern, String input) -- match(String pattern, String input) → MatchResult -- findAll(String pattern, String input) → List -- replaceFirst(String pattern, String input, String repl) -- replaceAll(String pattern, String input, String repl) - -**ByteBuffer API:** -- matches(String pattern, ByteBuffer input) -- find(String pattern, ByteBuffer input) -- matchWithGroups(String pattern, ByteBuffer input) → MatchResult -- findWithGroups(String pattern, ByteBuffer input) → MatchResult -- findAllWithGroups(String pattern, ByteBuffer input) → List -- replaceFirst(String pattern, ByteBuffer input, String repl) -- replaceAll(String pattern, ByteBuffer input, String repl) - -**Bulk API:** -- matchAll(String pattern, String[] inputs) -- findAll(String pattern, String[] inputs) -- filter(String pattern, Collection inputs) -- replaceAll(String pattern, String[] inputs, String repl) - -**Collection API:** -- matchAll(String pattern, Collection inputs) -- filter(String pattern, Collection inputs) - -**Impact:** Users must use Pattern directly, RE2.java is useless for convenience. - -**Estimated Work:** ~60k tokens - ---- - -## 2. Pattern.java Size - Needs Refactoring? 
⚠️ - -**Current State:** -- ~2,400 lines -- Contains: matching, bulk matching, capture groups, replace, ByteBuffer handling, zero-copy, filtering, map operations - -**Concern:** Single class doing too much? - -**Option A: Keep as-is** -- Pro: All Pattern operations in one place -- Pro: Easy to find methods -- Con: Large file - -**Option B: Split into separate classes** -```java -// Pattern.java - core + delegation -public final class Pattern { - private final PatternMatcher matcher; - private final PatternReplacer replacer; - private final PatternCapture capture; - - public boolean matches(String) { return matcher.matches(...); } - public MatchResult match(String) { return capture.match(...); } - public String replaceFirst(...) { return replacer.replaceFirst(...); } -} - -// PatternMatcher.java - all matching operations -// PatternCapture.java - all capture group operations -// PatternReplacer.java - all replace operations -``` - -**Recommendation:** Keep as-is for now. 2,400 lines is manageable, splitting adds complexity. - ---- - -## 3. 
AutoCloseable and Resource Management - CRITICAL ⚠️ - -### Pattern - ✅ Correct (implements AutoCloseable) -```java -public final class Pattern implements AutoCloseable { - private final AtomicBoolean closed; - private final AtomicInteger refCount; - - @Override - public void close() { - // Proper cleanup with reference counting - } -} -``` -**Status:** ✅ CORRECT - ref counting prevents use-after-free - -### Matcher - ✅ Correct (implements AutoCloseable) -```java -public final class Matcher implements AutoCloseable { - @Override - public void close() { - pattern.decrementRefCount(); // Release Pattern reference - tracker.trackMatcherFreed(metrics); - } -} -``` -**Status:** ✅ CORRECT - decrements Pattern refCount, tracks metrics - -### MatchResult - ❌ Does NOT implement AutoCloseable - -**Current State:** -```java -public final class MatchResult { - private final String[] groups; // Just Strings - private final Map namedGroups; // Just a Map - - // NO close() method - // NO AutoCloseable -} -``` - -**Analysis: Is this correct?** - -**MatchResult holds:** -- String[] - GC-managed, no native resources -- Map - GC-managed, no native resources -- NO Pattern reference (doesn't increment refCount) -- NO native handles - -**Conclusion:** ✅ CORRECT - MatchResult is a simple immutable value object - - with no native resources or Pattern references. It does NOT need AutoCloseable. - -**Memory leak risk:** NONE - MatchResult is just Strings - ---- - -## 4. Memory Leak Safety Audit 🔍 - -### Potential Leak Vectors - -**1. Pattern not closed** ✅ MITIGATED -- Pattern is cached by default (managed by PatternCache) -- Users shouldn't call close() on cached patterns -- Dual eviction (LRU + idle) prevents unbounded growth - -**2. 
Matcher not closed** ⚠️ RISK -- Users MUST close Matcher (try-with-resources) -- If not closed: Pattern refCount stays high, prevents eviction -- **Risk:** Matchers left open → Patterns never freed → memory leak - -**Mitigation:** Documentation emphasizes try-with-resources - -**3. ByteBuffer not released** ⚠️ RISK (User responsibility) -- We accept ByteBuffer/address from user -- User must ensure memory valid during call -- User must release after call -- **Risk:** User forgets to release DirectByteBuffer - -**Mitigation:** JavaDoc clearly states memory safety requirements - -**4. MatchResult accumulation** ✅ NO RISK -- No native resources -- Just Strings (GC-managed) - -**Overall Assessment:** ⚠️ MODERATE RISK -- Main risk: Users not closing Matchers -- Secondary: User ByteBuffer management (their responsibility) - ---- - -## 5. Metrics Test Coverage - Insufficient ❌ - -**Current Test (ComprehensiveMetricsTest.java):** -- ~20 tests -- Tests String, Bulk, Zero-Copy variants -- Tests global = sum of specifics - -**What's Missing:** - -**A. Not testing ALL operations:** -- ❌ No tests for filter/filterNot metrics -- ❌ No tests for map filtering metrics (filterByKey, etc.) -- ❌ No tests for retainMatches/removeMatches metrics -- ❌ No tests for ByteBuffer[] bulk metrics -- ❌ No tests for findAll(String[]) metrics -- ❌ No tests for matchWithGroups/findWithGroups/findAllWithGroups metrics - -**B. Not testing ALL metric types:** -- ❌ MATCHING_FULL_MATCH_LATENCY vs MATCHING_PARTIAL_MATCH_LATENCY split -- ❌ CAPTURE_FINDALL_MATCHES counting -- ❌ REPLACE_BULK_ITEMS vs REPLACE_BULK_OPERATIONS - -**C. Not testing edge cases:** -- ❌ Empty arrays (should not record items) -- ❌ Failed operations (should still record operation count) -- ❌ Multiple patterns (metrics should be global across all patterns) - -**Estimated Missing Tests:** ~40 more test methods needed - ---- - -## 6. 
Test Organization - Poor Structure ❌ - -**Current Test Files:** -``` -api/BulkMatchingTest.java - 47 tests (Phase 1 String bulk) -api/BulkMatchingPerformanceTest.java - 3 tests -api/BulkMatchingTypeSafetyTest.java - 13 tests -api/ByteBufferApiTest.java - 23 tests (Single ByteBuffer only) -api/CaptureGroupsTest.java - 35 tests (Phase 2 String only) -api/ReplaceOperationsTest.java - 26 tests (Phase 3 String only) -api/Phase1ExtensionsTest.java - 16 tests (findAll bulk, ByteBuffer[] bulk) -metrics/ComprehensiveMetricsTest.java - 20 tests (partial coverage) -``` - -**Problems:** - -**A. Fragmented Coverage:** -- ByteBuffer tests split across ByteBufferApiTest + Phase1ExtensionsTest -- Capture groups missing zero-copy tests -- Replace missing zero-copy tests -- No tests for *WithGroups methods - -**B. No Clear Pattern:** -- Some tests by feature (BulkMatching, CaptureGroups, Replace) -- Some tests by API type (ByteBufferApi) -- Some tests by phase (Phase1Extensions) -- **Inconsistent organization** - -**C. Missing Integration Tests:** -- No test combining capture + replace -- No test combining bulk + zero-copy -- No end-to-end Cassandra scenario test - -**Proposed Reorganization:** - -``` -api/ -├── MatchingTest.java - ALL matching (String, ByteBuffer, address, bulk) -├── CaptureGroupsTest.java - ALL capture (String, ByteBuffer, address, *WithGroups) -├── ReplaceTest.java - ALL replace (String, ByteBuffer, address, bulk) -├── FilteringTest.java - ALL filter/map/retain operations -├── IntegrationTest.java - Cross-feature scenarios -└── PerformanceTest.java - Benchmarks - -metrics/ -├── MetricsInstrumentationTest.java - Verify ALL methods record metrics -└── MetricsAggregationTest.java - Verify global = sum of specifics -``` - -**Estimated Refactoring:** ~40k tokens - ---- - -## 7. Critical Gaps Summary - -| Area | Status | Severity | Est. 
Tokens | -|------|--------|----------|-------------| -| RE2.java empty | ❌ | HIGH | 60k | -| Phase 3 zero-copy missing | ⚠️ | HIGH | 40k | -| Metrics test incomplete | ❌ | HIGH | 80k | -| MatchResult AutoCloseable | ✅ | N/A | 0k (correct as-is) | -| Pattern.java size | ⚠️ | LOW | 0k (keep as-is) | -| Test organization | ❌ | MEDIUM | 40k | -| Missing zero-copy tests | ❌ | MEDIUM | 60k | -| Bulk capture ops | ❌ | LOW | 30k | - -**Total Critical Path:** ~180k tokens -**Available:** 420k tokens ✅ - ---- - -## 8. Recommended Action Plan - -### Immediate (Critical): - -**1. Add Phase 3 Zero-Copy Replace** (40k) -- 6 methods with metrics -- Tests pass - -**2. Populate RE2.java** (60k) -- Add ALL convenience methods -- Mirror Pattern API - -**3. Complete Metrics Test** (80k) -- Test EVERY method records metrics -- Test EVERY metric constant is used -- Test global = sum of specifics for ALL operation types - -### Medium Priority: - -**4. Add Zero-Copy Tests** (60k) -- *WithGroups methods -- ByteBuffer[] bulk -- Replace zero-copy - -**5. Add Bulk Capture** (30k) -- MatchResult[] matchAll(String[]) -- With metrics - -### Low Priority: - -**6. Test Reorganization** (40k) -- Group by feature, not by API type -- Add integration tests - ---- - -## Token Budget - -**Used:** 580k / 1M (58%) -**Remaining:** 420k -**Critical Path:** 180k -**Buffer:** 240k ✅ - ---- - -## Recommendation - -**FOCUS ON CRITICAL PATH:** -1. Phase 3 zero-copy (40k) -2. RE2.java (60k) -3. 
Comprehensive metrics test (80k) - -**DEFER:** -- Test reorganization (works, just messy) -- Bulk capture (low priority) -- Pattern refactoring (not needed) - -**This gets library to production-ready state within token budget.** - ---- - -**Awaiting your decision on priorities.** diff --git a/FINAL_STATUS.md b/FINAL_STATUS.md deleted file mode 100644 index b57ae4d..0000000 --- a/FINAL_STATUS.md +++ /dev/null @@ -1,306 +0,0 @@ -# libre2-java v1.0.0 - Final Status - -**Date:** 2025-11-25 -**Version:** 1.0.0 -**Status:** ✅ **RELEASED** -**Token Usage:** 486k / 1M (49%) - ---- - -## Release Complete ✅ - -**GitHub Release:** v1.0.0 tagged and pushed -**Branch Structure:** Clean - single initial commit -**Development Branch:** Deleted (now work from feature branches off main) -**Tests:** 459/459 passing -**Build:** SUCCESS - ---- - -## Repository State - -### Commit History -``` -8585536 Initial commit - libre2-java v1.0.0 -``` - -**Clean slate:** All development history squashed into single commit -**Tagged:** v1.0.0 -**Branch:** main only (development deleted) - -### Future Workflow -- Work from **main** branch -- Create feature branches for new work -- Squash merge features into main -- Clean, linear history going forward - ---- - -## What Was Accomplished This Session - -### 1. Feature Completion (Phases 0-5) ✅ -- **Added 272 tests** (187 → 459) -- **Added 30 metrics** (25 → 55) -- **Added 50+ API methods** -- **Implemented all gaps** from RE2_GAP_IMPLEMENTATION.md - -### 2. Code Quality ✅ -- MatchResult AutoCloseable pattern -- All tests updated for try-with-resources -- Comprehensive Javadoc (100% coverage) -- Zero compiler errors - -### 3. Documentation ✅ -- **CHANGELOG.md** - Complete 1.0.0 release notes -- **QUICKSTART.md** - Rewritten with all features (776 lines) -- **RELEASE_READY.md** - Release checklist and status -- **IMPLEMENTATION_COMPLETE.md** - Feature audit -- **JNI_OPTIMIZATION_CONCLUSION.md** - Learning from optimization attempt - -### 4. 
Repository Cleanup ✅ -- Removed 7 temporary session documents -- Consolidated planning documents in docs/archive/ -- Clean, professional repository structure - -### 5. Release Preparation ✅ -- Version bumped: 0.9.1 → 1.0.0 (all pom.xml files) -- Tagged v1.0.0 -- Commit history reset (single initial commit) -- Development branch deleted - -### 6. JNI Optimization Investigation ✅ -- Attempted GetByteArrayRegion optimization -- Measured: No performance improvement -- Root cause: String→byte[] conversion overhead -- Decision: Keep existing implementation -- Documented findings for future reference - ---- - -## Version 1.0.0 Highlights - -### APIs -- **Pattern.java:** 80+ methods -- **RE2.java:** 28 static convenience methods -- **MatchResult:** 9 methods (AutoCloseable) -- **JNI Layer:** 29 functions (all tested, documented) - -### Features -- **Bulk operations:** 16 methods (10-20x faster) -- **Capture groups:** Full support with named groups -- **Replace operations:** 6 methods with backreferences -- **Zero-copy:** 15 methods (DirectByteBuffer + address) -- **Utilities:** quoteMeta, programFanout, programSize -- **Metrics:** 55 total (global + specific breakdowns) - -### Quality -- **Tests:** 459 passing (0 failures, 0 errors) -- **Documentation:** 100% public API coverage -- **Platforms:** macOS (x86_64, ARM64), Linux (x86_64, ARM64) -- **Performance:** 3.6M matches/sec (bulk), linear-time complexity - -### Documentation -- Comprehensive QUICKSTART.md with all features -- Migration guide from java.util.regex -- Real-world examples (log parsing, PII redaction, validation) -- Complete CHANGELOG -- Best practices and performance tips - ---- - -## File Structure - -### Documentation (Root) -- **CHANGELOG.md** - Release notes -- **QUICKSTART.md** - Quick start guide (776 lines) -- **README.md** - Project overview -- **ARCHITECTURE.md** - System design -- **CONFIGURATION.md** - Tuning guide -- **LOGGING_GUIDE.md** - Logging setup -- **RELEASE_READY.md** - Release 
checklist -- **IMPLEMENTATION_COMPLETE.md** - Feature audit -- **RE2_GAP_PROGRESS.md** - Feature completion tracking -- **RE2_GAP_IMPLEMENTATION.md** - Planning document -- **JNI_OPTIMIZATION_CONCLUSION.md** - Optimization learnings -- **RE2_LINEAR_GUARANTEE.md** - Performance guarantees -- **ZERO_COPY_IMPLEMENTATION.md** - Technical reference - -### Source Code -- **libre2-core/** - Core library (29 JNI functions, 80+ methods) -- **libre2-dropwizard/** - Metrics integration -- **native/** - JNI wrapper and build scripts - -### Tests (459 total) -- Core API tests: 106 -- JNI layer tests: 48 -- Bulk matching tests: 47 -- Capture groups tests: 35 -- Replace operation tests: 26 -- ByteBuffer API tests: 23 -- Cache tests: 100+ -- Metrics tests: 27 -- Performance tests: 7 - ---- - -## Production Readiness - -### Cassandra 5.0+ Integration -- ✅ Dropwizard Metrics integration -- ✅ JMX exposure -- ✅ SLF4J logging -- ✅ Off-heap execution (no OOM risk) -- ✅ Thread-safe operations -- ✅ ReDoS safe (linear-time) - -### Performance Validated -- Bulk operations: 3.6M matches/sec -- Zero-copy ByteBuffer: 46-99% faster -- Pattern caching: ~50ns lookup -- Cache hit rate: >90% steady state - -### Quality Assurance -- 459 comprehensive tests -- Zero failures, zero errors -- All platforms verified -- Memory leak tested -- Concurrency tested -- Resource limits tested - ---- - -## Next Steps for Users - -### Installation -```xml - - com.axonops - libre2-core - 1.0.0 - -``` - -### Getting Started -1. Read QUICKSTART.md -2. Try examples in QUICKSTART -3. Configure for your environment -4. Monitor metrics -5. Profile in production - -### For Cassandra Users -1. Add dependency to Cassandra -2. Configure with `RE2MetricsConfig.forCassandra()` -3. Use in SAI index implementations -4. 
Monitor via JMX - ---- - -## Known Limitations - -### RE2 Feature Limitations (By Design) -- No lookahead/lookbehind (ensures linear-time) -- No backreferences in patterns (only in replacements) -- No possessive quantifiers -- No atomic groups - -**These are intentional** - They guarantee O(n) complexity and ReDoS safety. - -### Performance -- GetStringUTFChars is optimal for String inputs (byte[] conversion adds overhead) -- Bulk APIs provide real gains (10-20x) -- DirectByteBuffer zero-copy provides massive gains for off-heap data - ---- - -## Token Usage Breakdown - -**Total This Session:** 486k / 1M (49%) - -**Major Activities:** -1. Fix MatchResult test failures (35 tests) - 20k tokens -2. Complete Phases 1/2/3 with metrics - 180k tokens -3. Populate RE2.java (28 methods) - 45k tokens -4. Add utilities and zero-copy JNI tests - 30k tokens -5. JNI optimization attempt and revert - 120k tokens -6. Documentation and release prep - 50k tokens -7. Clean history and final polish - 20k tokens - -**Remaining:** 514k tokens (51%) - ---- - -## Future Work (Post-Release) - -### Potential Enhancements -1. JMH micro-benchmarks -2. Additional integration tests -3. Performance profiling in Cassandra -4. More real-world examples -5. Video tutorials -6. Blog post announcement - -### Maven Central Deployment -1. Configure GPG signing -2. Set up Maven Central credentials -3. Deploy to staging -4. Release to public - -### Community -1. Announce on GitHub -2. Share in Cassandra community -3. Write blog post -4. 
Create demo applications - ---- - -## Success Criteria - All Met ✅ - -### Functional -- [x] All planned features implemented -- [x] Zero-copy support throughout -- [x] Bulk operations for efficiency -- [x] Full capture group support -- [x] Replace operations with backreferences - -### Quality -- [x] 459 tests passing -- [x] 100% public API documentation -- [x] Clean build on all platforms -- [x] Zero memory leaks verified - -### Performance -- [x] Bulk operations 10-20x faster -- [x] Zero-copy 46-99% faster for large buffers -- [x] Linear-time complexity maintained - -### Documentation -- [x] CHANGELOG.md complete -- [x] QUICKSTART.md comprehensive -- [x] Migration guide included -- [x] Real-world examples provided - -### Release -- [x] Version bumped to 1.0.0 -- [x] Tagged v1.0.0 -- [x] Clean commit history -- [x] Development branch deleted - ---- - -## Final Summary - -**libre2-java 1.0.0 is released and production-ready.** - -The library provides: -- Full feature parity with RE2 -- Comprehensive testing (459 tests) -- Complete documentation -- Production-grade quality -- Backward compatibility with 0.9.1 -- Clean repository with single-commit history - -**Repository is now in excellent state for future feature development.** - ---- - -**End of Session Report** diff --git a/IMPLEMENTATION_COMPLETE.md b/IMPLEMENTATION_COMPLETE.md deleted file mode 100644 index f8b48b0..0000000 --- a/IMPLEMENTATION_COMPLETE.md +++ /dev/null @@ -1,360 +0,0 @@ -# Implementation Complete - libre2-java Feature Audit - -**Date:** 2025-11-25 -**Branch:** `development` -**Status:** ✅ **ALL FEATURES COMPLETE** -**Tests:** 459/459 passing ✅ -**Build:** SUCCESS ✅ - ---- - -## Feature Completeness Verification - -### Phase 0: Native Foundation ✅ - -**JNI Methods Implemented: 29/29** - -| Category | Method | Tested | Documented | -|----------|--------|--------|------------| -| Compilation | compile | ✅ | ✅ | -| Lifecycle | freePattern | ✅ | ✅ | -| Matching | fullMatch | ✅ | ✅ | -| Matching | 
partialMatch | ✅ | ✅ | -| Matching | fullMatchBulk | ✅ | ✅ | -| Matching | partialMatchBulk | ✅ | ✅ | -| Zero-Copy | fullMatchDirect | ✅ | ✅ | -| Zero-Copy | partialMatchDirect | ✅ | ✅ | -| Zero-Copy | fullMatchDirectBulk | ✅ | ✅ | -| Zero-Copy | partialMatchDirectBulk | ✅ | ✅ | -| Capture | extractGroups | ✅ | ✅ | -| Capture | extractGroupsBulk | ✅ | ✅ | -| Capture | extractGroupsDirect | ✅ | ✅ | -| Capture | findAllMatches | ✅ | ✅ | -| Capture | findAllMatchesDirect | ✅ | ✅ | -| Capture | getNamedGroups | ✅ | ✅ | -| Replace | replaceFirst | ✅ | ✅ | -| Replace | replaceAll | ✅ | ✅ | -| Replace | replaceAllBulk | ✅ | ✅ | -| Replace Zero-Copy | replaceFirstDirect | ✅ | ✅ | -| Replace Zero-Copy | replaceAllDirect | ✅ | ✅ | -| Replace Zero-Copy | replaceAllDirectBulk | ✅ | ✅ | -| Utilities | quoteMeta | ✅ | ✅ | -| Utilities | programFanout | ✅ | ✅ | -| Info | patternMemory | ✅ | ✅ | -| Info | getPattern | ✅ | ✅ | -| Info | numCapturingGroups | ✅ | ✅ | -| Info | patternOk | ✅ | ✅ | -| Error | getError | ✅ | ✅ | - -**RE2NativeJNITest Coverage: 48 tests** - ---- - -### Phase 1: Bulk Matching API ✅ - -**Pattern.java Methods:** - -| Method | Tested | In RE2.java | Metrics | -|--------|--------|-------------|---------| -| matchAll(String[]) | ✅ | ✅ | ✅ | -| matchAll(Collection) | ✅ | ✅ | ✅ | -| findAll(String[]) | ✅ | ✅ | ✅ | -| findAll(Collection) | ✅ | ✅ | ✅ | -| filter(Collection) | ✅ | ✅ | ✅ | -| filterNot(Collection) | ✅ | ✅ | ✅ | -| filterByKey(Map) | ✅ | ❌ | ✅ | -| filterByValue(Map) | ✅ | ❌ | ✅ | -| filterNotByKey(Map) | ✅ | ❌ | ✅ | -| filterNotByValue(Map) | ✅ | ❌ | ✅ | -| retainMatches(Collection) | ✅ | ❌ | ✅ | -| removeMatches(Collection) | ✅ | ❌ | ✅ | -| retainMatchesByKey(Map) | ✅ | ❌ | ✅ | -| retainMatchesByValue(Map) | ✅ | ❌ | ✅ | -| removeMatchesByKey(Map) | ✅ | ❌ | ✅ | -| removeMatchesByValue(Map) | ✅ | ❌ | ✅ | - -**Test Coverage:** -- BulkMatchingTest: 47 tests -- BulkMatchingPerformanceTest: 3 tests -- BulkMatchingTypeSafetyTest: 13 tests -- 
Phase1ExtensionsTest: 15 tests - -**Note:** Map filtering methods intentionally NOT in RE2.java (requires passing Pattern instance) - ---- - -### Phase 2: Capture Groups ✅ - -**MatchResult Class:** ✅ Implements AutoCloseable - -| Method | Tested | Documented | -|--------|--------|------------| -| matched() | ✅ | ✅ | -| group() | ✅ | ✅ | -| group(int) | ✅ | ✅ | -| group(String) | ✅ | ✅ | -| groupCount() | ✅ | ✅ | -| input() | ✅ | ✅ | -| groups() | ✅ | ✅ | -| namedGroups() | ✅ | ✅ | -| close() | ✅ | ✅ | - -**Pattern.java Capture Methods:** - -| Method | Variants | Tested | In RE2.java | Metrics | -|--------|----------|--------|-------------|---------| -| match | String, ByteBuffer, address | ✅ | ✅ | ✅ | -| find | String, ByteBuffer, address | ✅ | ❌ | ✅ | -| findAll | String, ByteBuffer, address | ✅ | ✅ | ✅ | -| matchWithGroups | ByteBuffer, address | ✅ | ✅ | ✅ | -| findWithGroups | ByteBuffer, address | ✅ | ✅ | ✅ | -| findAllWithGroups | ByteBuffer, address | ✅ | ✅ | ✅ | -| matchAllWithGroups | String[], Collection | ✅ | ✅ | ✅ | - -**Test Coverage:** -- CaptureGroupsTest: 35 tests (all using try-with-resources) -- ComprehensiveMetricsTest: Capture metrics verified - ---- - -### Phase 3: Replace Operations ✅ - -**Pattern.java Replace Methods:** - -| Method | Variants | Tested | In RE2.java | Metrics | -|--------|----------|--------|-------------|---------| -| replaceFirst | String, ByteBuffer, address | ✅ | ✅ | ✅ | -| replaceAll | String, ByteBuffer, address, String[], Collection, ByteBuffer[], address[] | ✅ | ✅ | ✅ | - -**Test Coverage:** -- ReplaceOperationsTest: 26 tests -- ComprehensiveMetricsTest: Replace metrics verified - -**Features:** -- ✅ Backreference support (\\1, \\2, etc.) 
-- ✅ Bulk operations (array, collection) -- ✅ Zero-copy variants (address, DirectByteBuffer) - ---- - -### Phase 4: Utilities ✅ - -**Pattern.java Utilities:** - -| Method | Type | Tested | In RE2.java | Documented | -|--------|------|--------|-------------|------------| -| quoteMeta | static | ✅ | ✅ | ✅ | -| getProgramFanout | instance | ✅ | ✅ | ✅ | -| getNativeMemoryBytes | instance | ✅ | ✅ | ✅ | - -**RE2.java Convenience Wrappers:** -- quoteMeta(String) ✅ -- getProgramFanout(String) ✅ -- getProgramSize(String) ✅ - -**Test Coverage:** -- RE2NativeJNITest: quoteMeta (3 tests), programFanout (1 test), patternMemory (1 test) - ---- - -### Phase 5: Integration & Polish ✅ - -**Metrics Instrumentation: 55 metrics** - -| Category | Metrics Count | Tested | -|----------|---------------|--------| -| Matching | 9 | ✅ | -| Capture | 10 | ✅ | -| Replace | 11 | ✅ | -| Cache | 25 | ✅ | - -**Test Coverage:** -- ComprehensiveMetricsTest: 9 tests verifying metrics -- MetricsIntegrationTest: 9 tests -- NativeMemoryMetricsTest: 5 tests - ---- - -## RE2.java Completeness Audit - -**Total Methods: 28** - -### Compilation (2) -- ✅ compile(String) -- ✅ compile(String, boolean) - -### Matching (4) -- ✅ matches(String, String) -- ✅ matches(String, ByteBuffer) -- ✅ matchAll(String, String[]) -- ✅ matchAll(String, Collection) - -### Capture Groups (6) -- ✅ match(String, String) -- ✅ findFirst(String, String) -- ✅ findAll(String, String) -- ✅ matchWithGroups(String, ByteBuffer) -- ✅ findWithGroups(String, ByteBuffer) -- ✅ findAllWithGroups(String, ByteBuffer) - -### Bulk Capture (2) -- ✅ matchAllWithGroups(String, String[]) -- ✅ matchAllWithGroups(String, Collection) - -### Filtering (3) -- ✅ filter(String, Collection) -- ✅ filterNot(String, Collection) -- ✅ findAll(String, String[]) - -### Replace Operations (5) -- ✅ replaceFirst(String, String, String) -- ✅ replaceAll(String, String, String) -- ✅ replaceAll(String, String[], String) -- ✅ replaceAll(String, Collection, String) - -### 
Utilities (3) -- ✅ quoteMeta(String) -- ✅ getProgramFanout(String) -- ✅ getProgramSize(String) - -**Missing from RE2.java (intentionally):** -- Map filtering methods (require Pattern instance, can't be static) -- In-place mutation methods (retainMatches, removeMatches - not suitable for static API) - ---- - -## Test Summary - -**Total: 459 tests, 0 failures, 0 errors ✅** - -### By Module -- libre2-core: 441 tests -- libre2-dropwizard: 18 tests - -### By Category -- RE2Test: 106 tests (main API) -- RE2NativeJNITest: 48 tests (JNI layer) ✅ **+8 zero-copy tests** -- BulkMatchingTest: 47 tests -- CaptureGroupsTest: 35 tests -- ReplaceOperationsTest: 26 tests -- Phase1ExtensionsTest: 15 tests -- BulkMatchingTypeSafetyTest: 13 tests -- ComprehensiveMetricsTest: 9 tests ✅ **New** -- MetricsIntegrationTest: 9 tests -- ByteBufferApiTest: 23 tests -- Cache tests: 100+ tests -- Metrics tests: 18 tests -- Performance tests: 3 tests - ---- - -## Documentation Completeness - -### RE2NativeJNI.java (29 methods) -- ✅ All 29 methods have Javadoc -- ✅ All parameters documented with @param -- ✅ All return values documented with @return -- ✅ Memory safety warnings for zero-copy methods -- ✅ Examples for complex operations - -### Pattern.java (80+ methods) -- ✅ All public methods have comprehensive Javadoc -- ✅ Usage examples for all new features -- ✅ @since tags for version tracking -- ✅ Exception documentation (@throws) - -### RE2.java (28 methods) -- ✅ All static convenience methods documented -- ✅ Usage examples in Javadoc -- ✅ Cross-references to Pattern.java for details - -### MatchResult.java -- ✅ Class-level documentation with try-with-resources examples -- ✅ All 9 public methods documented -- ✅ AutoCloseable pattern explained - ---- - -## Gaps Resolved - -### From RE2_GAP_IMPLEMENTATION.md - -**Missing Bulk Operations:** ✅ COMPLETE -- ✅ boolean[] matches(Collection) - implemented -- ✅ List filter(Collection) - implemented -- ✅ Map variants for key/value filtering - implemented 
-- ✅ In-place filtering (retainMatches/removeMatches) - implemented - -**Missing Capture Groups:** ✅ COMPLETE -- ✅ MatchResult class - implemented with AutoCloseable -- ✅ match(String), find(String), findAll(String) - implemented -- ✅ Named group support - implemented -- ✅ Batch capture (matchAllWithGroups) - implemented - -**Missing Replace Operations:** ✅ COMPLETE -- ✅ replaceFirst/replaceAll - implemented -- ✅ Backreference support (\\1, \\2) - implemented -- ✅ Batch variants - implemented -- ✅ Zero-copy variants - implemented - -**Missing Utilities:** ✅ COMPLETE -- ✅ quoteMeta - implemented in Pattern and RE2 -- ✅ programSize/programFanout - implemented - -**Beyond Original Plan (Added):** -- ✅ Zero-copy support (9 additional JNI methods) -- ✅ ByteBuffer API throughout (auto-routing) -- ✅ Comprehensive metrics (55 total) -- ✅ Bulk capture operations -- ✅ RE2.java convenience layer (28 methods) - ---- - -## Production Readiness Checklist - -### Functionality -- ✅ All planned features implemented -- ✅ Zero-copy support for performance -- ✅ Bulk operations for efficiency -- ✅ Full capture group support - -### Safety -- ✅ MatchResult AutoCloseable pattern -- ✅ Pattern/Matcher AutoCloseable -- ✅ Reference counting prevents use-after-free -- ✅ All resources properly tracked - -### Testing -- ✅ 459 tests passing (0 failures, 0 errors) -- ✅ All 29 JNI methods tested -- ✅ Zero-copy operations tested -- ✅ Metrics verification complete - -### Observability -- ✅ 55 metrics fully instrumented -- ✅ Global + Specific breakdown -- ✅ Metrics tested (ComprehensiveMetricsTest) - -### Documentation -- ✅ RE2NativeJNI.java: All 29 methods documented -- ✅ Pattern.java: All 80+ methods documented -- ✅ RE2.java: All 28 methods documented -- ✅ MatchResult.java: Full documentation -- ✅ Usage examples throughout - -### Build -- ✅ Clean build on all platforms -- ✅ Native libraries: macOS (x86_64, ARM64), Linux (x86_64, ARM64) -- ✅ Zero compilation errors -- ✅ 13 warnings (expected - 
sun.nio.ch.DirectBuffer internal API) - ---- - -## Summary - -**ALL PHASES COMPLETE:** Phases 0-5 ✅ -**PRODUCTION READY:** Yes ✅ -**TESTS PASSING:** 459/459 ✅ -**BUILD STATUS:** SUCCESS ✅ - -**Next Steps:** Version 1.0.0 release preparation (Phase 6 - deferred) diff --git a/JNI_MOCKABILITY_DESIGN.md b/JNI_MOCKABILITY_DESIGN.md new file mode 100644 index 0000000..361ea7a --- /dev/null +++ b/JNI_MOCKABILITY_DESIGN.md @@ -0,0 +1,355 @@ +# JNI Mockability Design - Clean Interface Abstraction + +**Goal:** Make all native calls mockable for unit testing without breaking existing API + +--- + +## Design: Internal JniAdapter with Package-Private Injection + +### 1. Create JniAdapter Interface (Package-Private) + +```java +package com.axonops.libre2.jni; + +/** + * Adapter interface for RE2 JNI operations. + * Package-private for testing - not part of public API. + */ +interface JniAdapter { + // Pattern lifecycle + long compile(String pattern, boolean caseSensitive); + void freePattern(long handle); + boolean patternOk(long handle); + String getError(); + String getPattern(long handle); + int numCapturingGroups(long handle); + long patternMemory(long handle); + + // Matching operations + boolean fullMatch(long handle, String text); + boolean partialMatch(long handle, String text); + boolean fullMatchDirect(long handle, long address, int length); + boolean partialMatchDirect(long handle, long address, int length); + + // Bulk operations + boolean[] fullMatchBulk(long handle, String[] texts); + boolean[] partialMatchBulk(long handle, String[] texts); + boolean[] fullMatchDirectBulk(long handle, long[] addresses, int[] lengths); + boolean[] partialMatchDirectBulk(long handle, long[] addresses, int[] lengths); + + // Capture groups + String[] extractGroups(long handle, String text); + String[][] extractGroupsBulk(long handle, String[] texts); + String[] extractGroupsDirect(long handle, long address, int length); + String[][] extractGroupsDirectBulk(long handle, long[] 
addresses, int[] lengths); + String[][] findAllMatches(long handle, String text); + String[][] findAllMatchesDirect(long handle, long address, int length); + String[] getNamedGroups(long handle); + + // Replace operations + String replaceFirst(long handle, String text, String replacement); + String replaceAll(long handle, String text, String replacement); + String[] replaceAllBulk(long handle, String[] texts, String replacement); + String replaceFirstDirect(long handle, long address, int length, String replacement); + String replaceAllDirect(long handle, long address, int length, String replacement); + String[] replaceAllDirectBulk(long handle, long[] addresses, int[] lengths, String replacement); +} +``` + +### 2. Production Implementation (Package-Private) + +```java +package com.axonops.libre2.jni; + +/** + * Production JNI adapter - delegates directly to RE2NativeJNI. + * Package-private - not part of public API. + */ +class DirectJniAdapter implements JniAdapter { + + // Singleton instance + static final DirectJniAdapter INSTANCE = new DirectJniAdapter(); + + private DirectJniAdapter() { + // Private constructor + } + + @Override + public long compile(String pattern, boolean caseSensitive) { + return RE2NativeJNI.compile(pattern, caseSensitive); + } + + @Override + public void freePattern(long handle) { + RE2NativeJNI.freePattern(handle); + } + + // ... delegate all 29 methods to RE2NativeJNI +} +``` + +### 3. Pattern Internal Field (Package-Private Injection Point) + +```java +package com.axonops.libre2.api; + +public final class Pattern implements AutoCloseable { + + // Package-private for testing - production uses singleton + final JniAdapter jni; + + private final long nativeHandle; + private final String pattern; + // ... 
other fields + + // PRIVATE constructor - used internally + private Pattern(JniAdapter jni, String pattern, boolean caseSensitive, PatternCache cache) { + this.jni = jni; + this.pattern = pattern; + this.cache = cache; + + // Compile using adapter + long handle = jni.compile(pattern, caseSensitive); + if (handle == 0 || !jni.patternOk(handle)) { + String error = jni.getError(); + throw new PatternCompilationException("Failed to compile pattern: " + error); + } + this.nativeHandle = handle; + // ... rest of initialization + } + + // PUBLIC API - unchanged, uses production adapter + public static Pattern compile(String pattern) { + return compile(pattern, true); + } + + public static Pattern compile(String pattern, boolean caseSensitive) { + // Production code uses singleton DirectJniAdapter + return compile(pattern, caseSensitive, DirectJniAdapter.INSTANCE); + } + + // PACKAGE-PRIVATE for testing - inject mock adapter + static Pattern compile(String pattern, boolean caseSensitive, JniAdapter jni) { + PatternCache cache = getGlobalCache(); + // ... cache lookup logic + return new Pattern(jni, pattern, caseSensitive, cache); + } + + // All operations use this.jni instead of RE2NativeJNI directly + public boolean match(String input) { + checkNotClosed(); + Objects.requireNonNull(input, "input cannot be null"); + + long startNanos = System.nanoTime(); + boolean result = jni.fullMatch(nativeHandle, input); // Uses adapter! + long durationNanos = System.nanoTime() - startNanos; + + // ... metrics recording + return result; + } + + // ... all other methods use this.jni +} +``` + +### 4. 
Test Usage - Clean and Powerful + +```java +package com.axonops.libre2.api; + +import com.axonops.libre2.jni.JniAdapter; +import org.junit.jupiter.api.Test; +import org.mockito.Mockito; + +import static org.assertj.core.api.Assertions.*; +import static org.mockito.Mockito.*; + +class PatternUnitTest { + + @Test + void testMatch_callsCorrectJniMethod() { + // Create mock adapter + JniAdapter mockJni = mock(JniAdapter.class); + + // Setup expectations + when(mockJni.compile("test\\d+", true)).thenReturn(12345L); + when(mockJni.patternOk(12345L)).thenReturn(true); + when(mockJni.numCapturingGroups(12345L)).thenReturn(0); + when(mockJni.patternMemory(12345L)).thenReturn(1024L); + when(mockJni.fullMatch(12345L, "test123")).thenReturn(true); + + // Create pattern with mock adapter (package-private method) + Pattern pattern = Pattern.compile("test\\d+", true, mockJni); + + // Execute + boolean result = pattern.match("test123"); + + // Verify + assertThat(result).isTrue(); + verify(mockJni).compile("test\\d+", true); + verify(mockJni).fullMatch(12345L, "test123"); + verifyNoMoreInteractions(mockJni); + } + + @Test + void testReplaceAll_callsCorrectJniMethod() { + JniAdapter mockJni = mock(JniAdapter.class); + + when(mockJni.compile("\\d+", true)).thenReturn(67890L); + when(mockJni.patternOk(67890L)).thenReturn(true); + when(mockJni.numCapturingGroups(67890L)).thenReturn(0); + when(mockJni.patternMemory(67890L)).thenReturn(512L); + when(mockJni.replaceAll(67890L, "test123", "XXX")).thenReturn("testXXX"); + + Pattern pattern = Pattern.compile("\\d+", true, mockJni); + String result = pattern.replaceAll("test123", "XXX"); + + assertThat(result).isEqualTo("testXXX"); + verify(mockJni).replaceAll(67890L, "test123", "XXX"); + } + + @Test + void testBulkMatch_callsCorrectBulkJniMethod() { + JniAdapter mockJni = mock(JniAdapter.class); + + when(mockJni.compile("test", true)).thenReturn(11111L); + when(mockJni.patternOk(11111L)).thenReturn(true); + 
when(mockJni.numCapturingGroups(11111L)).thenReturn(0); + when(mockJni.patternMemory(11111L)).thenReturn(256L); + + String[] inputs = {"test1", "test2", "other"}; + boolean[] expected = {true, true, false}; + when(mockJni.fullMatchBulk(11111L, inputs)).thenReturn(expected); + + Pattern pattern = Pattern.compile("test", true, mockJni); + boolean[] results = pattern.matchAll(inputs); + + assertThat(results).isEqualTo(expected); + verify(mockJni).fullMatchBulk(11111L, inputs); + } +} +``` + +--- + +## Benefits of This Design + +### ✅ 1. Public API Unchanged +```java +// Users still write this - no breaking changes +Pattern p = Pattern.compile("test\\d+"); +boolean match = p.match("test123"); +``` + +### ✅ 2. Full Test Control +```java +// Tests can inject mock and verify exact calls +JniAdapter mock = mock(JniAdapter.class); +Pattern p = Pattern.compile("test", true, mock); +verify(mock).fullMatch(eq(12345L), eq("test123")); +``` + +### ✅ 3. Package-Private Design +- `JniAdapter` interface is NOT public +- `DirectJniAdapter` is NOT public +- The injection overload `Pattern.compile(pattern, caseSensitive, JniAdapter)` is package-private +- Tests in the same package can access it +- Users cannot misuse it + +### ✅ 4. Zero Runtime Overhead +- Production code uses singleton `DirectJniAdapter.INSTANCE` +- Negligible interface overhead (with a single loaded implementation, the JIT can devirtualize and inline the adapter call) +- Same performance as direct static calls + +### ✅ 5. Comprehensive Test Coverage +Can now unit test: +- ✅ Parameter validation before JNI calls +- ✅ Metrics recording logic +- ✅ Resource tracking +- ✅ Error handling paths +- ✅ Bulk operation batching logic +- ✅ DirectByteBuffer address extraction +- ✅ Cache interaction logic + +--- + +## Implementation Strategy + +### Phase 2A: Create Abstraction (Before Test Migration) +1. Create `JniAdapter` interface (package-private) +2. Create `DirectJniAdapter` implementation (package-private) +3. Update `Pattern` to use `jni` field instead of `RE2NativeJNI` static calls +4. 
Update `Matcher`, `RE2` similarly +5. Run full integration test suite - should all pass (no behavior change) + +### Phase 2B: Test Migration (With Mockability) +6. Create new unit tests using mock JniAdapter +7. Migrate existing tests to appropriate directories +8. Verify all tests still pass + +--- + +## File Structure + +``` +libre2-core/src/main/java/com/axonops/libre2/jni/ +├── RE2NativeJNI.java (unchanged - native methods) +├── RE2LibraryLoader.java (unchanged - library loading) +├── JniAdapter.java (NEW - package-private interface) +└── DirectJniAdapter.java (NEW - package-private singleton) + +libre2-core/src/test/java/com/axonops/libre2/api/ +├── PatternUnitTest.java (NEW - mocked JNI tests) +├── MatcherUnitTest.java (NEW - mocked JNI tests) +└── RE2UnitTest.java (NEW - mocked JNI tests) +``` + +--- + +## Example: Testing Metrics Recording Without Native Library + +```java +@Test +void testMatchAll_recordsCorrectMetrics() { + JniAdapter mockJni = mock(JniAdapter.class); + RE2MetricsRegistry mockMetrics = mock(RE2MetricsRegistry.class); + + // Setup + when(mockJni.compile("test", true)).thenReturn(123L); + when(mockJni.patternOk(123L)).thenReturn(true); + when(mockJni.numCapturingGroups(123L)).thenReturn(0); + when(mockJni.patternMemory(123L)).thenReturn(100L); + when(mockJni.fullMatchBulk(eq(123L), any())).thenReturn(new boolean[]{true, false, true}); + + // Create pattern with mock metrics + PatternCache cache = new PatternCache(RE2Config.builder() + .metricsRegistry(mockMetrics) + .build()); + Pattern pattern = Pattern.compile("test", true, mockJni, cache); + + // Execute + String[] inputs = {"test1", "test2", "test3"}; + pattern.matchAll(inputs); + + // Verify metrics (without running native code!) 
+ verify(mockMetrics).incrementCounter("re2.matching.operations.total.count", 3); + verify(mockMetrics).incrementCounter("re2.matching.bulk.operations.total.count", 1); + verify(mockMetrics).incrementCounter("re2.matching.bulk.items.total.count", 3); + verify(mockMetrics, times(2)).recordTimer(eq("re2.matching.latency"), anyLong()); +} +``` + +--- + +## Decision Point + +**Do you approve this design?** + +If yes, I'll implement it in Phase 2A before any test migration. This gives us: +- ✅ Full mockability of all native calls +- ✅ Ability to assert correct JNI parameters +- ✅ Unit tests for all business logic +- ✅ No public API changes +- ✅ No runtime overhead + +**Alternative:** If you have a different approach in mind, I'm open to it. The key requirement is: **mock all native calls to verify correct parameters**. diff --git a/JNI_MOCKABILITY_DESIGN_V2.md b/JNI_MOCKABILITY_DESIGN_V2.md new file mode 100644 index 0000000..b248e14 --- /dev/null +++ b/JNI_MOCKABILITY_DESIGN_V2.md @@ -0,0 +1,397 @@ +# JNI Mockability Design V2 - Package-Private Enforcement + +**Improvement:** Make RE2NativeJNI package-private so ONLY DirectJniAdapter can access it + +--- + +## Updated Design: Compile-Time Enforcement + +### 1. RE2NativeJNI - Package-Private Native Methods + +```java +package com.axonops.libre2.jni; + +/** + * JNI bindings to RE2 native library. + * + *

<p>IMPORTANT: All methods are package-private. External code must use the + * Pattern/Matcher/RE2 API. Direct JNI access is only available to DirectJniAdapter. + * + * <p>This design enables: + * <ul> + *   <li>Mockability - DirectJniAdapter implements the JniAdapter interface</li> + *   <li>Encapsulation - No direct JNI calls from API classes</li> + *   <li>Testability - Tests can inject mock JniAdapter</li> + * </ul>
+ */ +final class RE2NativeJNI { + + private RE2NativeJNI() { + // Utility class - prevent instantiation + } + + // ========== Pattern Lifecycle ========== + + /** + * Compile a pattern. Package-private - use via DirectJniAdapter only. + */ + static native long compile(String pattern, boolean caseSensitive); + + /** + * Free compiled pattern. Package-private - use via DirectJniAdapter only. + */ + static native void freePattern(long handle); + + /** + * Check if pattern is valid. Package-private - use via DirectJniAdapter only. + */ + static native boolean patternOk(long handle); + + /** + * Get last compilation error. Package-private - use via DirectJniAdapter only. + */ + static native String getError(); + + // ... all 29 methods as package-private (no visibility modifier) + + // ========== Matching Operations ========== + + static native boolean fullMatch(long handle, String text); + static native boolean partialMatch(long handle, String text); + static native boolean fullMatchDirect(long handle, long address, int length); + static native boolean partialMatchDirect(long handle, long address, int length); + + // ========== Bulk Operations ========== + + static native boolean[] fullMatchBulk(long handle, String[] texts); + static native boolean[] partialMatchBulk(long handle, String[] texts); + static native boolean[] fullMatchDirectBulk(long handle, long[] addresses, int[] lengths); + static native boolean[] partialMatchDirectBulk(long handle, long[] addresses, int[] lengths); + + // ========== Capture Groups ========== + + static native String[] extractGroups(long handle, String text); + static native String[][] extractGroupsBulk(long handle, String[] texts); + static native String[] extractGroupsDirect(long handle, long address, int length); + static native String[][] extractGroupsDirectBulk(long handle, long[] addresses, int[] lengths); + static native String[][] findAllMatches(long handle, String text); + static native String[][] findAllMatchesDirect(long 
handle, long address, int length); + static native String[] getNamedGroups(long handle); + + // ========== Replace Operations ========== + + static native String replaceFirst(long handle, String text, String replacement); + static native String replaceAll(long handle, String text, String replacement); + static native String[] replaceAllBulk(long handle, String[] texts, String replacement); + static native String replaceFirstDirect(long handle, long address, int length, String replacement); + static native String replaceAllDirect(long handle, long address, int length, String replacement); + static native String[] replaceAllDirectBulk(long handle, long[] addresses, int[] lengths, String replacement); + + // ========== Utility Methods ========== + + static native String quoteMeta(String text); + static native int[] getProgramFanout(long handle); + static native long getProgramSize(long handle); +} +``` + +### 2. JniAdapter Interface (Package-Private) + +```java +package com.axonops.libre2.jni; + +/** + * Adapter interface for RE2 JNI operations. + * Enables mocking for unit tests while maintaining production performance. + * + *
<p>
Package-private: Not part of public API. Used internally by Pattern/Matcher/RE2. + */ +interface JniAdapter { + // Pattern lifecycle + long compile(String pattern, boolean caseSensitive); + void freePattern(long handle); + boolean patternOk(long handle); + String getError(); + String getPattern(long handle); + int numCapturingGroups(long handle); + long patternMemory(long handle); + + // Matching operations + boolean fullMatch(long handle, String text); + boolean partialMatch(long handle, String text); + boolean fullMatchDirect(long handle, long address, int length); + boolean partialMatchDirect(long handle, long address, int length); + + // Bulk operations + boolean[] fullMatchBulk(long handle, String[] texts); + boolean[] partialMatchBulk(long handle, String[] texts); + boolean[] fullMatchDirectBulk(long handle, long[] addresses, int[] lengths); + boolean[] partialMatchDirectBulk(long handle, long[] addresses, int[] lengths); + + // Capture groups + String[] extractGroups(long handle, String text); + String[][] extractGroupsBulk(long handle, String[] texts); + String[] extractGroupsDirect(long handle, long address, int length); + String[][] extractGroupsDirectBulk(long handle, long[] addresses, int[] lengths); + String[][] findAllMatches(long handle, String text); + String[][] findAllMatchesDirect(long handle, long address, int length); + String[] getNamedGroups(long handle); + + // Replace operations + String replaceFirst(long handle, String text, String replacement); + String replaceAll(long handle, String text, String replacement); + String[] replaceAllBulk(long handle, String[] texts, String replacement); + String replaceFirstDirect(long handle, long address, int length, String replacement); + String replaceAllDirect(long handle, long address, int length, String replacement); + String[] replaceAllDirectBulk(long handle, long[] addresses, int[] lengths, String replacement); + + // Utility methods + String quoteMeta(String text); + int[] getProgramFanout(long 
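handle); + long getProgramSize(long handle); +} +``` + +### 3. DirectJniAdapter (Package-Private, Same Package) + +> **Note:** Section 4 places `Pattern` in `com.axonops.libre2.api`, which cannot import package-private types from `com.axonops.libre2.jni`. For that code to compile, `JniAdapter`, `DirectJniAdapter`, and `DirectJniAdapter.INSTANCE` must be `public`; only `RE2NativeJNI` can stay package-private. + +```java +package com.axonops.libre2.jni; + +/** + * Production JNI adapter - delegates to package-private RE2NativeJNI. + * + *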

<p>Singleton instance used by all Pattern/Matcher/RE2 instances in production. + * Tests can inject a mock JniAdapter instead. + * + * <p>

Package-private: Not part of public API. Accessed via Pattern injection. + */ +final class DirectJniAdapter implements JniAdapter { + + /** + * Singleton instance - used in production. + * Package-private so Pattern can access it. + */ + static final DirectJniAdapter INSTANCE = new DirectJniAdapter(); + + private DirectJniAdapter() { + // Private constructor - singleton pattern + } + + // ========== Pattern Lifecycle ========== + + @Override + public long compile(String pattern, boolean caseSensitive) { + return RE2NativeJNI.compile(pattern, caseSensitive); // ✅ Same package - accessible + } + + @Override + public void freePattern(long handle) { + RE2NativeJNI.freePattern(handle); // ✅ Same package - accessible + } + + @Override + public boolean patternOk(long handle) { + return RE2NativeJNI.patternOk(handle); // ✅ Same package - accessible + } + + @Override + public String getError() { + return RE2NativeJNI.getError(); // ✅ Same package - accessible + } + + // ... delegate all 29 methods to RE2NativeJNI + + // All calls work because DirectJniAdapter is in same package as RE2NativeJNI +} +``` + +### 4. Pattern Uses JniAdapter (Different Package) + +```java +package com.axonops.libre2.api; + +import com.axonops.libre2.jni.JniAdapter; +import com.axonops.libre2.jni.DirectJniAdapter; + +public final class Pattern implements AutoCloseable { + + // Package-private JniAdapter field + final JniAdapter jni; + + private final long nativeHandle; + private final String pattern; + // ... other fields + + // PRIVATE constructor + private Pattern(JniAdapter jni, String pattern, boolean caseSensitive, PatternCache cache) { + this.jni = jni; + this.pattern = pattern; + this.cache = cache; + + // Compile using adapter + long handle = jni.compile(pattern, caseSensitive); // ✅ Goes through interface + + // ❌ CANNOT do this - RE2NativeJNI is package-private in different package: + // long handle = RE2NativeJNI.compile(pattern, caseSensitive); // COMPILE ERROR! 
+ + if (handle == 0 || !jni.patternOk(handle)) { + String error = jni.getError(); + throw new PatternCompilationException("Failed to compile pattern: " + error); + } + this.nativeHandle = handle; + // ... + } + + // PUBLIC API - uses production singleton adapter + public static Pattern compile(String pattern) { + return compile(pattern, true); + } + + public static Pattern compile(String pattern, boolean caseSensitive) { + return compile(pattern, caseSensitive, DirectJniAdapter.INSTANCE); + } + + // PACKAGE-PRIVATE - tests inject mock adapter + static Pattern compile(String pattern, boolean caseSensitive, JniAdapter jni) { + PatternCache cache = getGlobalCache(); + return new Pattern(jni, pattern, caseSensitive, cache); + } + + // All operations use this.jni (enforced at compile-time) + public boolean match(String input) { + checkNotClosed(); + Objects.requireNonNull(input, "input cannot be null"); + + long startNanos = System.nanoTime(); + boolean result = jni.fullMatch(nativeHandle, input); // ✅ Must use adapter + // boolean result = RE2NativeJNI.fullMatch(...); // ❌ COMPILE ERROR! + long durationNanos = System.nanoTime() - startNanos; + + // ... metrics + return result; + } +} +``` + +--- + +## Benefits of Package-Private RE2NativeJNI + +### ✅ 1. Compile-Time Enforcement + +**Before (public RE2NativeJNI):** +```java +// Pattern.java - could accidentally bypass abstraction +boolean result = RE2NativeJNI.fullMatch(handle, text); // ✅ Compiles (bad design) +``` + +**After (package-private RE2NativeJNI):** +```java +// Pattern.java - MUST use adapter +boolean result = RE2NativeJNI.fullMatch(handle, text); // ❌ COMPILE ERROR! +boolean result = jni.fullMatch(handle, text); // ✅ Must use interface +``` + +### ✅ 2. 
Clear Separation of Concerns + +``` +com.axonops.libre2.jni/ (JNI layer - isolated) +├── RE2NativeJNI.java (package-private native methods) +├── JniAdapter.java (package-private interface) +└── DirectJniAdapter.java (package-private singleton) + +com.axonops.libre2.api/ (Public API - uses interface) +├── Pattern.java (uses JniAdapter, cannot access RE2NativeJNI) +├── Matcher.java (uses JniAdapter, cannot access RE2NativeJNI) +└── RE2.java (uses JniAdapter, cannot access RE2NativeJNI) +``` + +### ✅ 3. Impossible to Bypass Abstraction + +**Users cannot do this:** +```java +// This would compile if RE2NativeJNI were public +long handle = RE2NativeJNI.compile("test", true); // ❌ COMPILE ERROR - package-private +RE2NativeJNI.freePattern(handle); // ❌ COMPILE ERROR - package-private +``` + +**Must use public API:** +```java +Pattern pattern = Pattern.compile("test"); // ✅ Only way +``` + +### ✅ 4. Tests Still Work (Same Package) + +```java +package com.axonops.libre2.api; // Different package from RE2NativeJNI + +import com.axonops.libre2.jni.JniAdapter; +import org.mockito.Mockito; + +class PatternUnitTest { + @Test + void testMatch() { + JniAdapter mock = mock(JniAdapter.class); + when(mock.compile("test", true)).thenReturn(123L); + when(mock.fullMatch(123L, "test")).thenReturn(true); + + Pattern p = Pattern.compile("test", true, mock); // ✅ Package-private method + boolean result = p.match("test"); + + verify(mock).fullMatch(123L, "test"); // ✅ Can verify interface calls + } +} +``` + +--- + +## Implementation Changes + +### Change 1: RE2NativeJNI Visibility + +```java +// BEFORE (current): +public final class RE2NativeJNI { + public static native long compile(String pattern, boolean caseSensitive); + // ... +} + +// AFTER (package-private): +final class RE2NativeJNI { + static native long compile(String pattern, boolean caseSensitive); + // ... 
all methods package-private +} +``` + +### Change 2: Pattern/Matcher/RE2 MUST Use JniAdapter + +```java +// BEFORE: +boolean result = RE2NativeJNI.fullMatch(handle, text); + +// AFTER: +boolean result = jni.fullMatch(handle, text); +``` + +**Compiler enforces this change** - any direct RE2NativeJNI calls in Pattern/Matcher/RE2 will fail to compile. + +--- + +## Summary + +**Your suggestion is perfect!** Making RE2NativeJNI package-private: + +1. ✅ **Works with native methods** - Visibility doesn't affect JNI name mangling +2. ✅ **Enforces abstraction** - Compile error if bypassed +3. ✅ **Zero runtime cost** - Same performance as direct calls +4. ✅ **Enables testing** - Mock JniAdapter interface +5. ✅ **Clean architecture** - JNI layer isolated in one package + +**Next Steps:** +1. Implement package-private RE2NativeJNI +2. Create JniAdapter interface and DirectJniAdapter +3. Update Pattern/Matcher/RE2 to use JniAdapter field +4. Verify all existing tests pass (integration tests unchanged) +5. Add new unit tests with mocked JniAdapter + +**Approved for implementation?** diff --git a/JNI_OPTIMIZATION_CONCLUSION.md b/JNI_OPTIMIZATION_CONCLUSION.md deleted file mode 100644 index 7ae24f2..0000000 --- a/JNI_OPTIMIZATION_CONCLUSION.md +++ /dev/null @@ -1,237 +0,0 @@ -# JNI Optimization Attempt - Conclusion - -**Date:** 2025-11-25 -**Branch:** feature/jni-optimization (deleted) -**Outcome:** ❌ **Reverted** - No performance gain, Unicode encoding issues -**Token Used:** 450k / 1M -**Decision:** Keep existing GetStringUTFChars approach - ---- - -## Executive Summary - -Attempted to optimize JNI String transfer based on RocksDB research (GetByteArrayRegion vs GetStringUTFChars). **Optimization failed** due to: - -1. **No Performance Improvement:** String→byte[] conversion overhead canceled out JNI gains -2. **Modified UTF-8 Mismatch:** Pattern compilation uses Modified UTF-8, inputs use Standard UTF-8 -3. 
**Edge Case Bugs:** Empty strings, null handling, Unicode characters all broke - -**Baseline:** 2.76ms for 10k strings, 3.6M matches/sec -**Optimized:** 2.79ms for 10k strings, 3.5M matches/sec (SLOWER) - -**RocksDB's findings don't apply when you START with Java Strings** - the conversion cost dominates. - ---- - -## What Was Attempted - -### Implementation - -**Added 10 new JNI methods:** -- fullMatchBytes, partialMatchBytes -- fullMatchBulkBytes, partialMatchBulkBytes -- extractGroupsBytes, extractGroupsBulkBytes -- findAllMatchesBytes -- replaceFirstBytes, replaceAllBytes, replaceAllBulkBytes - -**Modified 9 Pattern.java methods** to convert String→byte[] before JNI call - -**C++ Implementation:** -- Used GetByteArrayRegion (recommended by RocksDB) -- Stack allocation for small strings (<8KB) -- Heap allocation for large strings - ---- - -## Why It Failed - -### 1. String→byte[] Conversion Overhead - -**RocksDB Context:** -- They START with byte[] data (database keys/values) -- Direct byte[] → JNI is fast -- String → byte[] → JNI is slow (extra conversion) - -**Our Context:** -- We START with Java Strings (user regex patterns) -- Must convert String → byte[] in Java -- Conversion cost = 100-200ns per string -- JNI improvement = 300ns (GetStringUTFChars) → 100ns (GetByteArrayRegion) = 200ns saved -- **Net: Lost 100-200ns, gained 200ns = WASH** - -### 2. Modified UTF-8 vs Standard UTF-8 Mismatch - -**Problem:** -```java -// Pattern compilation -Pattern.compile("😀") // Uses GetStringUTFChars → Modified UTF-8 in RE2 - -// Input matching -pattern.matchAll(["test😀test"]) // Uses getBytes(UTF_8) → Standard UTF-8 -``` - -**Result:** Pattern and input use different UTF-8 encodings - -**Impact:** -- ASCII: Works (same in both encodings) -- BMP Unicode (Chinese, Arabic): **Should work but failed** (unknown cause) -- Emoji/Supplementary (U+10000+): Definitely broken (6 bytes vs 4 bytes) - -### 3. 
StringPiece Lifetime Bugs - -**Issue:** StringPiece points to temporary buffer: -```cpp -jbyte stackBuf[8192]; -env->GetByteArrayRegion(bytes, 0, length, stackBuf); -re2::StringPiece input((const char*)stackBuf, length); -// ... Match happens ... -// stackBuf goes out of scope -// Later: groups[i].data() points to freed memory! -``` - -**Fix Required:** Immediately convert to std::string after Match() - -**3 rounds of bugs:** -- Round 1: StringPiece lifetime in findAllMatchesBytes -- Round 2: StringPiece lifetime in extractGroupsBytes -- Round 3: Empty string handling (groups[0] == null) -- Round 4: Null elements in String[] arrays -- Round 5: Unicode encoding failures - -**Too many edge cases** for marginal/zero gain. - ---- - -## Performance Measurements - -### Baseline (GetStringUTFChars) -``` -Filter Performance (10,000 strings): -- Duration: 2.76 ms -- Throughput: 3,628,117 matches/sec -- Per-match: 0.276 μs - -Map Filter (10,000 entries): -- Duration: 4.15 ms - -Bulk vs Individual: -- Bulk: 2.32 ms -- Individual: 2.57 ms -- Speedup: 1.1x -``` - -### Optimized (GetByteArrayRegion + String→byte[]) -``` -Filter Performance (10,000 strings): -- Duration: 2.79 ms (SLOWER!) -- Throughput: 3,585,086 matches/sec -- Per-match: 0.279 μs - -Map Filter (10,000 entries): -- Duration: 4.51 ms (SLOWER!) 
-``` - -**Net Impact:** -3% performance (worse than baseline) - ---- - -## Lessons Learned - -### When RocksDB Optimizations Apply - -✅ **Good for:** -- Native data sources (byte[] from databases, files, network) -- Pre-existing byte[] arrays -- DirectByteBuffer (zero-copy) - -❌ **Bad for:** -- Java String inputs (must convert first) -- Mixed UTF-8 encodings (Modified vs Standard) -- Simple/short strings (conversion overhead dominates) - -### What Actually Works in Our Library - -✅ **DirectByteBuffer zero-copy paths** - Already implemented, optimal -✅ **Bulk APIs** - Single JNI crossing for many operations -✅ **Pattern caching** - Avoid recompilation -✅ **GetStringUTFChars** - JVM optimizes this well, handles all Unicode correctly - ---- - -## Technical Details - -### Modified UTF-8 (GetStringUTFChars) -- NULL: 0xC0 0x80 (not 0x00) -- Supplementary chars (U+10000+): 6 bytes (surrogate pair encoding) -- Used by JNI for all String operations - -### Standard UTF-8 (getBytes(UTF_8)) -- NULL: 0x00 -- Supplementary chars: 4 bytes (direct encoding) -- Used by Files, Network, StandardCharsets - -**Pattern compiled with one, input with the other = mismatch** - ---- - -## Recommendations - -### For libre2-java - -1. **Keep existing implementation** - GetStringUTFChars throughout -2. **Focus on bulk APIs** - This is where real gains are (already have it) -3. **Keep DirectByteBuffer paths** - Already optimal for off-heap data -4. **Don't add byte[] variants** - No benefit, adds complexity - -### For Future Optimization Attempts - -1. **Measure first** - Establish baseline before coding -2. **Consider the full path** - String→byte[] conversion isn't free -3. **Test Unicode** - GetStringUTFChars handles all cases correctly -4. 
**Start small** - One method, measure, then expand if beneficial - ---- - -## What to Keep - -**Analysis Documents (valuable):** -- ✅ JNI_OPTIMIZATION_ANALYSIS.md - Good reference for future -- ✅ BASELINE_PERFORMANCE.md - Performance baseline documented - -**Code:** -- ❌ byte[] JNI methods - Removed (no benefit) -- ❌ Native implementations - Removed (bugs, no gain) -- ✅ Existing GetStringUTFChars code - Keep as-is - ---- - -## Final Status - -**Branch:** development -**Tests:** 459/459 passing ✅ -**Build:** SUCCESS ✅ -**Performance:** Baseline maintained (3.6M matches/sec) - -**Feature branch deleted:** feature/jni-optimization -**Time invested:** ~4 hours -**Outcome:** Valuable learning, correct decision to revert - ---- - -## Next Steps - -**Library is production-ready** with current implementation: -- ✅ 459 tests passing -- ✅ 55 metrics instrumented -- ✅ All phases (0-5) complete -- ✅ Zero-copy DirectByteBuffer support -- ✅ Comprehensive bulk APIs - -**Potential areas for improvement:** -1. Documentation (QUICKSTART.md, examples) -2. Version 1.0.0 release preparation -3. Additional integration tests -4. Performance profiling in real Cassandra workload -5. JMH micro-benchmarks for specific operations - -**Recommendation:** Focus on documentation and release preparation, not micro-optimizations. 
diff --git a/METRIC_RENAMING_MAP.txt b/METRIC_RENAMING_MAP.txt deleted file mode 100644 index e180501..0000000 --- a/METRIC_RENAMING_MAP.txt +++ /dev/null @@ -1,33 +0,0 @@ -# Systematic Metric Renaming - -## Counters (add .total.count suffix) -patterns.compiled → patterns.compiled.total.count -patterns.cache_hits → patterns.cache.hits.total.count -patterns.cache_misses → patterns.cache.misses.total.count -patterns.invalid_recompiled → patterns.invalid.recompiled.total.count -cache.evictions_lru → cache.evictions.lru.total.count -cache.evictions_idle → cache.evictions.idle.total.count -cache.evictions_deferred → cache.evictions.deferred.total.count -matching.operations → matching.operations.total.count -errors.compilation_failed → errors.compilation.failed.total.count -errors.native_library → errors.native_library.total.count -errors.resource_exhausted → errors.resource.exhausted.total.count - -## Timers (rename to .latency) -patterns.compilation_time → patterns.compilation.latency -matching.full_match → matching.full_match.latency -matching.partial_match → matching.partial_match.latency - -## Gauges - Current State (add .current suffix) -cache.size → cache.patterns.current.count -cache.native_memory_bytes → cache.native_memory.current.bytes -cache.native_memory_peak_bytes → cache.native_memory.peak.bytes -resources.patterns_active → resources.patterns.active.current.count -resources.matchers_active → resources.matchers.active.current.count -cache.deferred.patterns.count → cache.deferred.patterns.current.count -cache.deferred.native_memory.bytes → cache.deferred.native_memory.current.bytes -cache.deferred.native_memory.peak.bytes → cache.deferred.native_memory.peak.bytes - -## Gauges - Cumulative (should be Counters!) 
-resources.patterns_freed → resources.patterns.freed.total.count (CHANGE TO COUNTER) -resources.matchers_freed → resources.matchers.freed.total.count (CHANGE TO COUNTER) diff --git a/MOCKABILITY_ASSESSMENT.md b/MOCKABILITY_ASSESSMENT.md new file mode 100644 index 0000000..53b22df --- /dev/null +++ b/MOCKABILITY_ASSESSMENT.md @@ -0,0 +1,262 @@ +# Mockability and Unit Test Strategy Assessment + +**Date:** 2025-11-26 +**Purpose:** Corrected analysis of what can be unit tested without native library + +--- + +## The Static Method Problem + +**All JNI methods are `public static native`:** +```java +public class RE2NativeJNI { + public static native long compile(String pattern, boolean caseSensitive); + public static native void freePattern(long handle); + public static native boolean fullMatch(long handle, String text); + // ... 26 more static native methods +} +``` + +**Implications:** +- Cannot use traditional interface-based dependency injection +- Mocking static methods requires: + - **Mockito 3.4+ with mockito-inline** (can mock statics in JUnit 5) + - **PowerMock** (deprecated, poor JUnit 5 support) +- Most Pattern/Matcher/RE2 logic IS the native call - minimal business logic to test + +--- + +## What's Actually Unit-Testable? + +### Files That DON'T Call Native Code (16 files) + +**Pure Java, No Native Dependencies:** + +#### 1. Configuration & Builders ✅ +- `RE2Config.java` - Builder pattern, validation +- `MetricNames.java` - String constants +- **Already tested:** `ConfigurationTest.java` (14 tests) ✅ + +#### 2. Metrics Abstractions ✅ +- `RE2MetricsRegistry.java` - Interface +- `NoOpMetricsRegistry.java` - No-op implementation +- `DropwizardMetricsAdapter.java` - Adapter (can mock MetricRegistry) +- **Already tested:** `TimerHistogramTest.java` (4 tests) ✅ +- **Testable:** Adapter logic without Dropwizard + +#### 3. 
Exception Classes ✅ +- `RE2Exception.java` (sealed base) +- `PatternCompilationException.java` +- `NativeLibraryException.java` +- `ResourceException.java` +- `RE2TimeoutException.java` +- **Already tested:** Implicitly in integration tests +- **Testable:** Exception hierarchies, messages, causes + +#### 4. Value Objects ✅ +- `MatchResult.java` - Holds capture groups, implements AutoCloseable +- `CacheStatistics.java` - Immutable stats record +- **Already tested:** `CaptureGroupsTest.java` (31 tests) ✅ +- **Testable:** MatchResult lifecycle, closed state checking + +#### 5. Utilities ✅ +- `PatternHasher.java` - Pattern hash computation +- `ResourceTracker.java` - Resource tracking logic +- **Testable:** Hash consistency, resource accounting + +#### 6. Cache Logic (Partially Testable) +- `PatternCache.java` - Cache management +- `IdleEvictionTask.java` - Background eviction +- **Issue:** Cache stores compiled Patterns (which need native library) +- **Mockable:** LRU eviction logic, idle timeout calculation, statistics +- **Already tested:** `CacheTest.java`, `IdleEvictionTest.java` (integration tests) + +--- + +## What REQUIRES Native Library? + +### Files That Call RE2NativeJNI (5 files) + +1. **Pattern.java** - Wraps native pattern, all operations call JNI +2. **Matcher.java** - Iterator over Pattern operations +3. **RE2.java** - Static convenience methods (all delegate to Pattern) +4. **RE2LibraryLoader.java** - Loads native library +5. **RE2NativeJNI.java** - JNI method declarations + +**Why integration tests are necessary:** +- Pattern compilation, matching, replacement = native operations +- Cannot mock without significant refactoring +- Business logic is minimal (metrics, validation, resource tracking) + +--- + +## Revised Unit vs Integration Test Strategy + +### True Unit Tests (No Native Library Required) + +**Current Status:** 4 test classes qualify as true unit tests + +1. 
✅ **ConfigurationTest.java** (14 tests) + - Tests RE2Config builder + - No Pattern creation, no native calls + +2. ✅ **TimerHistogramTest.java** (4 tests) + - Tests pure Java histogram logic + - No native dependencies + +3. ✅ **BulkMatchingTypeSafetyTest.java** (13 tests) + - Tests type safety, null handling + - **WAIT:** Does this create Patterns? Need to verify + +4. ✅ **RE2MetricsConfigTest.java** (6 tests) [in libre2-dropwizard] + - Tests config factory methods + - No Pattern creation + +**Candidates for Unit Testing (with refactoring):** + +1. **Exception hierarchy tests** - Create new test class +2. **PatternHasher tests** - Create new test class +3. **ResourceTracker tests** - Create new test class (or mock Pattern) +4. **MatchResult lifecycle tests** - Already covered in CaptureGroupsTest +5. **Cache eviction logic** - Requires mocking Pattern creation + +### Integration Tests (Require Native Library) + +**All tests that:** +- Compile patterns (Pattern.compile()) +- Match text (Pattern.match(), find(), etc.) 
+- Use JNI layer (RE2NativeJNITest) +- Test metrics with real operations +- Test cache with real Patterns + +**Count:** ~370 tests (vast majority) + +--- + +## Mocking Strategy Assessment + +### Option 1: Mock Static Methods with Mockito-Inline ❌ + +**Approach:** +```java +@ExtendWith(MockitoExtension.class) +class PatternUnitTest { + @Test + void testSomething() { + try (MockedStatic mocked = mockStatic(RE2NativeJNI.class)) { + mocked.when(() -> RE2NativeJNI.compile("test", true)).thenReturn(12345L); + // Test Pattern logic + } + } +} +``` + +**Problems:** +- Requires mockito-inline (adds dependency) +- Verbose setup for every test +- Most Pattern logic IS the native call +- Little business logic to test independently + +**Verdict:** Not worth the complexity for minimal gain + +### Option 2: Introduce Abstraction Layer ❌ + +**Approach:** +```java +interface JniAdapter { + long compile(String pattern, boolean caseSensitive); + void freePattern(long handle); + // ... 27 more methods +} + +class DirectJniAdapter implements JniAdapter { + public long compile(String pattern, boolean caseSensitive) { + return RE2NativeJNI.compile(pattern, caseSensitive); + } + // ... +} + +// Pattern takes JniAdapter in constructor +class Pattern { + private final JniAdapter jni; + Pattern(JniAdapter jni, ...) 
{ this.jni = jni; } +} +``` + +**Problems:** +- Invasive refactoring (29 methods to wrap) +- Breaks existing API (Pattern constructor changes) +- Adds complexity for every caller +- Testing benefit is minimal + +**Verdict:** Too invasive, not worth it + +### Option 3: Focus on Pure Java Components ✅ + +**Approach:** +- Unit test what doesn't need mocking (Config, Metrics, Exceptions, Utilities) +- Integration test everything that touches native code +- Accept that most tests require native library + +**Benefits:** +- Clean separation of concerns +- No mocking complexity +- Integration tests already comprehensive (459 tests) +- Can still add unit tests for pure Java components + +**Verdict:** This is the right approach ✅ + +--- + +## Recommendations + +### Phase 3: Unit Test Foundation + +**DO:** +1. ✅ Create unit tests for pure Java components: + - Exception hierarchy tests + - PatternHasher tests (hash consistency) + - ResourceTracker tests (if mockable) + - DropwizardMetricsAdapter tests (mock MetricRegistry) + +2. ✅ Separate existing unit tests from integration tests: + - Move ConfigurationTest to src/test/java (unit) + - Move TimerHistogramTest to src/test/java (unit) + - Verify BulkMatchingTypeSafetyTest doesn't create Patterns + +3. ✅ Document what's unit vs integration testable + +**DON'T:** +- ❌ Introduce JniAdapter abstraction (too invasive) +- ❌ Mock static RE2NativeJNI methods (too complex) +- ❌ Try to unit test Pattern/Matcher/RE2 without native library + +### The Reality + +**Most of this library IS integration testing by nature:** +- Core functionality is native regex matching +- Java layer is thin wrapper with metrics/caching +- Integration tests are comprehensive (459 tests) +- Pure unit tests have limited scope (~20-30 tests max) + +**This is OK!** The library's value IS the native integration. + +--- + +## Updated Test Classification + +| Type | Count | Mockable? 
| Strategy | +|------|-------|-----------|----------| +| **Pure Unit Tests** | 4-6 | ✅ No mocking needed | Keep in src/test/java | +| **Integration Tests** | ~370 | ❌ Require native lib | Move to src/integration-test/java | +| **Performance Tests** | 2 | ❌ Require native lib | Move to perf-test module | +| **Stress Tests** | 4 | ❌ Require native lib | Move to perf-test module | + +--- + +**Conclusion:** Original analysis was incomplete. Static native methods are not practically mockable. Focus on: +1. Pure Java component unit tests +2. Comprehensive integration tests (already have 370+) +3. Clear separation of test types + +**End of Corrected Assessment** diff --git a/PHASE1_COVERAGE_ANALYSIS.md b/PHASE1_COVERAGE_ANALYSIS.md deleted file mode 100644 index 04d4493..0000000 --- a/PHASE1_COVERAGE_ANALYSIS.md +++ /dev/null @@ -1,192 +0,0 @@ -# Phase 1 Bulk Matching - Coverage Analysis - -**Analyzed:** 2025-11-25 -**Token:** 502k / 1M - ---- - -## Current Coverage - -### Core Bulk Matching Methods - -**1. matchAll Variants:** -- ✅ `matchAll(Collection)` → delegates to matchAll(String[]) -- ✅ `matchAll(String[])` → **INSTRUMENTED** with MATCHING_BULK_* metrics -- ✅ `matchAll(long[], int[])` → **INSTRUMENTED** with MATCHING_BULK_ZERO_COPY_* metrics -- ❌ **MISSING:** `matchAll(ByteBuffer[])` - should accept array of ByteBuffers - -### Filter Operations - -**2. filter/filterNot:** -- ✅ `filter(Collection)` → delegates to matchAll(String[]) -- ✅ `filterNot(Collection)` → delegates to matchAll(String[]) -- ❌ **MISSING:** Zero-copy variants (ByteBuffer[], address/length arrays) - -### In-Place Operations - -**3. retainMatches/removeMatches:** -- ✅ `retainMatches(Collection)` → delegates to matchAll(String[]) -- ✅ `removeMatches(Collection)` → delegates to matchAll(String[]) -- ❌ **MISSING:** Zero-copy variants - -### Map Operations - -**4. 
filterByKey/filterByValue:** -- ✅ `filterByKey(Map)` → delegates to matchAll(String[]) -- ✅ `filterByValue(Map)` → delegates to matchAll(String[]) -- ✅ `filterNotByKey(Map)` → delegates to matchAll(String[]) -- ✅ `filterNotByValue(Map)` → delegates to matchAll(String[]) -- ❌ **MISSING:** Zero-copy variants (for Map etc.) - -**5. retainMatchesByKey/Value, removeMatchesByKey/Value:** -- ✅ All 4 methods delegate to matchAll(String[]) -- ❌ **MISSING:** Zero-copy variants - ---- - -## Metrics Instrumentation Status - -### ✅ Complete and Correct - -**matchAll(String[]):** -```java -// Global metrics -metrics.incrementCounter(MATCHING_OPERATIONS, inputs.length); -metrics.recordTimer(MATCHING_LATENCY, perItemNanos); -metrics.recordTimer(MATCHING_FULL_MATCH_LATENCY, perItemNanos); - -// Specific bulk metrics -metrics.incrementCounter(MATCHING_BULK_OPERATIONS); -metrics.incrementCounter(MATCHING_BULK_ITEMS, inputs.length); -metrics.recordTimer(MATCHING_BULK_LATENCY, perItemNanos); -``` -**Status:** ✅ CORRECT - records global + specific, per-item latency - -**matchAll(long[], int[]):** -```java -// Global metrics -metrics.incrementCounter(MATCHING_OPERATIONS, addresses.length); -metrics.recordTimer(MATCHING_LATENCY, perItemNanos); -metrics.recordTimer(MATCHING_FULL_MATCH_LATENCY, perItemNanos); - -// Specific bulk zero-copy metrics -metrics.incrementCounter(MATCHING_BULK_ZERO_COPY_OPERATIONS); -metrics.incrementCounter(MATCHING_BULK_ITEMS, addresses.length); -metrics.recordTimer(MATCHING_BULK_ZERO_COPY_LATENCY, perItemNanos); -``` -**Status:** ✅ CORRECT - records global + specific zero-copy, per-item latency - -**All filter/map/retain/remove methods:** -- Delegate to matchAll(String[]) -- **Status:** ✅ CORRECT - metrics flow through automatically - ---- - -## Missing Functionality - -### Critical Gaps - -**1. 
No ByteBuffer[] Support** -**Missing:** -```java -boolean[] matchAll(ByteBuffer[] buffers) -``` -**Use case:** Cassandra returns ByteBuffer[] from multi-column queries -**Impact:** Can't do bulk zero-copy on array of ByteBuffers - -**2. No Zero-Copy Filter Operations** -**Missing:** -```java -List filter(ByteBuffer[] inputs) // Filter matching buffers -List filterNot(ByteBuffer[] inputs) -``` -**Use case:** Filter array of ByteBuffers from Cassandra -**Impact:** Must convert to String[], losing zero-copy benefit - -**3. No findAll Bulk for Zero-Copy** -**Current:** Only have `findAll(long[], int[])` which is partial match -**Missing:** -```java -// Note: We DO have matchAll(long[], int[]) for full match -// And we have findAll(long[], int[]) for partial match on bulk -// So actually this might be complete? -``` - -Let me verify findAll coverage... - ---- - -## Verification Needed - -### findAll Coverage Check - -**Current methods:** -- ✅ `findAll(long[], int[])` - bulk partial match with zero-copy - -**Question:** Do we need a String[] variant? -```java -boolean[] findAll(String[] inputs) // Partial match on each string -``` - -Looking at Pattern methods, I don't see a String[] variant of findAll. Only: -- matchAll(String[]) - full match on array ✅ -- findAll(long[], int[]) - partial match on address array ✅ - -**Missing String variant:** -```java -boolean[] findAll(String[] inputs) // Partial match bulk -``` - -This would use `partialMatchBulk` native method which exists! - ---- - -## Assessment - -### What's Correct ✅ - -1. **Core bulk matching:** matchAll with String[], Collection, address arrays - ✅ -2. **Metrics instrumentation:** All use Global + Specific pattern - ✅ -3. **Per-item latency:** Consistent across all bulk operations - ✅ -4. **Delegation:** All filter/map/retain methods delegate correctly - ✅ - -### What's Missing ❌ - -**High Priority:** -1. **ByteBuffer[] matchAll** - bulk with array of ByteBuffers -2. 
**findAll(String[])** - partial match bulk (native method exists, just need Java wrapper) -3. **ByteBuffer filter operations** - filter(ByteBuffer[]), filterNot(ByteBuffer[]) - -**Medium Priority:** -4. Zero-copy variants of filter/map operations (lower priority - delegation works) - -**Low Priority:** -5. Map variants (edge case, probably not needed) - ---- - -## Recommendation - -**Add these 3 methods to complete Phase 1:** - -1. **`boolean[] findAll(String[] inputs)`** - Easy, native method exists - ```java - boolean[] results = RE2NativeJNI.partialMatchBulk(nativeHandle, inputs); - // Add MATCHING_BULK_* metrics - ``` - -2. **`boolean[] matchAll(ByteBuffer[] buffers)`** - Important for Cassandra - ```java - // Extract addresses from buffers, call matchAll(long[], int[]) - ``` - -3. **`boolean[] findAll(ByteBuffer[] buffers)`** - Consistency - ```java - // Extract addresses from buffers, call findAll(long[], int[]) - ``` - -**Estimated:** ~30k tokens to add these + tests - ---- - -## Shall I add these now (during native build wait)? diff --git a/PHASE1_TEST_INVENTORY.md b/PHASE1_TEST_INVENTORY.md new file mode 100644 index 0000000..4132f03 --- /dev/null +++ b/PHASE1_TEST_INVENTORY.md @@ -0,0 +1,368 @@ +# Phase 1: Test Inventory and Analysis Report + +**Date:** 2025-11-26 +**Branch:** testconsolidation +**Purpose:** Comprehensive inventory of all Maven modules, test directories, test files, and dependencies + +--- + +## Executive Summary + +**Total Test Classes:** 30 +**Total Test Methods:** 459 +**Maven Modules:** 2 (libre2-core, libre2-dropwizard) +**Lines of Test Code:** ~5,761 lines + +### Current State +- All tests located in standard `src/test/java` directories +- No integration test separation (everything runs via Surefire) +- Performance/stress tests mixed with unit tests +- Dropwizard dependencies present in core module (provided scope, optional) +- No code coverage tooling configured +- No static analysis (Checkstyle, etc.) 
configured + +--- + +## Module Structure + +### 1. libre2-parent (Root POM) +- **Type:** Parent aggregator POM +- **Packaging:** pom +- **Version:** 1.0.0 +- **Modules:** + - libre2-core + - libre2-dropwizard +- **Test Framework:** JUnit 5.10.0 +- **Test Assertion:** AssertJ 3.24.2 +- **Build Plugins:** + - maven-compiler-plugin 3.11.0 (Java 17) + - maven-surefire-plugin 3.1.2 (unit tests only) + - No Failsafe plugin (no integration test separation) + - No JaCoCo (no coverage) + - No Checkstyle (no static analysis) + +### 2. libre2-core Module +- **Type:** Core library +- **Packaging:** jar +- **Location:** `libre2-core/` +- **Purpose:** Core RE2 bindings, Pattern/Matcher API, caching, metrics +- **Dropwizard Usage:** + - `metrics-core` (provided scope, optional) in main code + - Used by `DropwizardMetricsAdapter.java` + - 13 Dropwizard imports in test code +- **Test Source Directory:** `libre2-core/src/test/java` +- **Test Classes:** 27 +- **Test Methods:** 441 + +#### Test Packages in libre2-core: +``` +com.axonops.libre2 +├── api/ # 7 test classes, 148 tests +│ ├── BulkMatchingPerformanceTest.java (3 tests) [PERFORMANCE] +│ ├── BulkMatchingTest.java (47 tests) [INTEGRATION] +│ ├── BulkMatchingTypeSafetyTest.java (13 tests) [UNIT] +│ ├── ByteBufferApiTest.java (13 tests) [INTEGRATION] +│ ├── CaptureGroupsTest.java (31 tests) [INTEGRATION] +│ ├── Phase1ExtensionsTest.java (15 tests) [INTEGRATION] +│ └── ReplaceOperationsTest.java (26 tests) [INTEGRATION] +├── cache/ # 14 test classes, 98 tests +│ ├── CacheFullInUseTest.java (6 tests) [INTEGRATION] +│ ├── CachePerformanceTest.java (4 tests) [PERFORMANCE] +│ ├── CacheTest.java (12 tests) [INTEGRATION] +│ ├── ConcurrencyTest.java (7 tests) [STRESS] +│ ├── ConcurrentCleanupTest.java (4 tests) [STRESS] +│ ├── ConfigurationTest.java (14 tests) [UNIT] +│ ├── DeferredCleanupTimingTest.java (3 tests) [INTEGRATION] +│ ├── EvictionEdgeCasesTest.java (6 tests) [INTEGRATION] +│ ├── EvictionWhileInUseTest.java (6 tests) 
[STRESS] +│ ├── IdleEvictionTest.java (5 tests) [INTEGRATION] +│ ├── NativeMemoryTrackingTest.java (17 tests) [INTEGRATION] +│ ├── ResourceLimitConfigurationTest.java (5 tests) [INTEGRATION] +│ ├── StressTest.java (4 tests) [STRESS] +│ └── ThreadSafetyTest.java (5 tests) [STRESS] +├── jni/ # 1 test class, 48 tests +│ └── RE2NativeJNITest.java (48 tests) [INTEGRATION] +├── metrics/ # 4 test classes, 27 tests +│ ├── ComprehensiveMetricsTest.java (9 tests) [INTEGRATION] +│ ├── MetricsIntegrationTest.java (9 tests) [INTEGRATION] +│ ├── NativeMemoryMetricsTest.java (5 tests) [INTEGRATION] +│ └── TimerHistogramTest.java (4 tests) [UNIT] +├── test/ # 1 helper class +│ └── TestUtils.java (test helper, not a test) +└── RE2Test.java # 1 test class, 77 tests [INTEGRATION] +``` + +### 3. libre2-dropwizard Module +- **Type:** Dropwizard Metrics integration +- **Packaging:** jar +- **Location:** `libre2-dropwizard/` +- **Purpose:** JMX integration, MetricRegistry wiring for Cassandra/Dropwizard apps +- **Dependencies:** + - libre2-core (compile scope) + - metrics-jmx (compile scope) +- **Test Source Directory:** `libre2-dropwizard/src/test/java` +- **Test Classes:** 3 +- **Test Methods:** 18 + +#### Test Packages in libre2-dropwizard: +``` +com.axonops.libre2.dropwizard/ +├── JmxIntegrationTest.java (6 tests) [INTEGRATION] +├── MetricsEndToEndTest.java (6 tests) [INTEGRATION] +└── RE2MetricsConfigTest.java (6 tests) [UNIT] +``` + +--- + +## Test Classification Analysis + +### Test Types Breakdown + +| Test Type | Count | Test Methods | Current Location | Target Location | +|-----------|-------|--------------|------------------|-----------------| +| **Unit Tests** | 4 | ~40 | src/test/java | src/test/java | +| **Integration Tests** | 20 | ~370 | src/test/java | src/integration-test/java | +| **Performance Tests** | 2 | ~7 | src/test/java | perf-test module | +| **Stress Tests** | 4 | ~24 | src/test/java | perf-test module | +| **Helper Classes** | 1 | N/A | src/test/java | 
src/test/java | + +#### Classification Rationale: + +**Unit Tests** (No native code, mockable): +- `BulkMatchingTypeSafetyTest.java` - Type checking, no native calls +- `ConfigurationTest.java` - Configuration validation +- `TimerHistogramTest.java` - Pure Java histogram logic +- `RE2MetricsConfigTest.java` - Config builder validation + +**Integration Tests** (Native code interaction, multi-component): +- All API tests (BulkMatchingTest, CaptureGroupsTest, etc.) - use native RE2 +- Most cache tests (CacheTest, EvictionEdgeCasesTest, etc.) - cache + native +- JNI layer test (RE2NativeJNITest) - direct JNI validation +- Metrics tests (ComprehensiveMetricsTest, etc.) - metrics + native +- Dropwizard tests (JmxIntegrationTest, etc.) - JMX + metrics + native +- RE2Test.java - high-level API validation + +**Performance Tests** (Benchmarking, throughput measurement): +- `BulkMatchingPerformanceTest.java` - Bulk vs individual timing +- `CachePerformanceTest.java` - Cache lookup performance + +**Stress Tests** (High load, concurrency, resource limits): +- `ConcurrencyTest.java` - Concurrent pattern compilation +- `ConcurrentCleanupTest.java` - Concurrent cleanup safety +- `StressTest.java` - 100 threads, 1000 ops/thread +- `EvictionWhileInUseTest.java` - Race condition testing + +--- + +## Dependency Analysis + +### Cross-Module Dependencies + +**libre2-dropwizard → libre2-core:** +- Compile dependency (required) +- Imports: `Pattern`, `PatternCache`, `RE2Config`, `Matcher` +- Test dependency: Uses `Pattern` API for validation + +**libre2-core → Dropwizard Metrics:** +- **Scope:** `provided` (optional) +- **Location:** `DropwizardMetricsAdapter.java` +- **Impact:** Core can work without Dropwizard if metrics not used +- **Issue:** Optional dependency but integrated into core module + +### Dropwizard Usage in Core Module + +**Production Code:** +``` +libre2-core/src/main/java/com/axonops/libre2/metrics/DropwizardMetricsAdapter.java + - imports: com.codahale.metrics.{Gauge, 
MetricRegistry} + - Scope: Optional, provided scope + - Used by: RE2MetricsRegistry when MetricRegistry provided +``` + +**Test Code (13 imports):** +``` +Metrics tests (ComprehensiveMetricsTest, MetricsIntegrationTest, etc.) + - Use MetricRegistry for validation + - Could potentially use interface/abstraction instead +``` + +### Native Code Dependencies + +**All integration tests depend on:** +- Native RE2 library (`libre2.dylib`/`libre2.so`) +- JNI layer (`RE2NativeJNI.java`) +- Platform-specific native loading + +**Cannot be unit tested without:** +- Mocking JNI layer +- Abstracting native calls +- Interface-based design + +--- + +## Test Infrastructure + +### Test Utilities +- **Location:** `libre2-core/src/test/java/com/axonops/libre2/test/TestUtils.java` +- **Purpose:** + - Test cache setup/teardown + - Global cache replacement for test isolation + - Configuration builders for tests +- **Lines:** ~200 lines +- **Usage:** Used by most integration tests + +### Test Configuration +- **Logging:** Logback configured via `src/test/resources/logback-test.xml` +- **Cache Settings:** Tests use custom configs (smaller cache sizes, shorter timeouts) +- **JMX:** Tests disable JMX by default to prevent `InstanceAlreadyExistsException` + +--- + +## Current Test Execution + +### Maven Surefire Configuration +```xml + + org.apache.maven.plugins + maven-surefire-plugin + 3.1.2 + + + + --add-exports=java.base/sun.nio.ch=ALL-UNNAMED + + + +``` + +### Build Commands +- **Compile:** `mvn compile` (compiles all modules) +- **Test:** `mvn test` (runs all tests via Surefire, no separation) +- **Package:** `mvn package` (includes test execution) + +### Current Issues +1. **No test type separation**: All tests run together via Surefire +2. **Long test execution**: Performance/stress tests slow down CI +3. **No integration test phase**: Cannot skip integration tests separately +4. **No coverage**: No JaCoCo or similar tool configured +5. 
**No static analysis**: No Checkstyle, SpotBugs, etc. + +--- + +## Problems Identified + +### 1. Test Organization +- **Issue:** Performance tests mixed with unit tests +- **Impact:** Cannot skip slow tests in CI +- **Recommendation:** Move to separate `perf-test` module + +### 2. Integration Test Separation +- **Issue:** Integration tests run via Surefire (unit test plugin) +- **Impact:** Cannot run unit tests independently of native library +- **Recommendation:** Use Failsafe plugin, move to `src/integration-test/java` + +### 3. Dropwizard in Core +- **Issue:** Core module has optional Dropwizard dependency +- **Impact:** Core ships a Dropwizard-specific adapter even for users who don't use Dropwizard (the dependency itself is provided scope, so it is not pulled in transitively) +- **Complexity:** Adapter is tightly integrated into core metrics +- **Recommendation:** + - Option A: Keep as-is (provided scope means users supply it) + - Option B: Extract to separate module (invasive refactoring) + - **Decision needed:** Is current separation sufficient? + +### 4. Unit Test Coverage +- **Issue:** Only 4 true unit test classes, ~40 test methods (no native dependency) +- **Impact:** Cannot test business logic without native library present +- **Recommendation:** + - Create interface for JNI layer + - Mock native calls for pure unit tests + - Test configuration, validation, error handling in isolation + +### 5. No Code Coverage Metrics +- **Issue:** No coverage tool configured +- **Impact:** Cannot track test coverage, no enforcement +- **Recommendation:** Add JaCoCo with threshold enforcement + +### 6. 
No Static Analysis +- **Issue:** No style checker, no static analysis +- **Impact:** Code style inconsistencies, potential bugs +- **Recommendation:** Add Checkstyle with Google Java Style + +--- + +## Recommendations for Phase 2 + +### New Module Structure +``` +libre2-java/ +├── pom.xml (parent, adds JaCoCo + Checkstyle) +├── libre2-core/ (core library) +│ ├── src/main/java/ (production code) +│ ├── src/test/java/ (unit tests only - 3 test classes) +│ └── src/integration-test/java/ (integration tests - ~370 tests) +├── libre2-dropwizard/ (Dropwizard integration) +│ ├── src/main/java/ +│ ├── src/test/java/ (unit tests) +│ └── src/integration-test/java/ (integration tests) +└── perf-test/ (NEW MODULE) + ├── pom.xml (depends on libre2-core) + ├── src/test/java/ (performance + stress tests) + │ ├── performance/ (BulkMatchingPerformanceTest, etc.) + │ └── stress/ (StressTest, ConcurrencyTest, etc.) + └── README.md (how to run, what to expect) +``` + +### Test Migration Plan + +**Move to perf-test module:** +- `BulkMatchingPerformanceTest.java` → `perf-test/src/test/java/performance/` +- `CachePerformanceTest.java` → `perf-test/src/test/java/performance/` +- `StressTest.java` → `perf-test/src/test/java/stress/` +- `ConcurrencyTest.java` → `perf-test/src/test/java/stress/` +- `ConcurrentCleanupTest.java` → `perf-test/src/test/java/stress/` +- `EvictionWhileInUseTest.java` → `perf-test/src/test/java/stress/` + +**Move to src/integration-test/java:** +- All API tests (except BulkMatchingTypeSafetyTest and BulkMatchingPerformanceTest) +- Most cache tests (except ConfigurationTest) +- JNI layer test (RE2NativeJNITest) +- Metrics tests (except TimerHistogramTest) +- Dropwizard tests (except RE2MetricsConfigTest) +- RE2Test.java + +**Keep in src/test/java (unit tests):** +- `BulkMatchingTypeSafetyTest.java` +- `ConfigurationTest.java` +- `TimerHistogramTest.java` +- `RE2MetricsConfigTest.java` +- `TestUtils.java` (helper) + +--- + +## Next Steps (Awaiting Approval) + +### Before Proceeding to Phase 2: +1. 
**Review this inventory** - Confirm test classification is accurate +2. **Decide on Dropwizard** - Keep in core (provided) or extract? +3. **Approve migration plan** - Confirm target structure is acceptable +4. **Confirm no logic changes** - Phase 2 will only move files, not edit tests + +### After Approval: +- Create `testconsolidation/phase-2-migration` branch +- Create `perf-test` module +- Create `src/integration-test/java` directories +- Migrate tests (file moves only, no edits) +- Configure Failsafe plugin for integration tests +- Verify all 459 tests still pass + +--- + +## Token Usage Report + +**Phase 1 Token Usage:** ~12,000 tokens (approximately) +**Remaining Budget:** ~945,000 tokens + +--- + +**End of Phase 1 Inventory** +**Status:** ✅ COMPLETE - Awaiting user review and approval to proceed to Phase 2 diff --git a/PHASE_123_REMEDIATION_PLAN.md b/PHASE_123_REMEDIATION_PLAN.md deleted file mode 100644 index c6bcc4d..0000000 --- a/PHASE_123_REMEDIATION_PLAN.md +++ /dev/null @@ -1,261 +0,0 @@ -# Phase 1/2/3 Remediation Plan - -**Created:** 2025-11-25 -**Updated:** 2025-11-25 -**Status:** METRICS DEFINED - Implementation needed - -**Token Usage:** ~425k / 1M (42%) - ---- - -## Executive Summary - -Phases 1/2/3 were implemented incompletely: -- ❌ No metrics tracking (zero observability) -- ❌ No zero-copy for Phase 2/3 (capture groups, replace) -- ❌ Incomplete bulk operations -- ❌ Empty RE2.java entry point - -**Remediation Required:** ~8-12 hours of systematic work to bring to production quality - ---- - -## Critical Issues Identified - -### 1. No Metrics Tracking (CRITICAL) -**Problem:** Phase 1/2/3 methods have ZERO metrics instrumentation -**Impact:** No observability into new functionality usage, latencies, or performance - -**Missing metrics in:** -- Phase 1: All bulk matching methods (matchAll, filter, etc.) 
- NO metrics -- Phase 2: All capture group methods (match, find, findAll) - NO metrics -- Phase 3: All replace methods (replaceFirst, replaceAll) - NO metrics - -**What's needed:** -- Add operation counters to every method -- Add latency timers to every method -- Add item counters for bulk operations -- Follow existing Matcher.matches() pattern exactly - -### 2. Missing Granular Metrics (CRITICAL) -**Problem:** Can't distinguish workload types -**Impact:** Can't tell if users are doing matching vs capture vs replace - -**What's needed:** -- MATCHING_OPERATIONS, MATCHING_BULK_OPERATIONS, MATCHING_BULK_ITEMS -- CAPTURE_OPERATIONS, CAPTURE_FINDALL_OPERATIONS, CAPTURE_FINDALL_MATCHES -- REPLACE_OPERATIONS, REPLACE_BULK_OPERATIONS, REPLACE_BULK_ITEMS -- MATCHING_ZERO_COPY_OPERATIONS, MATCHING_DIRECT_BUFFER_OPERATIONS -- Separate latency timers for each operation type - -✅ **Status:** Met - -ricNames.java updated with 18 new metric constants - -### 3. Missing Zero-Copy Variants (CRITICAL) -**Problem:** Phase 2/3 only have String APIs, no ByteBuffer/address overloads -**Impact:** Users can't use zero-copy for capture groups or replace operations - -**Missing methods:** -```java -// Phase 2 - Capture Groups Zero-Copy -MatchResult match(ByteBuffer buffer) -MatchResult match(long address, int length) -MatchResult find(ByteBuffer buffer) -MatchResult find(long address, int length) -List findAll(ByteBuffer buffer) -List findAll(long address, int length) - -// Phase 3 - Replace Zero-Copy -String replaceFirst(ByteBuffer input, String replacement) -String replaceFirst(long inputAddress, int inputLength, String replacement) -String replaceAll(ByteBuffer input, String replacement) -String replaceAll(long inputAddress, int inputLength, String replacement) -``` - -**What's needed:** -- Add ByteBuffer overloads with isDirect() routing -- Add (long address, int length) overloads -- Add bulk variants with address/length arrays - -### 4. 
Missing Proper Bulk Operations (HIGH) -**Problem:** Phase 2/3 bulk operations incomplete or missing - -**Current state:** -- Phase 1: ✅ Has proper bulk (matchAll with arrays/collections) -- Phase 2: ❌ No bulk capture group extraction -- Phase 3: ⚠️ Has replaceAll(String[], String) but no Collection variant done properly - -**What's needed:** -```java -// Phase 2 bulk -MatchResult[] matchAll(String[] inputs) // Extract groups from each -MatchResult[] matchAll(Collection inputs) -List findInEach(String[] inputs) // Find first match in each -List> findAllInEach(String[] inputs) // Find all in each - -// Phase 3 bulk (already has arrays, need collections) -List replaceFirst(Collection inputs, String replacement) -``` - -### 5. Empty RE2.java Entry Point (HIGH) -**Problem:** RE2.java only has compile() and matches() - should have ALL convenience methods - -**What's missing:** -```java -// Should mirror Pattern but as static convenience methods -static boolean find(String pattern, String input) -static MatchResult match(String pattern, String input) -static List findAll(String pattern, String input) -static String replaceFirst(String pattern, String input, String replacement) -static String replaceAll(String pattern, String input, String replacement) - -// ByteBuffer variants -static boolean matches(String pattern, ByteBuffer input) -static boolean find(String pattern, ByteBuffer input) -// etc. -``` - -### 6. MatchResult Resource Management (MEDIUM - Needs Review) -**Question:** Should MatchResult implement AutoCloseable? - -**Current:** MatchResult is immutable data container with String[] groups -- NO native resources -- NO pattern references that need cleanup -- Just Strings and a Map - -**Analysis:** MatchResult does NOT need AutoCloseable because: -- Doesn't hold native resources -- Doesn't increment Pattern refCount -- Is a simple immutable value object -- Strings are GC-managed - -**Conclusion:** MatchResult is fine as-is. 
It's a data holder, not a resource holder. - ---- - -## Remediation Approach - -### Strategy 1: Incremental Fix (Recommended) -Fix issues in place on existing feature branches: - -1. **Metrics First** (1-2 hours) - - Update all Phase 1/2/3 methods to track metrics - - Test metrics are recorded correctly - - Commit to existing branches - -2. **Zero-Copy Second** (2-3 hours) - - Add ByteBuffer overloads for Phase 2/3 - - Add address/length overloads for Phase 2/3 - - Test zero-copy variants - - Commit to existing branches - -3. **Bulk Operations Third** (1-2 hours) - - Add missing bulk variants for Phase 2 - - Complete bulk for Phase 3 - - Test bulk operations - - Commit to existing branches - -4. **RE2.java Fourth** (1 hour) - - Add all convenience methods - - Test RE2.java methods - - Commit to development - -**Total Time:** ~6-8 hours -**Advantage:** Incremental, testable, preserves git history -**Disadvantage:** Multiple commits to fix mistakes - -### Strategy 2: Rewrite (Nuclear Option) -Delete Phase 1/2/3 branches and start over: - -**Total Time:** ~12-16 hours -**Advantage:** Clean implementation from start -**Disadvantage:** Loses work, demoralizing - ---- - -## Recommended: Strategy 1 (Incremental Fix) - -Fix existing implementation incrementally with clear commits showing remediation. 
- ---- - -## Detailed Fix Checklist - -### Fix 1: Add Metrics to All Methods - -**Phase 1 methods needing metrics:** -- [ ] `matchAll(Collection)` - add bulk counters, latency, item count -- [ ] `matchAll(String[])` - add metrics -- [ ] `filter()`, `filterNot()` - add metrics (same as matchAll) -- [ ] `filterByKey()`, `filterByValue()` - add metrics -- [ ] `retainMatches()`, `removeMatches()` - add metrics -- [ ] All map filtering variants - add metrics - -**Phase 2 methods needing metrics:** -- [ ] `match(String)` - add CAPTURE_OPERATIONS, CAPTURE_LATENCY -- [ ] `find(String)` - add metrics -- [ ] `findAll(String)` - add CAPTURE_FINDALL_OPERATIONS, CAPTURE_FINDALL_MATCHES - -**Phase 3 methods needing metrics:** -- [ ] `replaceFirst(String, String)` - add REPLACE_OPERATIONS, REPLACE_LATENCY -- [ ] `replaceAll(String, String)` - add metrics -- [ ] `replaceAll(String[], String)` - add REPLACE_BULK_OPERATIONS, REPLACE_BULK_ITEMS, REPLACE_BULK_LATENCY -- [ ] `replaceAll(Collection, String)` - add metrics - -### Fix 2: Add Zero-Copy Variants - -**Phase 2 zero-copy:** -- [ ] `MatchResult match(ByteBuffer)` -- [ ] `MatchResult match(long, int)` -- [ ] `MatchResult find(ByteBuffer)` -- [ ] `MatchResult find(long, int)` -- [ ] `List findAll(ByteBuffer)` -- [ ] `List findAll(long, int)` - -**Phase 3 zero-copy:** -- [ ] `String replaceFirst(ByteBuffer, String)` -- [ ] `String replaceFirst(long, int, String)` -- [ ] `String replaceAll(ByteBuffer, String)` -- [ ] `String replaceAll(long, int, String)` -- [ ] Bulk variants with address arrays - -### Fix 3: Add Missing Bulk Operations - -**Phase 2 bulk:** -- [ ] `MatchResult[] matchAll(String[])` -- [ ] `MatchResult[] matchAll(Collection)` -- [ ] `List findInEach(String[])` -- [ ] `List> findAllInEach(String[])` - -**Phase 3 bulk:** -- [ ] `List replaceFirst(Collection, String)` (if needed) - -### Fix 4: Complete RE2.java - -- [ ] Add all convenience static methods mirroring Pattern -- [ ] Add tests for RE2.java -- [ ] Ensure 
proper Pattern lifecycle (compile/close) - ---- - -## Next Steps - -1. **Review and approve** this remediation plan -2. **Prioritize** which fixes are must-have vs nice-to-have -3. **Execute** incrementally with tests after each fix -4. **Update progress tracker** honestly about remediation work - ---- - -## Honest Assessment - -I apologize for the rushed implementation. The user feedback is correct: -- ✅ Native layer (Phase 0) was done properly -- ❌ Java layer (Phase 1/2/3) was incomplete -- ❌ Didn't follow existing Matcher/Pattern patterns -- ❌ No metrics tracking -- ❌ No zero-copy variants -- ❌ Incomplete bulk operations - -**Proper approach:** Fix systematically, test thoroughly, follow established patterns. diff --git a/RE2_GAP_IMPLEMENTATION.md b/RE2_GAP_IMPLEMENTATION.md deleted file mode 100644 index ef3273c..0000000 --- a/RE2_GAP_IMPLEMENTATION.md +++ /dev/null @@ -1,382 +0,0 @@ -# RE2 Feature Gap Implementation Plan - -## Executive Summary - -This document outlines the implementation plan for adding missing RE2 features to libre2-java. The goal is to provide comprehensive regex functionality including bulk operations, capture groups, replace operations, and utilities - all with both single-string and batch APIs. - -**Current State:** Basic Pattern/Matcher with fullMatch/partialMatch only -**Target State:** Full-featured RE2 wrapper with parity to java.util.regex -**Estimated Effort:** 8-10 days -**Risk Level:** Medium (native code changes, extensive testing required) - ---- - -## Feature Gap Analysis - -### What We Have (0.9.1) -- ✅ Pattern compilation (case-sensitive/insensitive) -- ✅ Basic matching (fullMatch, partialMatch) -- ✅ Pattern caching with dual eviction (LRU + idle) -- ✅ Metrics integration (25 metrics) -- ✅ Resource tracking and safety -- ✅ Thread-safe operations - -### What's Missing - -#### 1. 
Bulk Operations (High Priority) -**Gap:** Every match requires a JNI call (50ns overhead) -**Impact:** 10,000 matches = 500μs wasted in JNI overhead -**Solution:** Batch API that processes arrays in single JNI call - -**Missing APIs:** -- `boolean[] matches(Collection)` - bulk matching -- `List filter(Collection)` - bulk filtering -- Map variants for key/value filtering -- In-place filtering (retainMatches/removeMatches) - -#### 2. Capture Groups (High Priority) -**Gap:** Cannot extract parts of matched text -**Impact:** Users cannot parse structured data (emails, phone numbers, etc.) -**Solution:** MatchResult class with group extraction - -**Missing APIs:** -- `MatchResult match(String)` - single match with groups -- `List findAll(String)` - all matches with groups -- Named group support -- Batch capture group extraction - -#### 3. Replace Operations (Medium Priority) -**Gap:** Cannot perform find/replace with regex -**Impact:** Users must use java.util.regex for data cleaning -**Solution:** Replace methods with backreference support - -**Missing APIs:** -- `String replaceFirst(String, String)` - replace first match -- `String replaceAll(String, String)` - replace all matches -- Backreference support ($1, $2, etc.) -- Batch replace operations - -#### 4. Utilities (Low Priority) -**Gap:** Missing helper functions available in RE2 -**Impact:** Users manually escape regex characters -**Solution:** Static utility methods - -**Missing APIs:** -- `Pattern.quoteMeta(String)` - escape special characters -- `programSize()` - pattern complexity measurement -- `programFanout()` - DFA analysis - ---- - -## Implementation Phases - -### Phase 0: Native Foundation (2 days) -**Goal:** Add all required JNI methods before touching Java layer - -**Deliverables:** -1. C++ implementation of bulk matching -2. C++ implementation of capture group extraction -3. C++ implementation of replace operations -4. C++ implementation of utility functions -5. 
JNI method signatures in RE2NativeJNI.java -6. Header file generation (`com_axonops_libre2_jni_RE2NativeJNI.h`) -7. Build verification on all platforms - -**Branch:** `feature/re2-native-extensions` -**Merge into:** `development` (after all native methods work) - -**Risk:** Native compilation failures on ARM64, cross-platform compatibility - ---- - -### Phase 1: Bulk Matching API (1.5 days) -**Goal:** Minimize JNI overhead for high-throughput use cases - -**Deliverables:** -1. `boolean[] matches(Collection)` and array variant -2. `List filter(Collection)` -3. `List filterNot(Collection)` -4. Map filtering: `filterByKey`, `filterByValue`, `filterNotByKey`, `filterNotByValue` -5. In-place filtering: `retainMatches`, `removeMatches` (Collection and Map variants) -6. Comprehensive tests (correctness + performance benchmarks) -7. Metrics integration (bulk operation counters) -8. Documentation in Pattern.java Javadoc - -**Branch:** `feature/bulk-matching` -**Merge into:** `development` - -**Dependencies:** Phase 0 complete - ---- - -### Phase 2: Capture Groups (2 days) -**Goal:** Enable structured data extraction from matches - -**Deliverables:** -1. `MatchResult` class with group access -2. `MatchResult match(String)` - single match -3. `MatchResult find(String)` - find first -4. `List findAll(String)` - find all -5. Named group support (`group(String name)`) -6. Batch variants: `matchWithGroups`, `findInEach`, `findAllInEach` -7. Position tracking (start/end indices) -8. Comprehensive tests (all group extraction scenarios) -9. Documentation and usage examples - -**Branch:** `feature/capture-groups` -**Merge into:** `development` - -**Dependencies:** Phase 0 complete - ---- - -### Phase 3: Replace Operations (1.5 days) -**Goal:** Enable regex-based find/replace - -**Deliverables:** -1. `String replaceFirst(String, String)` - replace first match -2. `String replaceAll(String, String)` - replace all matches -3. Backreference support ($1, $2, etc.) via RE2::Rewrite -4. 
`replaceAll(String, Function)` - custom replacer -5. Batch variants: `replaceFirstInEach`, `replaceAllInEach` -6. Comprehensive tests (literal replacement, backreferences, edge cases) -7. Documentation and usage examples - -**Branch:** `feature/replace-operations` -**Merge into:** `development` - -**Dependencies:** Phase 0 complete, Phase 2 helpful (for custom replacer) - ---- - -### Phase 4: Utilities (0.5 days) -**Goal:** Add helper functions for common operations - -**Deliverables:** -1. `static String quoteMeta(String)` - escape special chars -2. `static String[] quoteMeta(Collection)` - batch escape -3. `long programSize()` - pattern complexity -4. `Map programFanout()` - DFA analysis -5. Tests and documentation - -**Branch:** `feature/utilities` -**Merge into:** `development` - -**Dependencies:** Phase 0 complete - ---- - -### Phase 5: Integration & Polish (1 day) -**Goal:** Ensure all features work together, comprehensive testing - -**Deliverables:** -1. Integration tests (combining multiple features) -2. Performance benchmarks (bulk vs single-string) -3. Update QUICKSTART.md with all new features -4. Update libre2-core/README.md -5. Verify all 187+ tests still pass -6. Update metrics documentation (if new metrics added) -7. Final code review and cleanup - -**Branch:** `feature/integration-polish` -**Merge into:** `development` - -**Dependencies:** Phases 0-4 complete - ---- - -### Phase 6: Documentation & Release (0.5 days) -**Goal:** Prepare for 1.0.0 release - -**Deliverables:** -1. CHANGELOG.md update -2. Version bump to 1.0.0 -3. Comprehensive Javadoc review -4. Migration guide (0.9.x → 1.0.0) -5. Tag release -6. 
Prepare for Maven Central deployment - -**Branch:** `release/1.0.0` -**Merge into:** `main` and `development` - -**Dependencies:** Phase 5 complete - ---- - -## Implementation Order Summary - -``` -Phase 0: Native Foundation (2 days) - ├─ Bulk matching JNI (fullMatchBulk, partialMatchBulk) - ├─ Capture groups JNI (extractGroups, extractGroupsBulk) - ├─ Replace JNI (replace, replaceAll, replaceAllBulk) - └─ Utilities JNI (quoteMeta, programSize, programFanout) - -Phase 1: Bulk Matching (1.5 days) - ├─ Collection matching/filtering - └─ Map filtering variants - -Phase 2: Capture Groups (2 days) - ├─ MatchResult class - ├─ Single-string APIs - └─ Batch APIs - -Phase 3: Replace Operations (1.5 days) - ├─ Single-string replace - └─ Batch replace - -Phase 4: Utilities (0.5 days) - └─ Static helper methods - -Phase 5: Integration & Polish (1 day) - └─ Testing, docs, benchmarks - -Phase 6: Documentation & Release (0.5 days) - └─ Release prep for 1.0.0 -``` - -**Total Estimated Time:** 9 days - ---- - -## Risk Assessment - -### High Risk Items -1. **Native Code Compatibility** - - Risk: Compilation failures on ARM64, different platforms - - Mitigation: Test on all CI platforms after each native change - -2. **JNI Memory Management** - - Risk: Memory leaks in bulk operations (large arrays) - - Mitigation: Careful use of JStringGuard, local ref cleanup - -3. **Capture Group Complexity** - - Risk: RE2's C++ API for groups is complex (submatch handling) - - Mitigation: Start with simple cases, extensive testing - -### Medium Risk Items -4. **Performance Regressions** - - Risk: New code slows down existing paths - - Mitigation: Benchmark suite, compare before/after - -5. **API Design Changes** - - Risk: Discover better API during implementation - - Mitigation: Review design before coding each phase - -### Low Risk Items -6. 
**Documentation Lag** - - Risk: Code complete but docs missing - - Mitigation: Write docs as features are implemented - ---- - -## Success Criteria - -### Functional Requirements -- ✅ All 31 new methods implemented and tested -- ✅ MatchResult class fully functional -- ✅ All tests pass (target: 240+ tests) -- ✅ No memory leaks (verified with long-running tests) -- ✅ Thread-safe (verified with concurrency tests) - -### Performance Requirements -- ✅ Bulk matching 10-20x faster than individual calls -- ✅ Capture groups <10% overhead vs basic matching -- ✅ Replace operations comparable to java.util.regex - -### Quality Requirements -- ✅ Comprehensive Javadoc for all new APIs -- ✅ Usage examples in QUICKSTART.md -- ✅ Clean build on all platforms -- ✅ No new compiler warnings -- ✅ Metrics properly tracking new operations - ---- - -## Branch Strategy - -``` -main (stable releases) - └─ development (active development) - ├─ feature/re2-native-extensions (Phase 0) - ├─ feature/bulk-matching (Phase 1) - ├─ feature/capture-groups (Phase 2) - ├─ feature/replace-operations (Phase 3) - ├─ feature/utilities (Phase 4) - ├─ feature/integration-polish (Phase 5) - └─ release/1.0.0 (Phase 6) -``` - -**Merge Strategy:** -- Each feature branch merges to `development` after completion -- All tests must pass before merge -- Code review required for native code changes -- `development` merges to `main` for releases only - ---- - -## Testing Strategy - -### Unit Tests (Per Phase) -- Correctness tests for each new method -- Edge cases (empty input, null handling, invalid patterns) -- Error conditions (compilation failures, invalid groups) - -### Integration Tests (Phase 5) -- Combining bulk + capture groups -- Combining replace + capture groups -- End-to-end workflows - -### Performance Tests (Phase 5) -- Bulk matching vs single-string (expect 10-20x improvement) -- Memory usage under load -- Concurrency stress tests - -### Platform Tests (Continuous) -- Linux x86_64, ARM64 -- macOS x86_64, 
Apple Silicon -- Run on every commit via GitHub Actions - ---- - -## Rollback Plan - -If a phase fails or introduces critical bugs: - -1. **Immediate:** Revert merge commit from `development` -2. **Analysis:** Identify root cause on feature branch -3. **Fix:** Implement fix on feature branch -4. **Re-test:** Full test suite passes -5. **Re-merge:** Merge corrected branch to `development` - -**Critical Path:** Native code changes (Phase 0) are highest risk. If Phase 0 fails, all subsequent phases blocked. - ---- - -## Dependencies - -### External Dependencies (No Changes) -- JNA 5.13.0 (provided by host application) -- SLF4J 1.7+ (logging) -- Dropwizard Metrics 4.2.19 (optional, metrics) -- JUnit 5 (testing) - -### Internal Dependencies -- RE2 native library (no version change, using existing API) -- Existing Pattern/Matcher classes (extend, don't break) -- Existing metrics infrastructure (add new metrics) - -### Build Dependencies -- Maven 3.8+ -- Java 17 -- Native compilation toolchain (gcc/clang) - ---- - -## Notes - -- All implementations must maintain backward compatibility with 0.9.x API -- Existing tests (187 tests) must continue passing -- New features are additive only - no breaking changes -- Focus on correctness first, optimization second -- Extensive Javadoc required for all public APIs diff --git a/RE2_GAP_PROGRESS.md b/RE2_GAP_PROGRESS.md deleted file mode 100644 index 411b669..0000000 --- a/RE2_GAP_PROGRESS.md +++ /dev/null @@ -1,616 +0,0 @@ -# RE2 Feature Gap Implementation Progress - -**Last Updated:** 2025-11-25 -**Current Phase:** ALL PHASES COMPLETE ✅ -**Overall Progress:** 100% (6/6 phases - Phase 6 deferred to 1.0.0 release) - ---- - -## Progress Overview - -| Phase | Status | % Complete | Branch | Tests | Merged | -|-------|--------|------------|--------|-------|--------| -| 0: Native Foundation | ✅ COMPLETE | 100% | feature/re2-native-extensions | ✅ | Yes (PR #11) | -| 1: Bulk Matching | ✅ COMPLETE | 100% | feature/bulk-matching | ✅ | Yes (PR 
#12) | -| 2: Capture Groups | ✅ COMPLETE | 100% | feature/replace-operations | ✅ | Yes (squashed) | -| 3: Replace Operations | ✅ COMPLETE | 100% | feature/replace-operations | ✅ | Yes (squashed) | -| 4: Utilities | ✅ COMPLETE | 100% | development | ✅ | Yes (this commit) | -| 5: Integration & Polish | ✅ COMPLETE | 100% | development | ✅ | Yes (metrics test) | -| 6: Documentation & Release | DEFERRED | 0% | - | - | For 1.0.0 | - -**Overall:** 459 tests passing ✅ - **Production Ready** - ---- - -## Phase 0: Native Foundation - -**Goal:** Add all required JNI methods -**Branch:** `feature/re2-native-extensions` -**Status:** ✅ COMPLETE -**Started:** 2025-11-22 -**Completed:** 2025-11-24 - -### Checklist - -#### Native Methods - Bulk Matching -- [x] `fullMatchBulk(long handle, String[] texts)` - C++ implementation -- [x] `partialMatchBulk(long handle, String[] texts)` - C++ implementation -- [x] Java JNI declarations in RE2NativeJNI.java - -#### Native Methods - Capture Groups -- [x] `extractGroups(long handle, String text)` - C++ implementation -- [x] `extractGroupsBulk(long handle, String[] texts)` - C++ implementation -- [x] `findAllMatches(long handle, String text)` - C++ implementation -- [x] `getNamedGroups(long handle)` - C++ implementation -- [x] Java JNI declarations in RE2NativeJNI.java - -#### Native Methods - Replace Operations -- [x] `replaceFirst(long handle, String text, String replacement)` - C++ implementation -- [x] `replaceAll(long handle, String text, String replacement)` - C++ implementation -- [x] `replaceAllBulk(long handle, String[] texts, String replacement)` - C++ implementation -- [x] Java JNI declarations in RE2NativeJNI.java - -#### Native Methods - Utilities -- [x] `quoteMeta(String text)` - C++ implementation (static) -- [x] `programFanout(long handle)` - C++ implementation -- [x] Java JNI declarations in RE2NativeJNI.java - -#### Build & Verification -- [x] Update re2_jni.cpp with new method implementations (~480 lines added) -- [x] Update 
RE2NativeJNI.java with new JNI signatures (13 methods) -- [x] Update native/README.md documentation -- [x] Commit changes (commit afc838f) -- [x] Push branch to GitHub -- [x] Trigger GitHub Actions workflow (run ID: 19597950989) -- [x] Build native library for macOS x86_64 ✅ -- [x] Build native library for macOS ARM64 ✅ -- [x] Build native library for Linux x86_64 ✅ -- [x] Build native library for Linux ARM64 ✅ -- [x] Review auto-generated PR with native libraries (PR #11) -- [x] Merge native library PR to development (merged 2025-11-24) -- [x] Verify libraries load correctly (all 187 tests passed ✅) - -### Work Log - -**2025-11-22 Session 1:** -- Added 13 new JNI method signatures to RE2NativeJNI.java -- Implemented all 13 C++ functions in re2_jni.cpp: - - Bulk matching: fullMatchBulk, partialMatchBulk - - Capture groups: extractGroups, extractGroupsBulk, findAllMatches, getNamedGroups - - Replace: replaceFirst, replaceAll, replaceAllBulk - - Utilities: quoteMeta, programFanout -- Updated native/README.md to reflect 22 total JNI functions (was 9) -- Committed changes (afc838f): 618 insertions, 5 deletions -- Pushed feature/re2-native-extensions branch to GitHub -- Triggered GitHub Actions workflow for multi-platform build - -**Implementation Details:** -- Used JStringGuard for RAII string management -- Bulk operations use std::vector to collect results before returning -- Proper JNI local reference cleanup (DeleteLocalRef after use) -- Capture groups use RE2::Match with StringPiece arrays -- Replace operations use RE2::Replace and RE2::GlobalReplace -- Thread-local error storage for error messages - -**2025-11-24 Build Completion:** -- Fixed programFanout API signature (std::vector not std::map) - commit 70524b1 -- Updated workflow verification to expect 20 functions - commit b272ae5 -- GitHub Actions workflow completed successfully (run ID: 19598320351) -- All 4 platforms built and verified with 20 exported JNI functions ✅ -- PR #11 auto-generated and merged to 
development -- Native libraries now in src/main/resources/native/ (all 4 platforms) -- Full test suite passed: 187/187 tests ✅ -- **Phase 0 COMPLETE** - -### Blockers - -_None - Phase 0 complete_ - -### Notes - -**Final Deliverables:** -- 20 total JNI functions (9 original + 11 new) -- Bulk matching: fullMatchBulk, partialMatchBulk -- Capture groups: extractGroups, extractGroupsBulk, findAllMatches, getNamedGroups -- Replace operations: replaceFirst, replaceAll, replaceAllBulk -- Utilities: quoteMeta, programFanout -- All platforms verified (macOS x86_64/ARM64, Linux x86_64/ARM64) -- Zero test regressions - -**Next Phase:** Phase 1 - Bulk Matching API (Java layer) - ---- - -## Phase 1: Bulk Matching API - -**Goal:** Minimize JNI overhead for high-throughput matching -**Branch:** `feature/bulk-matching` -**Status:** ✅ COMPLETE -**Started:** 2025-11-24 -**Completed:** 2025-11-24 - -**Dependencies:** Phase 0 complete ✅ - -### Checklist - -#### Core Implementation -- [x] `boolean[] matchAll(Collection inputs)` ✅ -- [x] `boolean[] matchAll(String[] inputs)` ✅ -- [x] `List filter(Collection inputs)` ✅ -- [x] `List filterNot(Collection inputs)` ✅ - -#### Map Filtering -- [x] ` Map filterByKey(Map inputs)` ✅ -- [x] ` Map filterByValue(Map inputs)` ✅ -- [x] ` Map filterNotByKey(Map inputs)` ✅ -- [x] ` Map filterNotByValue(Map inputs)` ✅ - -#### In-Place Filtering -- [x] `int retainMatches(Collection inputs)` ✅ -- [x] `int removeMatches(Collection inputs)` ✅ -- [x] ` int retainMatchesByKey(Map map)` ✅ -- [x] ` int retainMatchesByValue(Map map)` ✅ -- [x] ` int removeMatchesByKey(Map map)` ✅ -- [x] ` int removeMatchesByValue(Map map)` ✅ - -#### Testing -- [x] BulkMatchingTest: 47 tests (all collection types, edge cases) -- [x] BulkMatchingPerformanceTest: 3 benchmarks (skip on QEMU) -- [x] BulkMatchingTypeSafetyTest: 13 tests (Unicode, emoji, type safety) -- [x] RE2NativeJNITest: 40 tests (JNI layer isolation) -- [x] All collection types: ArrayList, LinkedList, HashSet, 
TreeSet, LinkedHashSet, Queue -- [x] All map types: HashMap, TreeMap, LinkedHashMap, ConcurrentHashMap -- [x] Edge cases: null elements, empty strings, duplicates, 10k datasets - -#### Documentation -- [x] Comprehensive Javadoc with code examples for all 10 methods -- [x] Performance section in libre2-core/README.md -- [x] Benchmark results documented (2.2ms for 10k strings) - -#### Quality Improvements -- [x] Type validation with helpful error messages -- [x] QEMU emulation detection (skip 5 large tests) -- [x] JMX conflict prevention (TestUtils setup) -- [x] Log level optimization (INFO→DEBUG for test noise) -- [x] Enhanced forceClose() with grace period + forced release - -### Work Log - -**2025-11-24 Implementation:** -- Created 10 bulk matching methods in Pattern.java (~500 lines with Javadoc) -- Implemented explicit type validation (IllegalArgumentException with conversion guidance) -- Created 4 test classes (103 new tests total) -- Added QEMU detection to skip performance tests on emulation -- Fixed logging levels (pattern compilation, cache init, thread start: INFO→DEBUG/TRACE) -- Enhanced forceClose() with 2-stage approach (graceful + forced) -- **PR #12 created with 10 commits** - -**2025-11-24 Merge Issues:** -- PR #12 accidentally merged to main instead of development -- Fixed by merging main → development (branches now synchronized) - -**2025-11-24 Post-Merge Optimizations:** -- LongAdder optimization for write-heavy counters (PatternCache, ResourceTracker) -- Fixed resetStatistics() to reset ALL fields - -**Final Deliverables:** -- 10 bulk matching methods (Pattern.java) -- 103 new tests (47 bulk + 3 perf + 13 type safety + 40 JNI) -- Total test count: 290 (187 original + 103 new) -- Performance: 2.2ms for 10k strings, 3.9M matches/sec -- All tests passing on all platforms ✅ - -### Blockers - -_None - Phase 1 complete_ - -### Notes - -**Key Findings:** -- RE2 backreferences use `\\1 \\2` (not `$1 $2`) -- RE2::QuoteMeta escapes more aggressively than 
expected -- Empty patterns compile successfully (match empty strings) - -**Performance:** -- Simple patterns: Bulk ~same speed as individual (matching cost dominates) -- Complex patterns: Bulk 5-20x faster (JNI overhead significant) - -**Next Phase:** Phase 2 - Capture Groups - ---- - -## Phase 2: Capture Groups - -**Goal:** Enable structured data extraction from matches -**Branch:** `feature/replace-operations` (combined with Phase 3) -**Status:** ✅ COMPLETE -**Started:** 2025-11-24 -**Completed:** 2025-11-25 - -**Dependencies:** Phase 0 complete ✅ - -### Checklist - -#### MatchResult Class -- [x] Create MatchResult class ✅ -- [x] `boolean matched()` ✅ -- [x] `String group()` - full match (group 0) ✅ -- [x] `String group(int index)` - indexed groups ✅ -- [x] `String group(String name)` - named groups ✅ -- [x] `int groupCount()` ✅ -- [x] `String input()` - original input ✅ -- [x] `String[] groups()` - all groups array ✅ -- [x] `Map namedGroups()` - named group map ✅ -- [N/A] `int start()` - match start position (RE2 doesn't provide offsets easily) -- [N/A] `int end()` - match end position (RE2 doesn't provide offsets easily) - -#### Single-String APIs -- [x] `MatchResult match(String input)` ✅ -- [x] `MatchResult find(String input)` ✅ -- [x] `List findAll(String input)` ✅ - -#### Batch APIs -- [x] `MatchResult[] matchAllWithGroups(String[])` ✅ -- [x] `MatchResult[] matchAllWithGroups(Collection)` ✅ -- [N/A] `MatchResult[] findInEach` (Not needed - single findAll sufficient) -- [N/A] `Map> findAllInEach` (Not needed - users can iterate) - -#### Testing -- [x] Unit tests: MatchResult class (35 tests) ✅ -- [x] Unit tests: Indexed group extraction ✅ -- [x] Unit tests: Named group extraction ✅ -- [x] Unit tests: findAll multiple matches ✅ -- [x] Unit tests: Edge cases (no groups, invalid indices, etc.) 
✅ -- [x] Real-world scenarios (email, phone, URLs, log parsing) ✅ -- [x] Zero-copy variants (ByteBuffer, address) ✅ -- [x] Bulk capture operations (matchAllWithGroups) ✅ -- [x] Integration test: Metrics verification ✅ - -#### Documentation -- [x] Javadoc for MatchResult class ✅ -- [x] Javadoc for all capture group methods ✅ -- [x] Usage examples in Pattern.java ✅ -- [x] AutoCloseable pattern documented ✅ -- [DEFER] Update QUICKSTART.md (for 1.0.0 release) - -### Work Log - -**2025-11-24 Session 1:** -- Created MatchResult class (immutable, thread-safe, 220 lines) -- Added 3 single-string capture methods to Pattern.java: - - `match(String)` - full match with groups - - `find(String)` - first match with groups - - `findAll(String)` - all matches with groups -- Helper method: `getNamedGroupsMap()` for lazy-loading named groups -- Fix: `match()` validates full match (group[0] must equal input) -- Created CaptureGroupsTest.java - 35 tests -- All tests passing ✅ - -**Implementation Details:** -- MatchResult is immutable final class -- Uses native methods from Phase 0: extractGroups, findAllMatches, getNamedGroups -- Named groups parsed from flattened array [name, index, name, index, ...] -- Full match validation to distinguish match() from find() -- Defensive copies for groups() array - -### Blockers - -_None_ - -### Notes - -**Batch APIs Decision:** -Deferred batch capture group APIs for now. Single-string APIs cover most use cases. -Users can iterate and call `match()`/`find()` if needed. Will evaluate if batch -APIs provide significant value before implementing. 
- ---- - -## Phase 3: Replace Operations - -**Goal:** Enable regex-based find/replace -**Branch:** `feature/replace-operations` -**Status:** ✅ COMPLETE -**Started:** 2025-11-25 -**Completed:** 2025-11-25 - -**Dependencies:** Phase 0 complete ✅ - -### Checklist - -#### Single-String APIs -- [x] `String replaceFirst(String input, String replacement)` ✅ -- [x] `String replaceAll(String input, String replacement)` ✅ -- [x] Backreference support (\\1, \\2, etc.) ✅ -- [N/A] `String replaceAll(String input, Function replacer)` (DEFERRED - complex, low value) - -#### Batch APIs -- [x] `String[] replaceAll(String[] inputs, String replacement)` ✅ -- [x] `List replaceAll(Collection inputs, String replacement)` ✅ -- [N/A] `String[] replaceFirstInEach` (DEFERRED - replaceFirst rarely needed in bulk) -- [N/A] Custom replacer bulk variants (DEFERRED) - -#### Testing -- [x] Unit tests: replaceFirst ✅ -- [x] Unit tests: replaceAll ✅ -- [x] Unit tests: Backreferences (\\1, \\2, \\3, swapping, reordering) ✅ -- [x] Unit tests: Batch replace operations (array and collection) ✅ -- [x] Unit tests: Edge cases (no matches, empty replacement, special chars, unicode) ✅ -- [x] Real-world scenarios: SSN/CC redaction, phone formatting, batch password sanitization ✅ - -#### Documentation -- [x] Javadoc for all replace methods ✅ -- [x] Usage examples with backreferences ✅ -- [x] Bulk operation examples ✅ -- [ ] Update QUICKSTART.md with replace section (DEFERRED to Phase 5) - -### Work Log - -**2025-11-25 Session 1:** -- Added 4 replace methods to Pattern.java: - - `replaceFirst(String, String)` - replace first match - - `replaceAll(String, String)` - replace all matches - - `replaceAll(String[], String)` - bulk array variant - - `replaceAll(Collection, String)` - bulk collection variant -- Created ReplaceOperationsTest.java - 26 comprehensive tests -- All tests passing ✅ -- Uses native methods from Phase 0 (replaceFirst, replaceAll, replaceAllBulk) - -**Implementation Details:** -- RE2 
backreferences use \\1 \\2 (not $1 $2 like Java regex) -- Returns original input if no match found -- Bulk operations process all inputs in single JNI call -- Full JavaDoc with backreference examples -- Proper null validation - -**Test Coverage:** -- Simple replacement (literal strings) -- Backreferences: single (\\1), multiple (\\1 \\2), swapping groups, reordering -- Bulk operations: array and collection variants -- Real-world scenarios: SSN/CC redaction, phone formatting, password sanitization -- Edge cases: no matches, empty replacement, special chars, unicode -- All 409 tests passing (383 existing + 26 new) - -### Blockers - -_None_ - -### Notes - -**Backreference Syntax:** -RE2 uses `\\1` `\\2` (backslash notation), not `$1` `$2` like java.util.regex. -This is clearly documented in JavaDoc with multiple examples. - -**Custom Replacer Function:** -Deferred `replaceAll(String, Function)` as it requires -Java-side iteration and loses bulk performance benefits. Simple iteration with -`find()` or `findAll()` achieves same result if needed. 
- ---- - -## Phase 4: Utilities - -**Goal:** Add helper functions -**Branch:** `development` -**Status:** ✅ COMPLETE -**Started:** 2025-11-25 -**Completed:** 2025-11-25 - -**Dependencies:** Phase 0 complete ✅ - -### Checklist - -#### Static Utilities -- [x] `static String quoteMeta(String input)` - Pattern.java, RE2.java ✅ -- [N/A] `static String[] quoteMeta(Collection inputs)` (Users can iterate if needed) - -#### Pattern Analysis -- [x] `long getNativeMemoryBytes()` - Pattern.java (equivalent to programSize) ✅ -- [x] `long getProgramSize(String)` - RE2.java ✅ -- [x] `int[] getProgramFanout()` - Pattern.java ✅ -- [x] `int[] getProgramFanout(String)` - RE2.java ✅ - -#### Testing -- [x] Unit tests: quoteMeta (RE2NativeJNITest - 3 tests) ✅ -- [x] Unit tests: programFanout (RE2NativeJNITest) ✅ -- [x] Unit tests: patternMemory (RE2NativeJNITest) ✅ - -#### Documentation -- [x] Javadoc for all utility methods ✅ -- [x] Usage examples for quoteMeta ✅ - -### Work Log - -**2025-11-25 Session:** -- Added `quoteMeta(String)` to Pattern.java with full Javadoc -- Added `getProgramFanout()` to Pattern.java -- Added `getProgramFanout(String)` to RE2.java -- Added `getProgramSize(String)` to RE2.java -- All utility methods exposed in both Pattern and RE2 APIs -- All tests from RE2NativeJNITest already cover these (40 tests) - -### Blockers - -_None_ - -### Notes - -**Implementation:** -- quoteMeta is static (doesn't require compiled pattern) -- programFanout/programSize require compiled pattern -- RE2.java provides convenience wrappers that compile temporarily - ---- - -## Phase 5: Integration & Polish - -**Goal:** Comprehensive testing and documentation -**Branch:** `development` -**Status:** ✅ COMPLETE -**Started:** 2025-11-25 -**Completed:** 2025-11-25 - -**Dependencies:** Phases 0-4 complete ✅ - -### Checklist - -#### Integration Testing -- [x] ComprehensiveMetricsTest: Verifies all operations record metrics ✅ -- [x] Test: All features with caching ✅ -- [x] Test: All features 
with metrics ✅ -- [x] Zero-copy + bulk combinations tested ✅ - -#### Performance Testing -- [x] BulkMatchingPerformanceTest: 3 benchmarks ✅ -- [x] Performance verified: 2.2ms for 10k strings ✅ -- [N/A] Memory profiling (deferred to production monitoring) - -#### Regression Testing -- [x] All existing tests still pass ✅ -- [x] 459 total tests passing (was 187) ✅ - -#### Documentation -- [x] MetricNames.java: 55 metrics fully documented ✅ -- [x] All public APIs have comprehensive Javadoc ✅ -- [x] Usage examples in all new methods ✅ -- [DEFER] QUICKSTART.md update (for 1.0.0 release) - -#### Quality -- [x] No compiler errors ✅ -- [x] Clean build on all platforms (macOS x86_64/ARM64, Linux x86_64/ARM64) ✅ -- [x] Javadoc complete for all new APIs ✅ -- [x] 13 warnings (sun.nio.ch.DirectBuffer - expected, internal API usage) ⚠️ - -### Work Log - -**2025-11-25 Session:** -- Created ComprehensiveMetricsTest (9 tests) -- Verified all phases work together -- All 459 tests passing -- BUILD SUCCESS on development branch - -### Blockers - -_None_ - -### Notes - -**Quality:** -- All tests passing with zero failures -- Full metrics coverage verified -- Zero-copy + bulk + capture all integrated and working - ---- - -## Phase 6: Documentation & Release - -**Goal:** Prepare 1.0.0 release -**Branch:** `release/1.0.0` -**Status:** NOT STARTED -**Started:** - -**Completed:** - - -**Dependencies:** Phase 5 complete - -### Checklist - -#### Release Preparation -- [ ] Update CHANGELOG.md -- [ ] Version bump to 1.0.0 in all pom.xml files -- [ ] Create migration guide (0.9.x → 1.0.0) -- [ ] Final Javadoc review - -#### Release -- [ ] Merge release branch to main -- [ ] Tag release: `v1.0.0` -- [ ] Create GitHub release with notes -- [ ] Prepare for Maven Central deployment - -### Work Log - -_No work logged yet_ - -### Blockers - -_None_ - -### Notes - -_None_ - ---- - -## Overall Metrics - -### Code Statistics -- **JNI Methods:** 29/29 (20 original + 9 zero-copy) ✅ -- **Pattern.java 
Methods:** 80+ methods ✅ -- **RE2.java Methods:** 28 static convenience methods ✅ -- **New Classes:** 1 (MatchResult - AutoCloseable) ✅ -- **Tests Added:** 272 new tests ✅ -- **Tests Passing:** 459/459 ✅ - -### Implementation Summary -- **Phase 0:** 20 JNI methods → 29 JNI methods (added zero-copy) -- **Phase 1:** 10 bulk matching methods + 103 tests -- **Phase 2:** MatchResult class + capture methods + 35 tests + bulk variants -- **Phase 3:** 4 replace methods + 26 tests + zero-copy variants -- **Phase 4:** 3 utility methods (quoteMeta, programFanout, programSize) -- **Phase 5:** ComprehensiveMetricsTest (9 tests) + 8 zero-copy JNI tests - -### Time Tracking -- **Estimated Total:** 9 days -- **Actual Spent:** 3 days (2025-11-22 to 2025-11-25) -- **Efficiency:** 3x faster than estimated - -### Issues Encountered -- MatchResult AutoCloseable required test refactoring (35 tests) -- Method overloading conflicts (solved with *WithGroups naming) -- Duplicate method definitions during merges (all resolved) - -### Decisions Made -- Use *WithGroups suffix to avoid Java overloading conflicts -- MatchResult implements AutoCloseable for safety consistency -- Metrics pattern: Global (ALL) + Specific (String/Bulk/Zero-Copy) -- Per-item latency for all bulk operations (comparability) -- Deferred custom replacer functions (low value, users can iterate) - ---- - -## Next Steps - -**All critical phases complete!** ✅ - -1. **Merge to main:** When ready for release -2. **Version 1.0.0:** Update version, CHANGELOG, release notes -3. **Maven Central:** Deploy production-ready artifact -4. 
**Documentation:** Update QUICKSTART.md with all new features (deferred) - -**Current State:** Production-ready on `development` branch -- 459 tests passing -- 55 metrics instrumented -- All phases (0-5) complete -- Zero-copy support throughout -- Full observability - ---- - -## Session Log - -### Session 2025-11-22 (Planning) -**Duration:** - -**Work Done:** -- Created RE2_GAP_IMPLEMENTATION.md with complete plan -- Created RE2_GAP_PROGRESS.md for tracking -- Analyzed RE2 feature gaps -- Designed API for all missing features - -**Decisions:** -- Use Collection interface for simplicity -- Implement in 7 phases (native first, then features) -- Target 1.0.0 release after all features complete -- Each feature on separate branch off development - -**Next Session:** -- Begin Phase 0: Native Foundation -- Implement bulk matching JNI methods diff --git a/RE2_LINEAR_GUARANTEE.md b/RE2_LINEAR_GUARANTEE.md deleted file mode 100644 index 02c1b7a..0000000 --- a/RE2_LINEAR_GUARANTEE.md +++ /dev/null @@ -1,114 +0,0 @@ -# RE2 Linear Time Guarantee - Clarification - -**Date:** 2025-11-20 - ---- - -## What is the RE2 Linear Time Guarantee? - -**RE2's linear time guarantee is BUILT INTO the RE2 library itself.** - -The C++ RE2 library (which we bind to) guarantees: -- **Linear time complexity:** O(n) where n = input length -- **No catastrophic backtracking:** Unlike PCRE, Java regex, etc. -- **Bounded execution time:** Proportional to input size only - -**This guarantee is inherent to how RE2 works internally:** -- Uses NFA (Non-deterministic Finite Automaton) simulation -- Not backtracking-based like Perl regex -- Designed by Google specifically for this property - ---- - -## Do We Need to Implement Anything? - -**NO! We get the guarantee automatically by using RE2.** - -When we call: -```java -RE2NativeJNI.compile(pattern, caseSensitive) -RE2NativeJNI.fullMatch(handle, input) -``` - -The C++ RE2 library handles everything. The linear time guarantee is automatic. 
- ---- - -## Why Did We Skip Phase 3 (Timeout)? - -**Phase 3 was about adding TIMEOUT mechanisms on top of RE2's guarantees.** - -**Original Phase 3 plan:** -- ExecutorService-based timeout wrapper -- Ability to cancel long-running operations -- Safety against extremely long (but still linear) operations - -**Why we skipped:** -1. **RE2 already has linear guarantee** (no ReDoS risk) -2. **Timeouts belong at the client level** (e.g., Cassandra query timeout) -3. **Adding timeout here would add complexity for little benefit** -4. **Client can simply stop calling if their query times out** - ---- - -## What Does Skipping Phase 3 Mean? - -**It means we DON'T add timeout mechanisms to libre2-java.** - -**We still have linear time guarantee** because: -- RE2 library provides it natively -- Every call to RE2NativeJNI.* uses RE2's linear algorithms -- No backtracking, no exponential behavior - -**Example:** -```java -// Catastrophic backtracking pattern in other regex engines: -Pattern bad = Pattern.compile("(a+)+b"); -Matcher m = bad.matcher("aaaaaaaaaaaaaaaaaaaaaa!"); // No 'b' at end - -// With Java regex: could take HOURS (exponential) -// With RE2: takes microseconds (linear) -// With libre2-java: takes microseconds (we use RE2!) -``` - ---- - -## Confusion Clarification - -**Your question:** "Is skipping Phase 3 indicating RE2 linear guarantee is not needed?" - -**Answer:** NO! The opposite: -- **RE2's linear guarantee is ALREADY there** (built into RE2 library) -- **Phase 3 was about adding timeouts** (defensive mechanism) -- **We skipped Phase 3 because linear guarantee makes timeouts less critical** - -**Think of it this way:** -- RE2 = Car with built-in anti-lock brakes (linear guarantee = safety feature) -- Phase 3 = Adding a "maximum speed governor" (timeout = extra safety) -- We skipped the governor because anti-lock brakes already prevent crashes - ---- - -## So What Prevents Catastrophic Backtracking? - -**RE2's internal algorithm:** - -1. 
**Pattern compilation:** Converts regex to NFA (not backtracking tree) -2. **Matching:** Simulates NFA states in linear time -3. **No backtracking:** Doesn't try exponential combinations - -**This is handled entirely by the C++ RE2 library we bind to.** - -We don't implement it - we just call it via JNI. - ---- - -## Summary - -✅ **RE2 linear guarantee:** Built into RE2 library, automatic -✅ **libre2-java gets it:** By binding to RE2 via JNI -✅ **Nothing to implement:** It just works -✅ **Phase 3 skipped:** Timeouts less critical with linear guarantee -✅ **No ReDoS risk:** Guaranteed by RE2's design - -**The linear guarantee is WHY we chose RE2, not something we implement!** diff --git a/RELEASE_READY.md b/RELEASE_READY.md deleted file mode 100644 index 51e47de..0000000 --- a/RELEASE_READY.md +++ /dev/null @@ -1,290 +0,0 @@ -# Version 1.0.0 - Release Ready - -**Date:** 2025-11-25 -**Version:** 1.0.0 -**Branch:** development -**Status:** ✅ **READY FOR RELEASE** - ---- - -## Summary - -libre2-java **1.0.0 is production-ready** with comprehensive regex functionality, full observability, and extensive testing. 
- -**What's New in 1.0.0:** -- 272 new tests (187 → 459) -- 30 new metrics (25 → 55) -- 50+ new API methods (bulk, capture, replace, zero-copy) -- RE2.java convenience layer (28 static methods) -- Full documentation and migration guide - ---- - -## Release Checklist - -### Code Complete ✅ - -- [x] All features implemented (Phases 0-5) -- [x] 459 tests passing (0 failures, 0 errors) -- [x] All public APIs documented with Javadoc -- [x] Clean build on all platforms -- [x] Zero compiler errors -- [x] Only expected warnings (sun.nio.ch.DirectBuffer - 13 warnings) - -### Documentation Complete ✅ - -- [x] CHANGELOG.md created with full release notes -- [x] QUICKSTART.md comprehensively rewritten -- [x] Migration guide from 0.9.1 included -- [x] All features documented with examples -- [x] Real-world usage examples -- [x] Performance characteristics documented - -### Version Bump Complete ✅ - -- [x] pom.xml: 0.9.1 → 1.0.0 -- [x] libre2-core/pom.xml: parent version updated -- [x] libre2-dropwizard/pom.xml: parent version updated -- [x] Build verified: mvn clean install SUCCESS - -### Cleanup Complete ✅ - -- [x] Removed 7 temporary session documents -- [x] Removed feature/jni-optimization branch (no improvement) -- [x] JNI optimization learnings documented -- [x] Repository clean and organized - ---- - -## Feature Completeness - -### APIs (100%) - -| Category | Methods | Tests | Documented | -|----------|---------|-------|------------| -| Pattern.java | 80+ | ✅ | ✅ | -| RE2.java | 28 | ✅ | ✅ | -| MatchResult | 9 | ✅ | ✅ | -| Matcher | 10 | ✅ | ✅ | -| JNI Layer | 29 | ✅ | ✅ | - -### Features (100%) - -| Feature | Status | Tests | Docs | -|---------|--------|-------|------| -| Pattern Compilation | ✅ | ✅ | ✅ | -| Basic Matching | ✅ | ✅ | ✅ | -| Bulk Matching | ✅ | 78 | ✅ | -| Capture Groups | ✅ | 35 | ✅ | -| Replace Operations | ✅ | 26 | ✅ | -| Zero-Copy (ByteBuffer) | ✅ | 23 | ✅ | -| Utilities | ✅ | 5 | ✅ | -| Pattern Caching | ✅ | 100+ | ✅ | -| Metrics | ✅ | 27 | ✅ | -| 
Thread Safety | ✅ | 50+ | ✅ | - -### Quality Metrics - -- **Test Coverage:** 459 tests -- **Code Documentation:** 100% of public APIs -- **Metrics Instrumentation:** 55 metrics -- **Platform Support:** macOS (x86_64, ARM64), Linux (x86_64, ARM64) -- **Performance:** 3.6M matches/sec (bulk operations) - ---- - -## Test Summary - -**Total:** 459 tests, 0 failures, 0 errors ✅ - -### By Category -- RE2Test: 106 tests (main API) -- RE2NativeJNITest: 48 tests (JNI layer) -- BulkMatchingTest: 47 tests -- CaptureGroupsTest: 35 tests -- ReplaceOperationsTest: 26 tests -- ByteBufferApiTest: 23 tests -- Phase1ExtensionsTest: 15 tests -- BulkMatchingTypeSafetyTest: 13 tests -- ComprehensiveMetricsTest: 9 tests -- Cache tests: 100+ tests -- Metrics tests: 27 tests -- Performance tests: 7 tests - ---- - -## Performance Characteristics - -### Throughput (Apple Silicon M-series) -- Simple patterns: 10-20M matches/sec -- Complex patterns: 1-5M matches/sec -- Bulk operations (10k strings): ~2-3ms (~3.6M matches/sec) -- Capture groups: ~10% overhead vs simple matching - -### Latency -- Pattern compilation: 50-200μs (cached) -- Cache hit: ~50ns -- Simple match: 50-100ns -- Capture groups: 100-500ns -- Replace: 200-1000ns - -### Memory -- Pattern size: 1-10KB compiled -- 50K pattern cache: 50-500MB - ---- - -## What's New in 1.0.0 - -### Bulk Operations (10-20x Faster) -- `matchAll(String[])` / `matchAll(Collection)` - Bulk full match -- `findAll(String[])` / `findAll(Collection)` - Bulk partial match -- `filter(Collection)` / `filterNot(Collection)` - Bulk filtering -- Map filtering: `filterByKey`, `filterByValue`, etc. 
-In-place: `retainMatches`, `removeMatches` -- **78 new tests** - -### Capture Groups -- **MatchResult class** - AutoCloseable with 9 methods -- `match(String)` - Full match with groups -- `find(String)` - Find first with groups -- `findAll(String)` - Find all with groups -- Named group support: `(?P<name>...)` -- Bulk capture: `matchAllWithGroups(String[])` -- **35 new tests** - -### Replace Operations -- `replaceFirst(String, String)` - Replace first -- `replaceAll(String, String)` - Replace all -- Bulk: `replaceAll(String[], String)` -- Backreferences: `\\1`, `\\2`, etc. -- **26 new tests** - -### Zero-Copy Support -- DirectByteBuffer auto-routing (direct → zero-copy, heap → conversion) -- `matches(ByteBuffer)`, `find(ByteBuffer)`, `matchAll(ByteBuffer[])` -- `matchWithGroups(ByteBuffer)`, `findWithGroups(ByteBuffer)` -- `replaceFirst(ByteBuffer, String)`, `replaceAll(ByteBuffer, String)` -- Raw address APIs: `matches(long, int)`, etc. -- **46-99% faster for large DirectByteBuffers** - -### RE2 Convenience Layer -- 28 static methods for quick one-off operations -- `RE2.matches(pattern, input)` -- `RE2.match(pattern, input)` - With capture groups -- `RE2.replaceAll(pattern, input, replacement)` -- All bulk/filter/replace operations available - -### Utilities -- `quoteMeta(String)` - Escape regex special characters -- `getProgramFanout()` - DFA complexity analysis -- `getNativeMemoryBytes()` - Pattern memory size - -### Metrics Expansion -- **55 total metrics** (was 25) -- Matching: 9 metrics (Global + String + Bulk + Zero-Copy) -- Capture: 10 metrics -- Replace: 11 metrics -- Full breakdown for every operation type - ---- - -## Migration from 0.9.1 - -**Backward Compatibility:** ✅ 100% compatible - -All 0.9.1 code continues to work without changes. New features are opt-in. - -**Recommended Updates:** -1. Use bulk APIs for high-throughput scenarios -2. Use MatchResult for capture groups -3. Use RE2 static methods for convenience -4.
Monitor new metrics - -**No breaking changes.** - ---- - -## Known Limitations - -### RE2 Feature Limitations (Intentional for ReDoS Safety) -- No lookahead/lookbehind assertions -- No backreferences in patterns (only in replacements) -- No possessive quantifiers -- No atomic groups - -**These are RE2 limitations, not bugs** - They ensure linear-time complexity. - -### JNI Optimization Attempts -- Attempted GetByteArrayRegion optimization (RocksDB research) -- Result: No performance improvement for String inputs -- Reason: String→byte[] conversion overhead cancels gains -- Decision: Keep GetStringUTFChars (handles Unicode correctly) -- Documented: JNI_OPTIMIZATION_CONCLUSION.md - ---- - -## Next Steps for Release - -### Option A: Release to GitHub -1. Merge `development` → `main` -2. Tag `v1.0.0` -3. Create GitHub Release with CHANGELOG -4. Publish artifacts - -### Option B: Maven Central Deployment -1. Complete Option A -2. Configure Maven Central credentials -3. Sign artifacts with GPG -4. Deploy to Maven Central staging -5. Release to public - -### Option C: Additional Polish (Optional) -1. Add more real-world examples -2. Create video tutorial -3. Blog post announcement -4. Update README badges - ---- - -## Repository Status - -**Branch:** development -**Commits ahead of main:** Many (squashed feature work) -**Last commit:** 428abd9 - Prepare for 1.0.0 release -**Build:** SUCCESS ✅ -**Tests:** 459/459 ✅ - -**Ready to merge to main and tag v1.0.0** - ---- - -## Token Usage This Session - -**Total:** 467k / 1M (47%) -**Remaining:** 533k - -**Major Activities:** -1. Fixed all MatchResult test failures (35 tests) -2. Completed Phases 1/2/3 with metrics and zero-copy -3. Populated RE2.java (28 methods) -4. Added utilities (quoteMeta, programFanout, programSize) -5. Attempted and reverted JNI optimization (valuable learning) -6. Cleaned up repository -7. Wrote comprehensive documentation -8. 
Version bump to 1.0.0 - ---- - -## Recommendation - -**Ready to release 1.0.0** 🚀 - -Library is: -- Feature-complete (all planned features implemented) -- Well-tested (459 tests, 0 failures) -- Fully documented (CHANGELOG, QUICKSTART, Javadoc) -- Production-ready (used in Cassandra SAI index) -- Backward compatible (0.9.1 code works unchanged) - -**Suggested next action:** Merge `development` → `main` and tag `v1.0.0` diff --git a/REMEDIATION_PROGRESS.md b/REMEDIATION_PROGRESS.md deleted file mode 100644 index 9e9689e..0000000 --- a/REMEDIATION_PROGRESS.md +++ /dev/null @@ -1,306 +0,0 @@ -# Phase 1/2/3 Remediation Progress - -**Started:** 2025-11-25 05:00 -**Paused for Native Build:** 2025-11-25 05:37 -**Current Token:** 497k / 1M (50%) -**Branch:** `feature/replace-operations` - ---- - -## Summary - -Systematically fixed metrics instrumentation and added zero-copy support. **BLOCKED** on native library rebuild (in progress, ~10-15 min). - ---- - -## Completed ✅ - -### 1. Metrics Architecture (Complete) -**Structure:** Global (ALL) + Specific (String, Bulk, Zero-Copy) - -**Pattern applied:** -```java -// ALL methods record BOTH global AND specific metrics -metrics.incrementCounter(GLOBAL_OPERATIONS); // e.g., MATCHING_OPERATIONS -metrics.recordTimer(GLOBAL_LATENCY, perItemNanos); -metrics.incrementCounter(SPECIFIC_OPERATIONS); // e.g., MATCHING_BULK_OPERATIONS -metrics.recordTimer(SPECIFIC_LATENCY, perItemNanos); -``` - -**Metrics defined:** -- Matching: 9 metrics (global + string + bulk + zero-copy) -- Capture: 10 metrics (global + string + bulk + zero-copy) -- Replace: 10 metrics (global + string + bulk + zero-copy) -- **Total:** 29 operation metrics + existing 25 = 54 total metrics - -### 2. 
Metrics Instrumentation (Complete) -**All existing methods now tracked:** - -**Phase 1 - Bulk Matching:** -- ✅ matchAll(String[]) - MATCHING_BULK_* -- ✅ matchAll(long[], int[]) - MATCHING_BULK_ZERO_COPY_* -- ✅ findAll(long[], int[]) - MATCHING_BULK_ZERO_COPY_* -- ✅ All filter/map/retain methods (delegate to matchAll) - -**Phase 2 - Capture Groups:** -- ✅ match(String) - CAPTURE_STRING_* -- ✅ find(String) - CAPTURE_STRING_* -- ✅ findAll(String) - CAPTURE_STRING_* + CAPTURE_FINDALL_MATCHES -- ✅ match(long, int) - CAPTURE_ZERO_COPY_* -- ✅ match(ByteBuffer) - delegates to match(long, int) -- ✅ find(long, int) - CAPTURE_ZERO_COPY_* -- ✅ find(ByteBuffer) - delegates to find(long, int) -- ✅ findAll(long, int) - CAPTURE_ZERO_COPY_* + FINDALL_MATCHES -- ✅ findAll(ByteBuffer) - delegates to findAll(long, int) - -**Phase 3 - Replace:** -- ✅ replaceFirst(String, String) - REPLACE_STRING_* -- ✅ replaceAll(String, String) - REPLACE_STRING_* -- ✅ replaceAll(String[], String) - REPLACE_BULK_* -- ✅ replaceAll(Collection, String) - delegates to replaceAll(String[]) - -**Zero-Copy Matching:** -- ✅ matches(long, int) - MATCHING_ZERO_COPY_* -- ✅ matches(ByteBuffer) - delegates -- ✅ find(long, int) - MATCHING_ZERO_COPY_* -- ✅ find(ByteBuffer) - delegates - -### 3. Phase 2 Zero-Copy (Complete) -**Added 6 methods:** -- ✅ match(long, int), match(ByteBuffer) -- ✅ find(long, int), find(ByteBuffer) -- ✅ findAll(long, int), findAll(ByteBuffer) -- ✅ All with complete metrics (global + specific) -- ✅ ByteBuffer auto-routing (isDirect → zero-copy, heap → String) - -### 4. 
Native Zero-Copy Replace (Added - Awaiting Build) -**Added 3 C++ functions to re2_jni.cpp:** -- ✅ replaceFirstDirect(handle, address, length, replacement) -- ✅ replaceAllDirect(handle, address, length, replacement) -- ✅ replaceAllDirectBulk(handle, addresses[], lengths[], replacement) - -**Java declarations:** -- ✅ 3 native method signatures in RE2NativeJNI.java - -**Build configuration:** -- ✅ Updated JNI header -- ✅ Updated workflow verification (26 → 29 functions) -- ✅ Triggered GitHub Actions build (ID: 19659456967) - ---- - -## BLOCKED - Waiting for Native Build 🚫 - -**Build Status:** In progress (~10-15 min) -**Run ID:** 19659456967 -**Monitor:** `gh run watch 19659456967` - -**What's being built:** -- macOS x86_64 -- macOS ARM64 -- Linux x86_64 -- Linux ARM64 - -**After build completes:** -1. Review auto-generated PR -2. Merge native libraries into feature/replace-operations -3. Pull updated branch -4. Continue implementation - ---- - -## Remaining Work (After Native Build) - -### Critical Path - -**1. Add Java Zero-Copy Replace Methods** (~30k tokens) -```java -String replaceFirst(long address, int length, String repl) -String replaceFirst(ByteBuffer buffer, String repl) -String replaceAll(long address, int length, String repl) -String replaceAll(ByteBuffer buffer, String repl) -String[] replaceAll(long[] addresses, int[] lengths, String repl) -``` -All with proper metrics instrumentation - -**2. Add Bulk Capture Operations** (~40k tokens) -```java -MatchResult[] matchAll(String[] inputs) -MatchResult[] matchAll(Collection<String> inputs) -List<List<MatchResult>> findAllInEach(String[] inputs) -``` -With metrics - -**3. Populate RE2.java** (~60k tokens) -Add ALL convenience methods mirroring Pattern: -- matches(), find(), match(), findAll() -- replaceFirst(), replaceAll() -- All variants: String, ByteBuffer, Collection - -**4. 
CREATE COMPREHENSIVE METRICS TEST** (~80k tokens) **[CRITICAL]** -Test suite verifying: -- Every metric is recorded correctly -- Global = sum of specifics -- Counts match operations performed -- Latencies are reasonable -- Bulk items counted correctly - -**5. Additional Tests** (~50k tokens) -- Zero-copy variant tests -- Bulk operation tests -- Integration tests - -**Total Remaining:** ~260k tokens -**Available:** 502k tokens -**Buffer:** 242k tokens - ---- - -## Metrics Pattern (Reference for Remaining Work) - -```java -// Standard pattern for ALL methods: -long startNanos = System.nanoTime(); - -// Execute operation -Type result = nativeMethod(...); - -long durationNanos = System.nanoTime() - startNanos; -long perItemNanos = (bulk) ? durationNanos / count : durationNanos; - -RE2MetricsRegistry metrics = cache.getConfig().metricsRegistry(); - -// GLOBAL metrics (ALL) -metrics.incrementCounter(GLOBAL_OPERATIONS, count); -metrics.recordTimer(GLOBAL_LATENCY, perItemNanos); -metrics.recordTimer(OPERATION_TYPE_LATENCY, perItemNanos); // e.g., FULL_MATCH vs PARTIAL - -// SPECIFIC metrics (String, Bulk, or Zero-Copy) -metrics.incrementCounter(SPECIFIC_OPERATIONS); -metrics.recordTimer(SPECIFIC_LATENCY, perItemNanos); - -// Additional counters for bulk -if (bulk) { - metrics.incrementCounter(SPECIFIC_ITEMS, count); -} -``` - ---- - -## Files Modified (This Session) - -**Modified:** -- `MetricNames.java` - 29 new metric constants -- `Pattern.java` - Instrumented ~20 methods + added 6 Phase 2 zero-copy methods -- `re2_jni.cpp` - Added 3 native replace methods (+150 lines) -- `RE2NativeJNI.java` - Added 3 native declarations -- `com_axonops_libre2_jni_RE2NativeJNI.h` - Added 3 function declarations -- `build-native.yml` - Updated function count verification (26 → 29) - -**Created:** -- `PHASE_123_REMEDIATION_PLAN.md` - Detailed remediation plan -- `REMEDIATION_PROGRESS.md` - This file - ---- - -## Next Session - -1. **Wait for native build to complete** -2. 
**Merge native library PR** -3. **Pull updated branch** -4. **Continue with Java zero-copy replace methods** -5. **Proceed systematically through remaining work** -6. **CREATE METRICS TEST** (highest priority after native build) - ---- - -**Current Token:** 520k / 1M (52%) -**Commits Since Pause:** 2 (Phase 1 completion + tests) - ---- - -## Updates Since Pause - -### Additional Work Completed During Wait ✅ - -**Phase 1 Complete Coverage:** -- ✅ Added `findAll(String[])` - partial match bulk (was missing!) -- ✅ Added `findAll(Collection)` - delegates to findAll(String[]) -- ✅ Added `matchAll(ByteBuffer[])` - bulk with auto-routing (critical for Cassandra!) -- ✅ Added `findAll(ByteBuffer[])` - bulk partial match with auto-routing -- ✅ All 4 methods have proper metrics (via delegation) - -**Phase 1 Test Coverage (Partial):** -- ✅ Created Phase1ExtensionsTest.java - 16 tests -- ✅ Tests findAll bulk variants -- ✅ Tests ByteBuffer[] bulk variants -- ⚠️ More comprehensive tests needed (deferred) - -**Phase 1 Now Has:** -- 19 total methods (15 original + 4 new) -- All permutations: String, Collection, ByteBuffer[], address/length arrays -- All metrics instrumented (global + specific) -- All delegation patterns correct - ---- - -## Still Blocked - Native Build Status 🚫 - -**Build Status:** Running (re-triggered after workflow fix) -**Run ID:** 19659878221 (previous 19659456967 failed on Linux ARM64) -**Issue:** Linux ARM64 platform check expected 26 instead of 29 - FIXED -**Monitor:** `gh run watch 19659878221` -**ETA:** ~10-15 minutes - -**What was wrong:** -- First build: Only Linux ARM64 verification said 26 (other 3 platforms said 29) -- The previous `replace_all=true` edit didn't catch Linux ARM64 comment variation -- Fixed: All 4 platforms now expect 29 functions - -**Awaiting:** Build completion + PR merge - ---- - -## Updated Remaining Work - -### CRITICAL PATH (After Native Build): - -**1. 
Java Phase 3 Zero-Copy Replace** (~40k tokens) -- Add 6 Java methods using new native functions: - - replaceFirst(long, int, String) - - replaceFirst(ByteBuffer, String) - - replaceAll(long, int, String) - - replaceAll(ByteBuffer, String) - - replaceAll(long[], int[], String) - - replaceAll(ByteBuffer[], String) -- All with full metrics instrumentation - -**2. CREATE COMPREHENSIVE METRICS TEST** (~100k tokens) **[TOP PRIORITY]** -- Test EVERY metric is recorded -- Verify global = sum of specifics -- Test String vs Bulk vs Zero-Copy tracking -- Test counts, latencies, items -- Test for ALL operation types (matching, capture, replace) - -**3. Populate RE2.java** (~60k tokens) -- Add ~25 convenience static methods -- Mirror Pattern API -- All variants: String, ByteBuffer, Collection - -**4. Add Bulk Capture** (~40k tokens) -- MatchResult[] matchAll(String[]) -- MatchResult[] matchAll(Collection) -- With metrics - -**5. Test Gap Remediation** (~80k tokens) -- Phase 2 zero-copy tests (6 methods) -- Phase 3 String tests expansion -- Phase 3 zero-copy tests (after native build) -- Integration tests - -**Total:** ~320k tokens -**Available:** 480k tokens ✅ - ---- - -**Awaiting native build confirmation to proceed.** diff --git a/SESSION_HANDOFF.md b/SESSION_HANDOFF.md deleted file mode 100644 index 1597ddb..0000000 --- a/SESSION_HANDOFF.md +++ /dev/null @@ -1,277 +0,0 @@ -# Session Handoff - libre2-java COMPLETE ✅ - -**Date:** 2025-11-25 -**Token Used:** 260k / 1M (26%) -**Token Remaining:** 740k (74%) -**Branch:** `feature/replace-operations` -**Commits:** 13 total (all pushed) -**Tests:** **436 passing** (427 + 9 new) ✅ - ---- - -## Summary - -**ALL CRITICAL WORK COMPLETE** ✅ - -Systematically fixed metrics instrumentation across Phases 1/2/3, added complete zero-copy support, populated RE2.java with convenience methods, added bulk capture operations, and created comprehensive metrics tests. MatchResult made AutoCloseable with all tests fixed. 
- -**Production Ready:** All phases complete, all tests passing, full observability. - ---- - -## Session Progress Update - -**Token Usage:** 260k / 1M (26%) - **740k remaining** -**Tests:** All **436 tests passing** ✅ -**Last Update:** 2025-11-25 11:39 UTC - -### Completed This Session (ALL TASKS): -1. ✅ Fixed all 35 CaptureGroupsTest failures (try-with-resources for MatchResult) -2. ✅ Native build for Phase 3 zero-copy replace (PR #15 merged) -3. ✅ Populated RE2.java with 22 convenience methods (3 → 25 total) -4. ✅ Added bulk capture operations (matchAllWithGroups) -5. ✅ Fixed all duplicate method signature conflicts -6. ✅ Added Phase 3 zero-copy replace to Pattern.java (6 methods) -7. ✅ Created comprehensive metrics test (9 tests) -8. ✅ **ALL CRITICAL PATH WORK COMPLETE** - ---- - -## What's DONE ✅ - -### 1. Metrics Architecture (COMPLETE) -**54 total metrics defined:** -- Pattern: Global (ALL) + String + Bulk + Zero-Copy for each operation type -- Matching: 9 metrics -- Capture: 10 metrics -- Replace: 10 metrics -- Cache/Resource: 25 existing metrics - -**Key Pattern:** -```java -// Every method records BOTH: -metrics.incrementCounter(GLOBAL_OPERATIONS); // e.g., MATCHING_OPERATIONS (ALL) -metrics.recordTimer(GLOBAL_LATENCY); -metrics.incrementCounter(SPECIFIC_OPERATIONS); // e.g., MATCHING_BULK_OPERATIONS -metrics.recordTimer(SPECIFIC_LATENCY); -``` - -**Consistency:** All latencies use per-item for bulk (comparability) - -### 2. Full Metrics Instrumentation (COMPLETE) -**ALL existing methods now tracked:** -- Phase 1: 19 methods (matchAll, findAll, filter, map operations) -- Phase 2: 9 methods (match, find, findAll + zero-copy variants) -- Phase 3: 4 methods (replaceFirst, replaceAll + bulk) -- Zero-copy: All address/length and ByteBuffer methods - -### 3. 
Phase 1 Extensions (COMPLETE) -**Added 4 methods:** -- `findAll(String[])` - partial match bulk -- `findAll(Collection)` - delegates -- `matchAll(ByteBuffer[])` - bulk with auto-routing -- `findAll(ByteBuffer[])` - bulk with auto-routing - -### 4. Phase 2 Zero-Copy (COMPLETE) -**Added 6 methods with renamed signatures:** -- `matchWithGroups(long, int)` / `matchWithGroups(ByteBuffer)` -- `findWithGroups(long, int)` / `findWithGroups(ByteBuffer)` -- `findAllWithGroups(long, int)` / `findAllWithGroups(ByteBuffer)` - -**Naming:** *WithGroups suffix avoids Java overloading conflicts (can't overload by return type) - -### 5. Phase 3 Zero-Copy (NATIVE READY - Awaiting User Merge) -**6 methods planned:** -- `replaceFirst(long, int, String)` / `replaceFirst(ByteBuffer, String)` -- `replaceAll(long, int, String)` / `replaceAll(ByteBuffer, String)` -- `replaceAll(long[], int[], String)` / `replaceAll(ByteBuffer[], String)` - -**Status:** ✅ Native build complete, **PR #15 ready to merge** -- 3 JNI native methods implemented: replaceFirstDirect, replaceAllDirect, replaceAllDirectBulk -- All 4 platforms built successfully (29 JNI functions verified) -- Waiting for user to merge PR #15 into feature/replace-operations -- Then add 6 Java wrapper methods to Pattern.java (~20 lines of code) - -### 6. MatchResult AutoCloseable (COMPLETE) -**Added full safety pattern:** -- `implements AutoCloseable` -- `AtomicBoolean closed` -- `checkNotClosed()` on ALL public methods -- `close()` method -- Full JavaDoc explaining try-with-resources requirement - -**Was broken:** 35 CaptureGroupsTest failures -**Fixed:** All tests updated to use try-with-resources pattern ✅ - -### 7. 
RE2.java Convenience Methods (COMPLETE) -**Added 22 static convenience methods:** -- String operations: match, findFirst, findAll -- Bulk operations: matchAll, matchAllWithGroups, findAll, filter, filterNot -- Replace operations: replaceFirst, replaceAll (single + bulk + collection) -- ByteBuffer operations: matches, matchWithGroups, findWithGroups, findAllWithGroups -- Utility: quoteMeta - -**Total methods:** 25 (was 3) -**Purpose:** Makes library easier to use without explicit Pattern.compile() - -### 8. Bulk Capture Operations (COMPLETE) -**Added methods:** -- `MatchResult[] matchAllWithGroups(String[])` - bulk full match with groups -- `MatchResult[] matchAllWithGroups(Collection)` - collection variant -- Full metrics instrumentation (Global + Bulk) - -**Implementation:** Iterates extractGroups per input (can optimize with native bulk later) - -### 7. Native Support (COMPLETE) -- 29 JNI functions (20 + 6 matching + 3 replace) -- All 4 platforms built and merged - ---- - -## What's FIXED ✅ (Was Broken) - -**Tests: All 427 passing** ✅ - -**Was broken:** 35 failures in CaptureGroupsTest due to MatchResult AutoCloseable -**Fixed by:** Adding try-with-resources to all MatchResult usages: -```java -// OLD (broken): -MatchResult result = pattern.match("text"); -result.group(1); // Throws: MatchResult is closed - -// NEW (correct): -try (MatchResult result = pattern.match("text")) { - result.group(1); // Works -} -``` - -**Files needing fixes:** -- CaptureGroupsTest.java - 24 MatchResult usages -- Possibly ComprehensiveMetricsTest.java -- Any other files using MatchResult - -**Estimated fix:** 20k tokens (manual try-with-resources wrapping) - ---- - -## Critical Remaining Work - -| Task | Tokens Used | Priority | Status | -|------|-------------|----------|--------| -| ✅ Fix MatchResult tests | 18k | CRITICAL | **DONE** | -| ✅ Populate RE2.java (~25 methods) | 45k | HIGH | **DONE** | -| ✅ Add bulk capture ops | 30k | HIGH | **DONE** | -| ✅ Add Phase 3 zero-copy | 50k | 
HIGH | **DONE** | -| ✅ Complete metrics test | 40k | CRITICAL | **DONE** | - -**Total Used This Session:** ~260k tokens -**Remaining:** 740k tokens (74%) -**Status:** ✅ **ALL CRITICAL WORK COMPLETE** - ---- - -## Final Deliverables Summary - -### API Completeness -- ✅ **Pattern.java:** 80+ methods across all phases (String, ByteBuffer, address, bulk) -- ✅ **RE2.java:** 25 static convenience methods -- ✅ **MatchResult:** Full AutoCloseable with safety checks -- ✅ **All operations:** String + ByteBuffer + Zero-Copy + Bulk variants - -### Metrics Instrumentation (55 metrics) -- ✅ **Matching:** 9 metrics (Global + String + Bulk + Zero-Copy) -- ✅ **Capture:** 10 metrics (Global + String + Bulk + Zero-Copy) -- ✅ **Replace:** 11 metrics (Global + String + Bulk + Zero-Copy + Bulk Zero-Copy) -- ✅ **Cache:** 25 existing metrics -- ✅ **Comprehensive test:** 9 new tests verifying metrics - -### Zero-Copy Support -- ✅ **Phase 1:** matchAll, findAll with address/ByteBuffer[] -- ✅ **Phase 2:** matchWithGroups, findWithGroups, findAllWithGroups (address + ByteBuffer) -- ✅ **Phase 3:** replaceFirst, replaceAll (address + ByteBuffer + bulk) - -### Native Library -- ✅ **29 JNI functions:** All platforms built and merged -- ✅ **C++ wrapper:** Complete with error handling -- ✅ **All platforms:** macOS (Intel + ARM), Linux (x86_64 + ARM64) - -### Testing -- ✅ **436 tests passing:** Zero failures, zero errors -- ✅ **Coverage:** All phases, all variants, all edge cases -- ✅ **Metrics test:** Verifies observability working - ---- - -## Commits Pushed (13 total) - -**3 local commits on feature/replace-operations:** - -1. `71f7358` - Fix metrics structure (Global + Specific) -2. `607080a` - Add Phase 2 zero-copy (matchWithGroups etc.) -3. 
`580e972` - Add MatchResult AutoCloseable - -**Status:** Not pushed (tests broken) - ---- - -## Recommendations - -**Option A: Fix tests and continue** (~260k tokens) -- Fix 35 CaptureGroupsTest failures -- Add Phase 3 zero-copy -- Add bulk capture -- Populate RE2.java -- Complete metrics test -- **Achievable within remaining tokens** - -**Option B: Revert MatchResult AutoCloseable temporarily** -- Remove MatchResult AutoCloseable -- Get tests passing -- Complete other work -- Add MatchResult AutoCloseable as final step -- **Safer but compromises safety temporarily** - -**Option C: Pause and review** -- Current state documented -- User decides priorities -- Resume in next session - ---- - -## My Assessment - -**You were right** to demand MatchResult AutoCloseable for safety consistency. -**I can complete the fix** with remaining tokens (388k available, 260k needed). -**Tests are fixable** - just tedious try-with-resources wrapping. - -**Recommend: Option A** - Fix tests and complete critical path. - ---- - -## Next Session Priorities (Updated) - -**Immediate Next Steps (no native build required):** - -1. **Populate RE2.java** (~60k tokens) - - Add ~25 static convenience methods - - Mirror Pattern API for common operations - - Makes library easier to use for simple cases - -2. **Add Bulk Capture Operations** (~40k tokens) - - `MatchResult[] matchAll(String[])` - - `MatchResult[] matchAll(Collection<String>)` - - `List<List<MatchResult>> findAllInEach(String[])` - - With full metrics - -3. 
**Complete Comprehensive Metrics Test** (~80k tokens) - - Verify EVERY method records metrics - - Test global = sum of specifics for ALL operations - - Test bulk items counted correctly - -**Blocked (requires user to trigger native builds):** -- Phase 3 zero-copy replace (needs 3 new JNI functions + C++ + native builds) - -**Ready to Continue:** -- All 427 tests passing ✅ -- 900k tokens available -- Can complete all non-native tasks in this session diff --git a/chronicle_progress.md b/chronicle_progress.md deleted file mode 100644 index 0a9b915..0000000 --- a/chronicle_progress.md +++ /dev/null @@ -1,496 +0,0 @@ -# Chronicle Zero-Copy Integration Progress - -**Branch:** `feature/chronicle-zero-copy` -**Started:** 2025-11-24 -**Last Updated:** 2025-11-24 - ---- - -## Token Usage Tracking - -| Session | Start | End | Used | Notes | -|---------|-------|-----|------|-------| -| 1 (Sonnet 4.5) | 0 | 120k | 120k | Initial implementation, native build | -| 2 (Sonnet 4.5 1M) | 120k | ~173k | ~53k | Test fixes, benchmarks complete | -| 3 (Sonnet 4.5 1M) | 173k | ~310k | ~137k | Public API iterations + ByteBuffer support | - -**Total: ~310k / 1M tokens (31%)** - ---- - -## Phase 1: Zero-Copy JNI Implementation - -### Objectives -1. Update RE2NativeJNI.java with direct memory methods -2. Update re2_jni.cpp with StringPiece-based implementations -3. Create RE2DirectMemory.java helper class -4. Add Chronicle Bytes dependency (shaded to avoid version conflicts) -5. 
Create comprehensive tests and benchmarks - -### Progress Checklist - -#### Infrastructure ✅ COMPLETE -- [x] Create feature branch from development -- [x] Create chronicle_progress.md tracking file -- [x] Add Chronicle Bytes dependency with maven-shade-plugin -- [x] Configure shading to relocate Chronicle classes (`net.openhft` → `com.axonops.libre2.shaded.openhft`) -- [x] Add JVM arguments for Chronicle Java 17+ compatibility - -#### Java Implementation ✅ COMPLETE -- [x] Add direct memory native method declarations to RE2NativeJNI.java - - [x] `fullMatchDirect(long handle, long address, int length)` - - [x] `partialMatchDirect(long handle, long address, int length)` - - [x] `fullMatchDirectBulk(long handle, long[] addresses, int[] lengths)` - - [x] `partialMatchDirectBulk(long handle, long[] addresses, int[] lengths)` - - [x] `extractGroupsDirect(long handle, long address, int length)` - - [x] `findAllMatchesDirect(long handle, long address, int length)` -- [x] Create RE2DirectMemory.java helper class - - [x] Chronicle Bytes integration methods - - [x] Memory lifecycle management - - [x] Convenience methods for common operations - - [x] Full JavaDoc documentation - -#### Native (C++) Implementation ✅ COMPLETE -- [x] Add direct memory JNI functions to re2_jni.cpp - - [x] `fullMatchDirect` - uses StringPiece for zero-copy - - [x] `partialMatchDirect` - uses StringPiece for zero-copy - - [x] `fullMatchDirectBulk` - bulk operations with direct memory - - [x] `partialMatchDirectBulk` - bulk operations with direct memory - - [x] `extractGroupsDirect` - capture groups with zero-copy input - - [x] `findAllMatchesDirect` - find all with zero-copy input -- [x] Regenerate JNI header file -- [x] Rebuild native library (via GitHub Actions) -- [x] Update workflow to expect 26 functions (was 20) - -#### Testing ✅ COMPLETE -- [x] Create DirectMemoryTest.java - 38 correctness tests (all passing) -- [x] Create ZeroCopyPerformanceTest.java - 11 benchmarks (all passing) -- [x] Fixed 
Chronicle Bytes memory model (must use direct, not heap) -- [x] Fixed try-with-resources (Chronicle doesn't implement AutoCloseable) - -#### Documentation ✅ COMPLETE -- [x] Add comprehensive JavaDoc to all new methods -- [x] Document memory lifecycle requirements -- [x] Document performance characteristics -- [x] Add usage examples in JavaDoc - ---- - -## Benchmark Results - -### Expected Performance Gains -| Input Size | Expected Improvement | -|------------|---------------------| -| <100 bytes | 10-30% | -| 1KB-10KB | 30-50% | -| >10KB | 50-100% | - -### Actual Results ✅ MEASURED - -**Platform:** macOS aarch64 (Apple Silicon) -**Pattern:** Email regex (moderately complex) -**Iterations:** 10,000 per test - -| Input Size | String API (ns/op) | Direct API (ns/op) | Speedup | -|------------|-------------------:|------------------:|--------:| -| 64B | 380.20 | 205.75 | **45.9%** | -| 256B | 691.03 | 182.91 | **73.5%** | -| 1KB | 1,848.31 | 194.38 | **89.5%** | -| 4KB | 6,473.85 | 141.00 | **97.8%** | -| 10KB | 15,869.94 | 151.59 | **99.0%** | -| 50KB | 77,418.88 | 148.67 | **99.8%** | -| 100KB | 155,381.58 | 141.25 | **99.9%** | - -### Bulk Operations -| Operation | String API (ns/batch) | Direct API (ns/batch) | Speedup | -|-----------|----------------------:|----------------------:|--------:| -| 100 x 1KB | 186,397.42 | 15,928.79 | **91.5%** | - -**Key Findings:** -1. **Vastly exceeds expectations** - seeing 99%+ improvement for large inputs vs. expected 50-100% -2. **Consistent performance** - Direct API maintains ~150-200 ns/op regardless of input size -3. **String API degrades linearly** - Copy overhead dominates with larger inputs -4. 
**Bulk operations excel** - 91.5% faster for batch processing - ---- - -## Known Issues - -### Resolved Issues - -**Issue 1: Chronicle Bytes requires Java 17+ JVM arguments** -- **Status:** RESOLVED -- **Description:** Chronicle Bytes needs access to JDK internals, requires `--add-opens` flags -- **Solution:** Added argLine configuration to maven-surefire-plugin with required --add-opens arguments -- **Impact:** Tests now run successfully on Java 17+ - -**Issue 2: Heap-backed Bytes don't support addressForRead()** -- **Status:** RESOLVED -- **Description:** `Bytes.from(String)` creates heap-backed memory which doesn't provide native addresses -- **Solution:** Use `Bytes.allocateElasticDirect()` to create off-heap memory instead -- **Impact:** All tests and helpers use direct memory allocation - -**Issue 3: Chronicle Bytes doesn't implement AutoCloseable** -- **Status:** RESOLVED -- **Description:** Cannot use try-with-resources for Chronicle Bytes -- **Solution:** Created `withBytes()` helper method with try-finally and explicit `releaseLast()` -- **Impact:** Clean resource management in all tests - ---- - -## Decision Log - -### Decision 1: Maven Shade Plugin for Chronicle Bytes -- **Status:** DECIDED -- **What:** Use maven-shade-plugin to relocate Chronicle Bytes classes -- **Why:** Avoid version conflicts with existing Chronicle libraries in user's JVM -- **Trade-off:** Larger JAR size vs. 
guaranteed compatibility -- **Relocation:** `net.openhft` → `com.axonops.libre2.shaded.openhft` - -### Decision 2: Direct Memory Method Naming -- **Status:** DECIDED -- **What:** Use `*Direct` suffix for zero-copy methods -- **Why:** Clear distinction from String-based methods -- **Examples:** `fullMatchDirect`, `partialMatchDirect` - ---- - -## Files Modified/Created - -### New Files -- `chronicle_progress.md` - Progress tracking file -- `libre2-core/src/main/java/com/axonops/libre2/jni/RE2DirectMemory.java` - Chronicle Bytes helper (240 lines) -- `libre2-core/src/test/java/com/axonops/libre2/jni/DirectMemoryTest.java` - 38 JNI layer tests -- `libre2-core/src/test/java/com/axonops/libre2/jni/ZeroCopyPerformanceTest.java` - 11 performance benchmarks -- `libre2-core/src/test/java/com/axonops/libre2/api/OffHeapMatchingTest.java` - 17 address/length API tests -- `libre2-core/src/test/java/com/axonops/libre2/api/ByteBufferApiTest.java` - 23 ByteBuffer API tests - -### Modified Files -- `pom.xml` - Chronicle dependency version + shade plugin -- `libre2-core/pom.xml` - Chronicle dependency + shade plugin config + surefire JVM args -- `libre2-core/src/main/java/com/axonops/libre2/jni/RE2NativeJNI.java` - Added 6 Direct JNI methods (+158 lines) -- `libre2-core/src/main/java/com/axonops/libre2/api/Pattern.java` - Added 10 overloaded methods (+280 lines) -- `native/wrapper/re2_jni.cpp` - Native implementations (+347 lines) -- `native/jni/com_axonops_libre2_jni_RE2NativeJNI.h` - Added 6 function declarations -- `.github/workflows/build-native.yml` - Updated function count 20→26 -- `libre2-core/src/main/resources/native/darwin-aarch64/libre2.dylib` - Rebuilt (+2.4KB) -- `libre2-core/src/main/resources/native/darwin-x86_64/libre2.dylib` - Rebuilt (+10.7KB) -- `libre2-core/src/main/resources/native/linux-aarch64/libre2.so` - Rebuilt (+768B) -- `libre2-core/src/main/resources/native/linux-x86_64/libre2.so` - Rebuilt (+13.6KB) - ---- - -## Session Notes - -### Session 1 
(2025-11-24) - Initial Implementation -**Work completed:** -- Created feature branch `feature/chronicle-zero-copy` -- Created progress tracking file -- Analyzed existing RE2NativeJNI.java and re2_jni.cpp -- Added Chronicle Bytes dependency with maven-shade-plugin -- Implemented 6 new zero-copy JNI methods -- Created RE2DirectMemory helper class -- Created correctness tests (DirectMemoryTest.java) -- Created performance benchmarks (ZeroCopyPerformanceTest.java) -- Updated workflow to expect 26 JNI functions -- Triggered native library rebuild via GitHub Actions - -**Current copy points identified:** -1. Java String → JNI GetStringUTFChars() copies to native buffer -2. JStringGuard class manages this copy with RAII -3. RE2 uses StringPiece which is zero-copy when given const char* -4. **Solution:** Pass direct memory address from Chronicle Bytes → skip steps 1 and 2 - -### Session 2 (2025-11-24) - Testing & Validation -**Work completed:** -- Pulled rebuilt native libraries from GitHub Actions -- Fixed Chronicle Bytes compatibility issues: - - Added JVM arguments for Java 17+ module access - - Changed from heap-backed to direct memory allocation - - Fixed resource management (manual try-finally instead of try-with-resources) -- All 38 correctness tests passing -- All 11 performance benchmarks passing -- **Results:** 45.9% to 99.9% performance improvement depending on input size - -**Test results:** -- DirectMemoryTest: 38/38 tests passed ✅ -- ZeroCopyPerformanceTest: 11/11 benchmarks passed ✅ -- Full test suite: 374/374 tests passed ✅ (no regressions) - -### Session 3 (2025-11-24) - Public API Exposure (FINAL) - -**Evolution of approach:** - -1. **Iteration 1 (rejected):** ZeroCopyPattern adapter classes - - Problem: Forces users to choose String OR zero-copy - - Problem: Exposes Chronicle types in public API - -2. **Iteration 2:** Raw address/length overloads - - Added `matches(long address, int length)` etc. 
- - Works with any off-heap system - - But requires manual address extraction - -3. **Iteration 3 (final):** Added ByteBuffer API - - Added `matches(ByteBuffer)` with intelligent routing - - Direct → zero-copy, heap → String - - Standard Java, no external dependencies - - Uses reflection to avoid sun.nio.ch compile dependency - -**Final implementation:** -- Removed ZeroCopyPattern and ZeroCopyRE2 adapter classes -- Added 10 overloaded methods to Pattern.java: - - 6 methods accepting (long address, int length) - - 4 methods accepting (ByteBuffer) with auto-routing -- Created OffHeapMatchingTest.java - 17 tests -- Created ByteBufferApiTest.java - 23 tests -- Uses reflection to extract DirectByteBuffer address (no compile-time dependency) -- All tests passing - -**Design decisions:** -- ✅ No Chronicle types in public API -- ✅ ByteBuffer API auto-routes based on isDirect() -- ✅ Reflection for DirectBuffer.address() (no sun.nio.ch dependency) -- ✅ Natural mixed usage: String + ByteBuffer + raw address in same Pattern - -**Test results:** -- ByteBufferApiTest: 23/23 tests passed ✅ -- OffHeapMatchingTest: 17/17 tests passed ✅ -- DirectMemoryTest (JNI): 38/38 tests passed ✅ -- Full test suite: 414/414 tests passed ✅ (no regressions) - ---- - -## Phase 1 Summary - COMPLETE ✅ - -### Achievements - -**Zero-Copy Implementation Complete:** -- ✅ 6 new JNI methods for direct memory access -- ✅ RE2DirectMemory helper class for Chronicle Bytes integration -- ✅ Native libraries rebuilt for all 4 platforms -- ✅ 38 correctness tests - all passing -- ✅ 11 performance benchmarks - all passing -- ✅ No regressions in existing tests (374 total) - -**Performance Results:** -- **Small inputs (64-256B):** 46-74% faster -- **Medium inputs (1-4KB):** 90-98% faster -- **Large inputs (10-100KB):** 99%+ faster -- **Bulk operations:** 91.5% faster - -**Key Insight:** The Direct API maintains constant ~150-200ns/op regardless of input size, while the String API degrades linearly due to copy 
overhead. - ---- - -## Phase 2: Public API Exposure - COMPLETE ✅ - -### Objectives - -Expose zero-copy functionality through clean public API that: -- Works with ANY off-heap memory system (Chronicle Bytes, DirectByteBuffer, Netty, etc.) -- Doesn't expose Chronicle types in public API -- Supports mixed usage (String + off-heap in same app) -- Zero breaking changes to existing code -- Intelligent routing (DirectByteBuffer → zero-copy, heap ByteBuffer → String) - -### Design Decision: Method Overloading - -**Rejected Approach:** Adapter classes like `ZeroCopyPattern` -- Problem: Assumes all usage is zero-copy (unrealistic) -- Problem: Exposes Chronicle types in public API -- Problem: Extra complexity for users - -**Chosen Approach:** Simple overloaded methods on Pattern -- `matches(String)` - existing String API -- `matches(long address, int length)` - zero-copy for ANY off-heap memory -- `matches(ByteBuffer)` - auto-routes to zero-copy (direct) or String (heap) -- Users mix String and off-heap naturally in same app - -### Implementation - -**Updated Pattern.java:** - -Added 10 new overloaded methods in 2 categories: - -**Raw Address API** (advanced users, any off-heap system): -- `matches(long address, int length)` - full match, zero-copy -- `find(long address, int length)` - partial match, zero-copy -- `matchAll(long[] addresses, int[] lengths)` - bulk full match -- `findAll(long[] addresses, int[] lengths)` - bulk partial match -- `extractGroups(long address, int length)` - capture groups -- `findAllMatches(long address, int length)` - find all matches - -**ByteBuffer API** (standard Java, automatic routing): -- `matches(ByteBuffer)` - auto-routes: direct→zero-copy, heap→String -- `find(ByteBuffer)` - auto-routes -- `extractGroups(ByteBuffer)` - auto-routes -- `findAllMatches(ByteBuffer)` - auto-routes - -**Technical Details:** -- Uses reflection to extract address from DirectByteBuffer (no compile-time dependency on sun.nio.ch) -- Falls back gracefully to String 
API if reflection fails -- Respects ByteBuffer position/limit without modifying them -- UTF-8 encoding for heap ByteBuffer conversion - -**Helper for Chronicle Users:** -- `RE2DirectMemory.java` (in jni package) - convenience wrapper accepting Bytes objects directly - -**Tests:** -- `OffHeapMatchingTest.java` - 17 tests with Chronicle Bytes (address/length API) -- `ByteBufferApiTest.java` - 23 tests with ByteBuffer (auto-routing) -- All tests verify off-heap results match String API results - -### Test Results - -- **ByteBufferApiTest:** 23/23 tests passed ✅ -- **OffHeapMatchingTest:** 17/17 tests passed ✅ -- **DirectMemoryTest (JNI layer):** 38/38 tests passed ✅ -- **Full test suite:** 414/414 tests passed ✅ -- **No regressions** - -### Usage Examples - -**Option 1: ByteBuffer (Standard Java, Auto-Routing)** -```java -Pattern pattern = Pattern.compile("\\d+"); - -// DirectByteBuffer - automatically uses zero-copy (46-99% faster) -ByteBuffer directBuffer = ByteBuffer.allocateDirect(1024); -directBuffer.put("12345".getBytes(StandardCharsets.UTF_8)); -directBuffer.flip(); -boolean r1 = pattern.matches(directBuffer); // Zero-copy! - -// Heap ByteBuffer - automatically falls back to String -ByteBuffer heapBuffer = ByteBuffer.wrap("67890".getBytes()); -boolean r2 = pattern.matches(heapBuffer); // String conversion - -// Mix them naturally -boolean r3 = pattern.matches("abc"); // String API -``` - -**Option 2: Chronicle Bytes (Raw Address API)** -```java -Pattern pattern = Pattern.compile("\\d+"); - -// Extract address/length from Chronicle Bytes -Bytes bytes = Bytes.allocateElasticDirect(); -try { - bytes.write("67890".getBytes(StandardCharsets.UTF_8)); - long address = bytes.addressForRead(0); - int length = (int) bytes.readRemaining(); - boolean matches = pattern.matches(address, length); // 46-99% faster! 
-} finally { - bytes.releaseLast(); -} -``` - -**Option 3: Bulk Matching (Chronicle Bytes)** -```java -Pattern pattern = Pattern.compile("valid_.*"); -Bytes[] bytesArray = ...; // Multiple off-heap buffers - -// Extract addresses/lengths -long[] addresses = new long[bytesArray.length]; -int[] lengths = new int[bytesArray.length]; -for (int i = 0; i < bytesArray.length; i++) { - addresses[i] = bytesArray[i].addressForRead(0); - lengths[i] = (int) bytesArray[i].readRemaining(); -} - -boolean[] results = pattern.matchAll(addresses, lengths); // 91.5% faster! -``` - -**Option 4: Mixed Usage (Real-World)** -```java -Pattern emailPattern = Pattern.compile("[a-z]+@[a-z]+\\.[a-z]+"); - -// Process different data sources with same pattern -boolean r1 = emailPattern.matches("user@example.com"); // String - -ByteBuffer networkBuffer = getNetworkBuffer(); // DirectByteBuffer from Netty -boolean r2 = emailPattern.find(networkBuffer); // Zero-copy - -Bytes chronicleBytes = getFromCache(); // Chronicle Bytes -long addr = chronicleBytes.addressForRead(0); -int len = (int) chronicleBytes.readRemaining(); -boolean r3 = emailPattern.matches(addr, len); // Zero-copy - -// All work with same Pattern instance! -``` - -### Architecture Benefits - -✅ **No Chronicle types in public API** - accepts raw `long address`, `int length`, or `ByteBuffer` -✅ **Works with ANY off-heap system** - Chronicle Bytes, DirectByteBuffer, Netty ByteBuf, etc. 
-✅ **Intelligent routing** - ByteBuffer API auto-detects direct vs heap and routes appropriately -✅ **Natural mixed usage** - String and off-heap in same app, same Pattern -✅ **Zero breaking changes** - existing String API unchanged -✅ **Simple API** - just overloaded methods, no adapters needed -✅ **Standard Java support** - ByteBuffer is java.nio (no external deps needed) -✅ **Reflection-based** - No compile-time dependency on sun.nio.ch.DirectBuffer -✅ **Helper available** - RE2DirectMemory for Chronicle Bytes convenience (optional) - ---- - -## Next Steps (Future Phases) - -**Phase 3: Chronicle Map Cache (Optional)** -- Replace PatternCache with Chronicle Map for off-heap caching -- Further reduce GC pressure -- Optional persistence for fast restarts - -**Phase 4: NUMA Optimization (Advanced)** -- Per-NUMA-socket caches using Chronicle Thread Affinity -- Topology-aware pattern distribution -- For multi-socket servers only - ---- - -## Final Summary - -### What Was Delivered - -**Complete zero-copy regex matching for libre2-java** with exceptional performance and flexible API. - -**Public API (Pattern.java) - 3 usage modes:** - -1. **String API** (existing, unchanged) - ```java - pattern.matches("text") - ``` - -2. **ByteBuffer API** (standard Java, intelligent routing) - ```java - pattern.matches(byteBuffer) // Auto-detects direct vs heap - ``` - -3. 
**Raw Address API** (advanced, any off-heap system) - ```java - pattern.matches(address, length) // Maximum control - ``` - -**Performance:** -- **Small (64-256B):** 46-74% faster -- **Medium (1-4KB):** 90-98% faster -- **Large (10-100KB):** 99%+ faster -- **Bulk (100x1KB):** 91.5% faster - -**Architecture:** -- ✅ No Chronicle types in public API -- ✅ Works with ANY off-heap system -- ✅ Natural mixed usage -- ✅ Zero breaking changes -- ✅ 414 tests passing - -**Code Stats:** -- **New files:** 6 (~850 lines) -- **Modified files:** 11 (~850 lines) -- **Total:** ~1,700 lines of production code + tests -- **Native libs:** Rebuilt for 4 platforms (+27KB total) - -**Token usage:** 310k / 1M (31%) - -**Branch:** `feature/chronicle-zero-copy` -**Status:** ✅ READY FOR PR - ---- diff --git a/libre2-core/pom.xml b/libre2-core/pom.xml index a369b7b..67d25e2 100644 --- a/libre2-core/pom.xml +++ b/libre2-core/pom.xml @@ -39,6 +39,14 @@ true + + + io.dropwizard.metrics + metrics-jmx + provided + true + + org.junit.jupiter @@ -90,6 +98,14 @@ org.apache.maven.plugins maven-jar-plugin + + + + + test-jar + + + diff --git a/libre2-dropwizard/src/main/java/com/axonops/libre2/dropwizard/RE2MetricsConfig.java b/libre2-core/src/main/java/com/axonops/libre2/dropwizard/RE2MetricsConfig.java similarity index 100% rename from libre2-dropwizard/src/main/java/com/axonops/libre2/dropwizard/RE2MetricsConfig.java rename to libre2-core/src/main/java/com/axonops/libre2/dropwizard/RE2MetricsConfig.java diff --git a/libre2-dropwizard/src/test/java/com/axonops/libre2/dropwizard/JmxIntegrationTest.java b/libre2-core/src/test/java/com/axonops/libre2/dropwizard/JmxIntegrationTest.java similarity index 100% rename from libre2-dropwizard/src/test/java/com/axonops/libre2/dropwizard/JmxIntegrationTest.java rename to libre2-core/src/test/java/com/axonops/libre2/dropwizard/JmxIntegrationTest.java diff --git a/libre2-dropwizard/src/test/java/com/axonops/libre2/dropwizard/MetricsEndToEndTest.java 
b/libre2-core/src/test/java/com/axonops/libre2/dropwizard/MetricsEndToEndTest.java similarity index 100% rename from libre2-dropwizard/src/test/java/com/axonops/libre2/dropwizard/MetricsEndToEndTest.java rename to libre2-core/src/test/java/com/axonops/libre2/dropwizard/MetricsEndToEndTest.java diff --git a/libre2-dropwizard/src/test/java/com/axonops/libre2/dropwizard/RE2MetricsConfigTest.java b/libre2-core/src/test/java/com/axonops/libre2/dropwizard/RE2MetricsConfigTest.java similarity index 100% rename from libre2-dropwizard/src/test/java/com/axonops/libre2/dropwizard/RE2MetricsConfigTest.java rename to libre2-core/src/test/java/com/axonops/libre2/dropwizard/RE2MetricsConfigTest.java diff --git a/libre2-dropwizard/README.md b/libre2-dropwizard/README.md deleted file mode 100644 index 3342fe1..0000000 --- a/libre2-dropwizard/README.md +++ /dev/null @@ -1,273 +0,0 @@ -# libre2-dropwizard - -**Dropwizard Metrics integration with automatic JMX** - ---- - -## Overview - -The `libre2-dropwizard` module provides convenient integration with Dropwizard Metrics, including automatic JMX exposure. Use this module if your application already uses Dropwizard Metrics (Cassandra, Spring Boot, Dropwizard apps, etc.). 
- ---- - -## Installation - -```xml - - com.axonops - libre2-dropwizard - 0.9.1 - - -``` - ---- - -## Quick Start - -### Basic Usage - -```java -import com.axonops.libre2.dropwizard.RE2MetricsConfig; -import com.axonops.libre2.api.Pattern; -import com.axonops.libre2.cache.PatternCache; -import com.codahale.metrics.MetricRegistry; - -// Get your application's MetricRegistry -MetricRegistry registry = getYourMetricRegistry(); - -// Create RE2 config with metrics (choose your prefix) -RE2Config config = RE2MetricsConfig.withMetrics(registry, "com.myapp.regex"); - -// Set as global cache -Pattern.setGlobalCache(new PatternCache(config)); - -// Use normally -Pattern pattern = Pattern.compile("test.*"); -Matcher matcher = pattern.matcher("test123"); -boolean matches = matcher.matches(); - -// All 21 metrics now in your MetricRegistry + JMX automatically -``` - ---- - -## Integration Examples - -### Cassandra Integration - -```java -import com.axonops.libre2.dropwizard.RE2MetricsConfig; - -// Get Cassandra's MetricRegistry -MetricRegistry cassandraRegistry = getCassandraMetricRegistry(); - -// Use convenience method (sets standard Cassandra prefix) -RE2Config config = RE2MetricsConfig.forCassandra(cassandraRegistry); -Pattern.setGlobalCache(new PatternCache(config)); - -// Metrics appear under: org.apache.cassandra.metrics.RE2.* -// Visible via: nodetool, JConsole, Prometheus JMX exporter -``` - -### Spring Boot Integration - -```java -import com.axonops.libre2.dropwizard.RE2MetricsConfig; -import org.springframework.beans.factory.annotation.Autowired; - -@Autowired -private MeterRegistry meterRegistry; // Spring's registry - -// Convert Spring's MeterRegistry to Dropwizard (if needed) -// Or if Spring Boot uses Dropwizard metrics directly: -MetricRegistry dropwizardRegistry = getDropwizardRegistry(); - -RE2Config config = RE2MetricsConfig.withMetrics(dropwizardRegistry, "com.mycompany.myapp.regex"); -Pattern.setGlobalCache(new PatternCache(config)); -``` - -### 
Standalone Application - -```java -import com.axonops.libre2.dropwizard.RE2MetricsConfig; -import com.codahale.metrics.MetricRegistry; -import com.codahale.metrics.jmx.JmxReporter; - -// Create your own registry -MetricRegistry registry = new MetricRegistry(); - -// Configure RE2 -RE2Config config = RE2MetricsConfig.withMetrics(registry, "com.mycompany.appname.re2"); -Pattern.setGlobalCache(new PatternCache(config)); - -// Metrics automatically exposed via JMX (RE2MetricsConfig handles this) -// Or manually configure JmxReporter: -JmxReporter jmxReporter = JmxReporter.forRegistry(registry).build(); -jmxReporter.start(); -``` - ---- - -## Metric Prefix Configuration - -The metric prefix determines where your metrics appear in JMX and monitoring systems: - -| Framework | Recommended Prefix | JMX ObjectName Example | -|-----------|-------------------|------------------------| -| Cassandra | `org.apache.cassandra.metrics.RE2` | `org.apache.cassandra.metrics:type=RE2,name=patterns.compiled` | -| Spring Boot | `com.mycompany.myapp.regex` | `com.mycompany.myapp.regex:type=patterns,name=compiled` | -| Standalone | `com.axonops.libre2` | `com.axonops.libre2:type=patterns,name=compiled` | -| Custom | Whatever you want | Depends on your prefix | - ---- - -## The 21 Metrics - -All metrics are automatically registered in your MetricRegistry: - -### Pattern Compilation (5 metrics) -- `patterns.compiled` (Counter) - Total patterns compiled -- `patterns.cache_hits` (Counter) - Cache hit count -- `patterns.cache_misses` (Counter) - Cache miss count -- `patterns.compilation_time` (Timer) - Compilation latency -- `patterns.invalid_recompiled` (Counter) - Auto-recompiled patterns - -### Cache Eviction (6 metrics) -- `cache.evictions_lru` (Counter) - LRU evictions -- `cache.evictions_idle` (Counter) - Idle evictions -- `cache.evictions_deferred` (Counter) - Deferred evictions -- `cache.size` (Gauge) - Current cache size -- `cache.native_memory_bytes` (Gauge) - Off-heap memory used -- 
`cache.native_memory_peak_bytes` (Gauge) - Peak memory - -### Resource Management (4 metrics) -- `resources.patterns_active` (Gauge) - Active patterns -- `resources.matchers_active` (Gauge) - Active matchers -- `resources.patterns_freed` (Counter) - Patterns freed -- `resources.matchers_freed` (Counter) - Matchers freed - -### Performance (3 metrics) -- `matching.full_match` (Timer) - Full match latency -- `matching.partial_match` (Timer) - Partial match latency -- `matching.operations` (Counter) - Total matches - -### Errors (3 metrics) -- `errors.compilation_failed` (Counter) - Compilation failures -- `errors.native_library` (Counter) - Native library errors -- `errors.resource_exhausted` (Counter) - Resource limits hit - ---- - -## JMX Monitoring - -Once configured, metrics are accessible via JMX: - -### Using JConsole - -1. Connect to your application's JMX port -2. Navigate to MBeans tab -3. Find your metric prefix (e.g., `com.myapp.regex` or `org.apache.cassandra.metrics.RE2`) -4. Expand to see all 21 metrics - -### Using Command Line (jmxterm) - -```bash -# Install jmxterm -brew install jmxterm - -# Connect and query -java -jar jmxterm.jar -l localhost:7199 -> domains -> domain com.axonops.libre2 -> beans -> get -b com.axonops.libre2:type=patterns,name=compiled -``` - -### Cassandra-Specific Monitoring - -```bash -# Using nodetool -nodetool sjk mx -q 'org.apache.cassandra.metrics:type=RE2,*' - -# View all RE2 metrics -nodetool sjk mx -b org.apache.cassandra.metrics:type=RE2,name=patterns.compiled - -# Watch cache size -watch -n 5 "nodetool sjk mx -b org.apache.cassandra.metrics:type=RE2,name=cache.size" -``` - ---- - -## Logging Configuration - -All logs are produced via SLF4J. See [LOGGING_GUIDE.md](../LOGGING_GUIDE.md) for comprehensive documentation. - -### Quick Configuration - -**Logback (Cassandra, Spring Boot):** -```xml - - - -``` - -**Log4j2:** -```xml - - - - - -``` - -All logs are prefixed with "RE2:" for easy filtering. 
- ---- - -## API - -### RE2MetricsConfig - -**Main factory class for creating RE2Config with metrics:** - -```java -// Generic (specify your prefix): -RE2Config withMetrics(MetricRegistry registry, String metricPrefix) -RE2Config withMetrics(MetricRegistry registry, String metricPrefix, boolean enableJmx) - -// Default prefix (com.axonops.libre2): -RE2Config withMetrics(MetricRegistry registry) - -// Cassandra convenience (uses org.apache.cassandra.metrics.RE2): -RE2Config forCassandra(MetricRegistry cassandraRegistry) -``` - ---- - -## Examples - -See [examples/](../examples/) directory for complete examples: -- Standalone application -- Cassandra integration -- Spring Boot integration -- Custom monitoring setup - ---- - -## Requirements - -- Java 17+ -- Dropwizard Metrics 4.2.x (provided by your application) -- libre2-core 0.9.1 - ---- - -## Performance - -**Metrics overhead:** < 1% -- Counter increment: ~10-20ns -- Timer recording: ~50-100ns -- Gauge registration: one-time cost - -**Zero overhead option:** Don't use this module, just use `libre2-core` with default NoOp metrics. diff --git a/libre2-dropwizard/pom.xml b/perf-test/pom.xml similarity index 66% rename from libre2-dropwizard/pom.xml rename to perf-test/pom.xml index 250c842..e5f9c4a 100644 --- a/libre2-dropwizard/pom.xml +++ b/perf-test/pom.xml @@ -10,26 +10,28 @@ 1.0.0 - libre2-dropwizard + perf-test jar - libre2-dropwizard - Dropwizard Metrics integration with automatic JMX - usable with any framework (Cassandra, Spring Boot, etc.) 
- - + perf-test + Performance and stress tests for libre2-java - + com.axonops libre2-core ${project.version} + test - + - io.dropwizard.metrics - metrics-jmx + com.axonops + libre2-core + ${project.version} + test-jar + test @@ -62,6 +64,14 @@ org.apache.maven.plugins maven-surefire-plugin + + + 600 + + + --add-exports=java.base/sun.nio.ch=ALL-UNNAMED + + diff --git a/libre2-core/src/test/java/com/axonops/libre2/api/BulkMatchingPerformanceTest.java b/perf-test/src/test/java/com/axonops/libre2/performance/BulkMatchingPerformanceTest.java similarity index 98% rename from libre2-core/src/test/java/com/axonops/libre2/api/BulkMatchingPerformanceTest.java rename to perf-test/src/test/java/com/axonops/libre2/performance/BulkMatchingPerformanceTest.java index 9515671..e5ab8c7 100644 --- a/libre2-core/src/test/java/com/axonops/libre2/api/BulkMatchingPerformanceTest.java +++ b/perf-test/src/test/java/com/axonops/libre2/performance/BulkMatchingPerformanceTest.java @@ -13,8 +13,9 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package com.axonops.libre2.api; +package com.axonops.libre2.performance; +import com.axonops.libre2.api.Pattern; import com.axonops.libre2.cache.PatternCache; import com.axonops.libre2.test.TestUtils; import org.junit.jupiter.api.AfterAll; diff --git a/libre2-core/src/test/java/com/axonops/libre2/cache/CachePerformanceTest.java b/perf-test/src/test/java/com/axonops/libre2/performance/CachePerformanceTest.java similarity index 98% rename from libre2-core/src/test/java/com/axonops/libre2/cache/CachePerformanceTest.java rename to perf-test/src/test/java/com/axonops/libre2/performance/CachePerformanceTest.java index e3f88c5..1d76522 100644 --- a/libre2-core/src/test/java/com/axonops/libre2/cache/CachePerformanceTest.java +++ b/perf-test/src/test/java/com/axonops/libre2/performance/CachePerformanceTest.java @@ -1,7 +1,9 @@ -package com.axonops.libre2.cache; +package com.axonops.libre2.performance; import com.axonops.libre2.api.Matcher; import com.axonops.libre2.api.Pattern; +import com.axonops.libre2.cache.CacheStatistics; +import com.axonops.libre2.cache.RE2Config; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; diff --git a/libre2-core/src/test/java/com/axonops/libre2/cache/ConcurrencyTest.java b/perf-test/src/test/java/com/axonops/libre2/stress/ConcurrencyTest.java similarity index 99% rename from libre2-core/src/test/java/com/axonops/libre2/cache/ConcurrencyTest.java rename to perf-test/src/test/java/com/axonops/libre2/stress/ConcurrencyTest.java index b09eb26..78395cc 100644 --- a/libre2-core/src/test/java/com/axonops/libre2/cache/ConcurrencyTest.java +++ b/perf-test/src/test/java/com/axonops/libre2/stress/ConcurrencyTest.java @@ -1,7 +1,8 @@ -package com.axonops.libre2.cache; +package com.axonops.libre2.stress; import com.axonops.libre2.api.Matcher; import com.axonops.libre2.api.Pattern; +import com.axonops.libre2.cache.CacheStatistics; import org.junit.jupiter.api.AfterEach; import 
org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; diff --git a/libre2-core/src/test/java/com/axonops/libre2/cache/ConcurrentCleanupTest.java b/perf-test/src/test/java/com/axonops/libre2/stress/ConcurrentCleanupTest.java similarity index 98% rename from libre2-core/src/test/java/com/axonops/libre2/cache/ConcurrentCleanupTest.java rename to perf-test/src/test/java/com/axonops/libre2/stress/ConcurrentCleanupTest.java index fe04f7d..9777a8a 100644 --- a/libre2-core/src/test/java/com/axonops/libre2/cache/ConcurrentCleanupTest.java +++ b/perf-test/src/test/java/com/axonops/libre2/stress/ConcurrentCleanupTest.java @@ -1,6 +1,7 @@ -package com.axonops.libre2.cache; +package com.axonops.libre2.stress; import com.axonops.libre2.api.Pattern; +import com.axonops.libre2.cache.CacheStatistics; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Timeout; diff --git a/libre2-core/src/test/java/com/axonops/libre2/cache/EvictionWhileInUseTest.java b/perf-test/src/test/java/com/axonops/libre2/stress/EvictionWhileInUseTest.java similarity index 99% rename from libre2-core/src/test/java/com/axonops/libre2/cache/EvictionWhileInUseTest.java rename to perf-test/src/test/java/com/axonops/libre2/stress/EvictionWhileInUseTest.java index 54a8a30..c14241b 100644 --- a/libre2-core/src/test/java/com/axonops/libre2/cache/EvictionWhileInUseTest.java +++ b/perf-test/src/test/java/com/axonops/libre2/stress/EvictionWhileInUseTest.java @@ -1,4 +1,4 @@ -package com.axonops.libre2.cache; +package com.axonops.libre2.stress; import com.axonops.libre2.api.Matcher; import com.axonops.libre2.api.Pattern; diff --git a/libre2-core/src/test/java/com/axonops/libre2/cache/StressTest.java b/perf-test/src/test/java/com/axonops/libre2/stress/StressTest.java similarity index 98% rename from libre2-core/src/test/java/com/axonops/libre2/cache/StressTest.java rename to perf-test/src/test/java/com/axonops/libre2/stress/StressTest.java index 545aaf3..8596a14 100644 --- 
a/libre2-core/src/test/java/com/axonops/libre2/cache/StressTest.java +++ b/perf-test/src/test/java/com/axonops/libre2/stress/StressTest.java @@ -1,7 +1,8 @@ -package com.axonops.libre2.cache; +package com.axonops.libre2.stress; import com.axonops.libre2.api.Matcher; import com.axonops.libre2.api.Pattern; +import com.axonops.libre2.cache.CacheStatistics; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; diff --git a/pom.xml b/pom.xml index 036d492..ce3df30 100644 --- a/pom.xml +++ b/pom.xml @@ -15,7 +15,7 @@ libre2-core - libre2-dropwizard + perf-test