@@ -2318,7 +2318,7 @@ class _LCSUBAutomaton:
23182318 eposs: list[int], # index of last match position
23192319 ]
23202320
2321- Next logic:
2321+ Next logic (memory optimization, as > 50% of states have only 1 transition):
23222322 next2 == -1 -> empty
23232323 next2 == -3 -> next1: dict
23242324 next2 >= 0 -> next2 - index, next1 - key
@@ -2374,7 +2374,7 @@ def build(self, start2=0, stop2=None):
23742374
23752375 def findall (self , seq1 , start1 = 0 , stop1 = None , start2 = 0 , stop2 = None , * ,
23762376 mink = 1 , maxk = None , maximal = False ):
2377- """
2377+ """Find all common substrings in a single O(n) scan
23782378 Args:
23792379 mink : int
23802380 filter out shorter length matches
@@ -2528,12 +2528,13 @@ def _make_nodes(self, n):
25282528 return lengths , links , next1s , next2s , eposs
25292529
25302530 def _build (self , start2 , stop2 ):
2531+ """Automaton builder"""
25312532 seq2 = self .seq2
25322533 junk = self .junk
25332534 # Make Nodes
25342535 size = (stop2 - start2 )
2535- n_nodes = 4 * size // 3 + 1
2536- inc = size // 10 + 1
2536+ n_nodes = 4 * size // 3 + 1 # Maximum 25% overallocation
2537+ inc = size // 10 + 1 # Then, 10% increments
25372538 nodes = self ._make_nodes (n_nodes )
25382539 lengths , links , next1s , next2s , eposs = nodes
25392540 nstates = 1
@@ -2645,12 +2646,18 @@ def _build(self, start2, stop2):
26452646 return nodes
26462647
26472648 def _finditer (self , seq1 , start1 , stop1 , best = False ):
2648- """
2649+ """Core scanning routine
26492650 Args:
26502651 best : bool
26512652 False - return all matches, including non-maximal
26522653 True - return all matches of maximum length
26532654 all these will naturally be maximal
2655+ Returns:
2656+ generator of tuples (e1, e2, k), where
2657+ e1, e2 are ending positions in seq1 and seq2 respectively
2658+ k is length of a match
2659+ Thus, starting position is: e1 + 1 - k
2660+ And stop for a slice is: e1 + 1
26542661 """
26552662 if best not in (0 , 1 ):
26562663 raise ValueError (f'{ best = } not in (0, 1)' )
@@ -2731,7 +2738,9 @@ def __repr__(self):
27312738
27322739
27332740def _calc_skew (i , j , k , alo , ahi , blo , bhi ):
2734- # NOTE: -1 <= skew <= 1
2741+ """Difference in normalized positions
2742+ Returns skew : float, where -1 < skew < 1
2743+ """
27352744 k_div_2 = k // 2
27362745 apos = (i + k_div_2 - alo ) / (ahi - alo )
27372746 bpos = (j + k_div_2 - blo ) / (bhi - blo )
@@ -2836,6 +2845,9 @@ def _modifier(self, depth, block, alo, ahi, blo, bhi):
28362845 RESULTBLOCKS - List of blocks that terminate recursion
28372846 e.g. (RESULTBLOCKS, [(0, 0, 10), (10, 10, 10)])
28382847
2848+ If data contains no blocks or only blocks of 0 length,
2849+ the algorithm does not recurse further.
2850+
28392851 Note, one can get `a`, `b`, `automaton`, etc from self
28402852 """
28412853 pass
@@ -2908,10 +2920,11 @@ def _get_matching_blocks(self):
29082920 i0 = i + k
29092921 j0 = j + k
29102922
2923+ if not validated :
2924+ continue
2925+
29112926 # 2.1.3. Apply action
29122927 if mtype is REPLACEBLOCK :
2913- if not validated :
2914- continue
29152928 i , j , k = init_block = validated [0 ]
29162929
29172930 elif mtype is ANCHORBLOCKS :
@@ -2968,6 +2981,10 @@ def _get_matching_blocks(self):
29682981 for triple in triples :
29692982 triple [0 ] = job_results [triple [0 ]]
29702983 triple [2 ] = job_results [triple [2 ]]
2984+ # The exponent in k**1.3 is empirically tuned:
2985+ # it prefers one long match to many small ones,
2986+ # but not too aggressively, so that it can still jump
2987+ # out of skewed positions
29712988 total = sum (t [2 ]** 1.3 for t in triple )
29722989 skew = _calc_skew (* triple [1 ], * bounds )
29732990 triple .append ((total , - abs (skew )))
@@ -2988,6 +3005,9 @@ def _get_matching_blocks(self):
29883005 q_tail .append ((_RANGE , (i0 , ii , j0 , jj )))
29893006 q_tail .append ((_BLOCK , block ))
29903007 i0 , j0 = ii + kk , jj + kk
3008+ if not q_tail :
3009+ # No blocks identified. Do not recurse further.
3010+ continue
29913011 q_tail .append ((_RANGE , (i0 , ahi , j0 , bhi )))
29923012
29933013 # 3.2. Yield what is possible straight away
0 commit comments