@@ -2325,6 +2325,8 @@ class _LCSUBAutomaton:
23252325
23262326 Examples:
23272327 >>> aut = _LCSUBAutomaton('abc')
2328+ >>> aut
2329+ <_LCSUBAutomaton object; seq2_size=3>
23282330 >>> aut.build()
23292331 >>> aut.print_states()
23302332 0 (0, 0, {'a': 1, 'b': 2, 'c': 3}, -3, 0)
@@ -2349,6 +2351,12 @@ def __init__(self, seq2, *, junk=()):
23492351 self .nodes = None
23502352 self .cache = (0 , 0 )
23512353
2354+ def __repr__ (self ):
2355+ kwstring = f'seq2_size={ self .size2 } '
2356+ if self .junk :
2357+ kwstring += f', junk_size={ len (self .junk )} '
2358+ return f'<{ type (self ).__name__ } object; { kwstring } >'
2359+
23522360 # API -----------------------------
23532361 # ---------------------------------
23542362
@@ -2748,10 +2756,34 @@ def _calc_skew(i, j, k, alo, ahi, blo, bhi):
27482756
27492757
27502758class GestaltSequenceMatcher (SequenceMatcherBase ):
2759+ """
2760+ GestaltSequenceMatcher is a flexible class for comparing pairs
2761+ of sequences of any type, so long as the sequence elements are hashable.
2762+
2763+ It builds upon the same idea as `SequenceMatcher` and with its defaults
2764+ its results are exactly the same as the ones of `SequenceMatcher` with
2765+ `autojunk` parameter set to False.
2766+
2767+ However, while `SequenceMatcher` is able to obtain the same result,
2768+ it is rarely practical to use with `autojunk` set to False on long
2769+ sequences due to its quadratic worst-case complexity.
2770+
2771+ `GestaltSequenceMatcher`, on the other hand, uses a suffix automaton,
2772+ which has guaranteed O(n) complexity, making it possible to use exact
2773+ calculation on long sequences.
2774+
2775+ Furthermore, `GestaltSequenceMatcher` has a `balancing` parameter.
2776+ By default it is turned off, but it can be set to the desired level to
2777+ reduce the chance of greedily committing to unbalanced matches.
2778+ It does so by sometimes selecting shorter matches after looking one step
2779+ ahead. This produces more concise diffs with more lines matched, while
2780+ retaining the block-oriented nature of the algorithm.
2781+ """
2782+
27512783 def __init__ (self , isjunk = None , a = '' , b = '' , balancing = 0 ):
27522784 """
27532785 Args:
2754- balancing : float
2786+ balancing : float in [0, 1]
27552787 a ratio that specifies the proportion of `skew` for which
27562788 balancing action will be attempted.
27572789 if 0, no balancing actions will occur
@@ -2767,9 +2799,16 @@ def __init__(self, isjunk=None, a='', b='', balancing=0):
27672799 In terms of results, the following 2 are equivalent:
27682800 a) SequenceMatcher(isjunk=None, autojunk=False)
27692801 b) GestaltSequenceMatcher(isjunk=None, balancing=0)
2770- When isjunk is not None, there is a chance of small differences
2771- due to the fact that SequenceMatcher expands junk for
2772- no match, while GestaltSequenceMatcher does not
2802+
2803+ Examples:
2804+ >>> seq1 = 'aaaa_aaaa_bbbbb'
2805+ >>> seq2 = 'bbbbb-aaaa-aaaa'
2806+ >>> m1 = GestaltSequenceMatcher(None, seq1, seq2)
2807+ >>> m2 = GestaltSequenceMatcher(None, seq1, seq2, balancing=2/3)
2808+ >>> list(map(tuple, m1.get_matching_blocks()))
2809+ [(10, 0, 5), (15, 15, 0)]
2810+ >>> list(map(tuple, m2.get_matching_blocks()))
2811+ [(0, 6, 4), (5, 11, 4), (15, 15, 0)]
27732812 """
27742813 balancing = float (balancing )
27752814 if not 0 <= balancing <= 1 :
@@ -2786,6 +2825,15 @@ def _prepare_b(self):
27862825 self .automaton = _LCSUBAutomaton (b , junk = bjunk )
27872826
27882827 def find_longest_match (self , alo = 0 , ahi = None , blo = 0 , bhi = None , * , quick_only = False ):
2828+ """Find longest matching block in a[alo:ahi] and b[blo:bhi].
2829+ By default it will find the longest match in the entirety of a and b.
2830+
2831+ Look up docstring of SequenceMatcher.find_longest_match
2832+ for more information.
2833+
2834+ The only difference is the `quick_only` argument: when it is True,
2835+ None may be returned if no result is possible with the current build.
2836+ """
27892837 a , b , bjunk = self .a , self .b , self .bjunk
27902838 automaton = self .automaton
27912839 func = automaton ._try_find if quick_only else automaton .find
@@ -2794,14 +2842,22 @@ def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None, *, quick_only=Fal
27942842 # For quick_only=True it might not return anything
27952843 return
27962844
2797- if block [ 2 ] and bjunk :
2845+ if bjunk :
27982846 # Extend match to surrounding junk
2847+ # [2026-02-07@dgpb]: Note: expansion happens even when no match is found
27992848 block = _expand_block_to_junk (
28002849 bjunk , block , a , b , alo , ahi , blo , bhi , inverse = False )
28012850
28022851 return Match ._make (block )
28032852
28042853 def batch_find_longest_match (self , bounds_list ):
2854+ """Performance method for many `find_longest_match` calls.
2855+ It calls `find` in an order that aims to minimize the builds needed.
2856+ Also, it does not evaluate the same range twice.
2857+ Args:
2858+ bounds_list : list[tuple[int, int, int, int]]
2859+ list of tuples: (alo, ahi, blo, bhi)
2860+ """
28052861 a , b , bjunk = self .a , self .b , self .bjunk
28062862 bounds_list = list (bounds_list )
28072863 block_list = self .automaton .batchfind (a , bounds_list )
@@ -2981,7 +3037,7 @@ def _get_matching_blocks(self):
29813037 for triple in triples :
29823038 triple [0 ] = job_results [triple [0 ]]
29833039 triple [2 ] = job_results [triple [2 ]]
2984- # k**1.3 is empirically tuned
3040+ # NOTE: k**1.3 is empirically tuned
29853041 # prefers one long match to many small ones
29863042 # but not too aggressively, so as to be able to jump
29873043 # out of skewed positions
0 commit comments