Skip to content

Commit 568b0ef

Browse files
committed
documentation and tests
1 parent 70d501a commit 568b0ef

File tree

2 files changed

+189
-6
lines changed

2 files changed

+189
-6
lines changed

Lib/difflib.py

Lines changed: 62 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2325,6 +2325,8 @@ class _LCSUBAutomaton:
23252325
23262326
Examples:
23272327
>>> aut = _LCSUBAutomaton('abc')
2328+
>>> aut
2329+
<_LCSUBAutomaton object; seq2_size=3>
23282330
>>> aut.build()
23292331
>>> aut.print_states()
23302332
0 (0, 0, {'a': 1, 'b': 2, 'c': 3}, -3, 0)
@@ -2349,6 +2351,12 @@ def __init__(self, seq2, *, junk=()):
23492351
self.nodes = None
23502352
self.cache = (0, 0)
23512353

2354+
def __repr__(self):
2355+
kwstring = f'seq2_size={self.size2}'
2356+
if self.junk:
2357+
kwstring += f', junk_size={len(self.junk)}'
2358+
return f'<{type(self).__name__} object; {kwstring}>'
2359+
23522360
# API -----------------------------
23532361
# ---------------------------------
23542362

@@ -2748,10 +2756,34 @@ def _calc_skew(i, j, k, alo, ahi, blo, bhi):
27482756

27492757

27502758
class GestaltSequenceMatcher(SequenceMatcherBase):
2759+
"""
2760+
GestaltSequenceMatcher is a flexible class for comparing pairs
2761+
of sequences of any type, so long as the sequence elements are hashable.
2762+
2763+
It builds upon the same idea as `SequenceMatcher`, and with its defaults
2764+
its results are exactly the same as those of `SequenceMatcher` with the
2765+
`autojunk` parameter set to False.
2766+
2767+
However, while `SequenceMatcher` is able to obtain the same result,
2768+
it is not practical to use with `autojunk` set to False on long
2769+
sequences due to its quadratic worst-case complexity.
2770+
2771+
`GestaltSequenceMatcher`, on the other hand, implements a suffix automaton,
2772+
which has guaranteed O(n) complexity, making it possible to use exact
2773+
calculation on long sequences.
2774+
2775+
Furthermore, `GestaltSequenceMatcher` has a `balancing` parameter.
2776+
By default it is turned off, but it can be turned on to a desired level to
2777+
reduce the chance of greedily committing to unbalanced matches.
2778+
It does so by sometimes selecting shorter matches by looking 1 step ahead.
2779+
It produces more concise diffs with more lines matched, while retaining
2780+
the block-oriented nature.
2781+
"""
2782+
27512783
def __init__(self, isjunk=None, a='', b='', balancing=0):
27522784
"""
27532785
Args:
2754-
balancing : float
2786+
balancing : float in [0, 1]
27552787
a ratio that specifies the proportion of `skew` for which
27562788
balancing action will be attempted.
27572789
if 0, no balancing actions will occur
@@ -2767,9 +2799,16 @@ def __init__(self, isjunk=None, a='', b='', balancing=0):
27672799
In terms of results, the following 2 are equivalent:
27682800
a) SequenceMatcher(isjunk=None, autojunk=False)
27692801
b) GestaltSequenceMatcher(isjunk=None, balancing=0)
2770-
When isjunk is not None, there is a chance of small differences
2771-
due to the fact that SequenceMatcher expands junk for
2772-
no match, while GestaltSequenceMatcher does not
2802+
2803+
Examples:
2804+
>>> seq1 = 'aaaa_aaaa_bbbbb'
2805+
>>> seq2 = 'bbbbb-aaaa-aaaa'
2806+
>>> m1 = GestaltSequenceMatcher(None, seq1, seq2)
2807+
>>> m2 = GestaltSequenceMatcher(None, seq1, seq2, balancing=2/3)
2808+
>>> list(map(tuple, m1.get_matching_blocks()))
2809+
[(10, 0, 5), (15, 15, 0)]
2810+
>>> list(map(tuple, m2.get_matching_blocks()))
2811+
[(0, 6, 4), (5, 11, 4), (15, 15, 0)]
27732812
"""
27742813
balancing = float(balancing)
27752814
if not 0 <= balancing <= 1:
@@ -2786,6 +2825,15 @@ def _prepare_b(self):
27862825
self.automaton = _LCSUBAutomaton(b, junk=bjunk)
27872826

27882827
def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None, *, quick_only=False):
2828+
"""Find longest matching block in a[alo:ahi] and b[blo:bhi].
2829+
By default it will find the longest match in the entirety of a and b.
2830+
2831+
Look up docstring of SequenceMatcher.find_longest_match
2832+
for more information.
2833+
2834+
The only difference is the `quick_only` argument, which, if set to True,
2835+
may make the call return None if a result is not possible with the current build.
2836+
"""
27892837
a, b, bjunk = self.a, self.b, self.bjunk
27902838
automaton = self.automaton
27912839
func = automaton._try_find if quick_only else automaton.find
@@ -2794,14 +2842,22 @@ def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None, *, quick_only=Fal
27942842
# For quick_only=True it might not return anything
27952843
return
27962844

2797-
if block[2] and bjunk:
2845+
if bjunk:
27982846
# Extend match to surrounding junk
2847+
# [2026-02-07@dgpb]: Note, expanding will happen even when there is no match
27992848
block = _expand_block_to_junk(
28002849
bjunk, block, a, b, alo, ahi, blo, bhi, inverse=False)
28012850

28022851
return Match._make(block)
28032852

28042853
def batch_find_longest_match(self, bounds_list):
2854+
"""Performance method for many `find_longest_match` calls.
2855+
It calls `find` in an order that aims to minimize the number of builds needed.
2856+
Also, it does not evaluate the same range twice.
2857+
Args:
2858+
bounds_list : list[tuple[int, int, int, int]]
2859+
list of tuples: (alo, ahi, blo, bhi)
2860+
"""
28052861
a, b, bjunk = self.a, self.b, self.bjunk
28062862
bounds_list = list(bounds_list)
28072863
block_list = self.automaton.batchfind(a, bounds_list)
@@ -2981,7 +3037,7 @@ def _get_matching_blocks(self):
29813037
for triple in triples:
29823038
triple[0] = job_results[triple[0]]
29833039
triple[2] = job_results[triple[2]]
2984-
# k**1.3 is empirically tuned
3040+
# NOTE: k**1.3 is empirically tuned
29853041
# prefers one long match to many small ones
29863042
# but not too aggressively, so to be able to jump
29873043
# out of skewed positions

Lib/test/test_difflib.py

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -640,6 +640,133 @@ def test_invalid_input(self):
640640
''.join(difflib.restore([], 3))
641641

642642

643+
class TestLCSUBAutomaton(unittest.TestCase):
644+
def test_find(self):
645+
cases = [
646+
('abd', 'abcabd', (0, 3, 3)),
647+
('dab', 'abcabd', (1, 0, 2)),
648+
]
649+
collect = []
650+
for seq1, seq2, expect in cases:
651+
result = difflib._LCSUBAutomaton(seq2).find(seq1)
652+
self.assertEqual(result, expect)
653+
collect.append(result)
654+
655+
def test_find_with_junk(self):
656+
cases = [
657+
('ab_abd', 'abcabd', (3, 3, 3)),
658+
('abd_', 'ab_abd_', (0, 3, 3)),
659+
('abcbd', 'abc_bd', (0, 0, 3)),
660+
('cbd', 'abc_bd', (1, 4, 2)),
661+
]
662+
for seq1, seq2, expect in cases:
663+
result = difflib._LCSUBAutomaton(seq2, junk=('_')).find(seq1)
664+
self.assertEqual(result, expect)
665+
666+
def test_findall(self):
667+
seq1 = 'defabc'
668+
aut = difflib._LCSUBAutomaton('abcdef')
669+
result = [seq1[i:i+k] for i, j, k in aut.findall(seq1)]
670+
self.assertEqual(result, ['d', 'de', 'def', 'a', 'ab', 'abc'])
671+
result = [seq1[i:i+k] for i, j, k in aut.findall(seq1, maximal=True)]
672+
self.assertEqual(result, ['def', 'abc'])
673+
result = [seq1[i:i+k] for i, j, k in aut.findall(seq1, mink=2)]
674+
self.assertEqual(result, ['de', 'def', 'ab', 'abc'])
675+
result = [seq1[i:i+k] for i, j, k in aut.findall(seq1, maxk=2)]
676+
self.assertEqual(result, ['d', 'de', 'a', 'ab'])
677+
678+
def test_batchfind(self):
679+
seq1 = 'fgfedabacba'
680+
seq2 = seq1[::-1]
681+
n = len(seq1)
682+
683+
intervals = []
684+
for i in range(n - 1):
685+
for j in range(i + 1, min(i + 5, n)):
686+
intervals.append((i, j))
687+
bounds_list = []
688+
for alo, ahi in intervals:
689+
for blo, bhi in intervals:
690+
bounds_list.append((alo, ahi, blo, bhi))
691+
692+
aut = difflib._LCSUBAutomaton(seq2)
693+
results1 = [aut.find(seq1, *bounds) for bounds in bounds_list]
694+
results2 = aut.batchfind(seq1, bounds_list)
695+
self.assertEqual(results1, results2)
696+
697+
698+
seq1_skew = """
699+
def foo1(a, b):
700+
a += 1
701+
b += 1
702+
return a + b
703+
704+
def foo2(a, b):
705+
a += 2
706+
b += 2
707+
return a + b
708+
709+
def foo3(a, b):
710+
c = a + b
711+
d = c + a * b
712+
r = sum(range(d))
713+
return r
714+
"""
715+
716+
717+
seq2_skew = """
718+
def foo3(a, b):
719+
c = a + b
720+
d = c + a * b
721+
r = sum(range(d))
722+
return r
723+
#
724+
def foo1(a, b):
725+
a += 1
726+
b += 1
727+
return a + b
728+
#
729+
def foo2(a, b):
730+
a += 2
731+
b += 2
732+
return a + b
733+
"""
734+
735+
736+
class TestGestaltSequenceMatcher(unittest.TestCase):
737+
def test_cross_test_with_autojunk_false(self):
738+
cases = [
739+
("ABCDEFGHIJKLMNOP" * 50, "ACEGIKMOQBDFHJLNP" * 50),
740+
(
741+
"".join(chr(ord('a') + i % 10) * (i + 1) for i in range(30)),
742+
"".join(chr(ord('a') + i % 10) * (30 - i) for i in range(30))
743+
),
744+
(
745+
"A" + "X"*99 + "BCDEFGHIJKLMNOPQRSTUVWXYZ"*2,
746+
"BCDEFGHIJKLMNOPQRSTUVWXYZ"*2 + "A" + "X"*99
747+
)
748+
]
749+
for seq1, seq2 in cases:
750+
for isjunk in [None, lambda x: x in 'aeAE']:
751+
sm1 = difflib.SequenceMatcher(isjunk, seq1, seq2, autojunk=False)
752+
sm2 = difflib.GestaltSequenceMatcher(isjunk, seq1, seq2)
753+
self.assertEqual(sm1.bjunk, sm2.bjunk)
754+
blocks1 = sm1.get_matching_blocks()
755+
blocks2 = sm2.get_matching_blocks()
756+
self.assertEqual(blocks1, blocks2)
757+
self.assertAlmostEqual(sm1.ratio(), sm2.ratio(), places=3)
758+
759+
def test_balancing(self):
760+
seq1 = seq1_skew.strip().splitlines()
761+
seq2 = seq2_skew.strip().splitlines()
762+
sm1 = difflib.GestaltSequenceMatcher(None, seq1, seq2)
763+
sm2 = difflib.GestaltSequenceMatcher(None, seq1, seq2, balancing=2/3)
764+
blocks1 = list(map(tuple, sm1.get_matching_blocks()))
765+
blocks2 = list(map(tuple, sm2.get_matching_blocks()))
766+
self.assertEqual(blocks1, [(10, 0, 5), (15, 15, 0)])
767+
self.assertEqual(blocks2, [(0, 6, 4), (5, 11, 4), (15, 15, 0)])
768+
769+
643770
def setUpModule():
644771
difflib.HtmlDiff._default_prefix = 0
645772

0 commit comments

Comments
 (0)