PyXR

c:\python24\lib\difflib.py



0001 #! /usr/bin/env python
0002 
0003 """
0004 Module difflib -- helpers for computing deltas between objects.
0005 
0006 Function get_close_matches(word, possibilities, n=3, cutoff=0.6):
0007     Use SequenceMatcher to return list of the best "good enough" matches.
0008 
0009 Function context_diff(a, b):
0010     For two lists of strings, return a delta in context diff format.
0011 
0012 Function ndiff(a, b):
0013     Return a delta: the difference between `a` and `b` (lists of strings).
0014 
0015 Function restore(delta, which):
0016     Return one of the two sequences that generated an ndiff delta.
0017 
0018 Function unified_diff(a, b):
0019     For two lists of strings, return a delta in unified diff format.
0020 
0021 Class SequenceMatcher:
0022     A flexible class for comparing pairs of sequences of any type.
0023 
0024 Class Differ:
0025     For producing human-readable deltas from sequences of lines of text.
0026 
0027 Class HtmlDiff:
0028     For producing HTML side by side comparison with change highlights.
0029 """
0030 
0031 __all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher',
0032            'Differ','IS_CHARACTER_JUNK', 'IS_LINE_JUNK', 'context_diff',
0033            'unified_diff', 'HtmlDiff']
0034 
0035 import heapq
0036 
0037 def _calculate_ratio(matches, length):
0038     if length:
0039         return 2.0 * matches / length
0040     return 1.0
0041 
class SequenceMatcher:

    """
    SequenceMatcher is a flexible class for comparing pairs of sequences of
    any type, so long as the sequence elements are hashable.  The basic
    algorithm predates, and is a little fancier than, an algorithm
    published in the late 1980's by Ratcliff and Obershelp under the
    hyperbolic name "gestalt pattern matching".  The basic idea is to find
    the longest contiguous matching subsequence that contains no "junk"
    elements (R-O doesn't address junk).  The same idea is then applied
    recursively to the pieces of the sequences to the left and to the right
    of the matching subsequence.  This does not yield minimal edit
    sequences, but does tend to yield matches that "look right" to people.

    SequenceMatcher tries to compute a "human-friendly diff" between two
    sequences.  Unlike e.g. UNIX(tm) diff, the fundamental notion is the
    longest *contiguous* & junk-free matching subsequence.  That's what
    catches peoples' eyes.  The Windows(tm) windiff has another interesting
    notion, pairing up elements that appear uniquely in each sequence.
    That, and the method here, appear to yield more intuitive difference
    reports than does diff.  This method appears to be the least vulnerable
    to synching up on blocks of "junk lines", though (like blank lines in
    ordinary text files, or maybe "<P>" lines in HTML files).  That may be
    because this is the only method of the 3 that has a *concept* of
    "junk" <wink>.

    Example, comparing two strings, and considering blanks to be "junk":

    >>> s = SequenceMatcher(lambda x: x == " ",
    ...                     "private Thread currentThread;",
    ...                     "private volatile Thread currentThread;")
    >>>

    .ratio() returns a float in [0, 1], measuring the "similarity" of the
    sequences.  As a rule of thumb, a .ratio() value over 0.6 means the
    sequences are close matches:

    >>> print round(s.ratio(), 3)
    0.866
    >>>

    If you're only interested in where the sequences match,
    .get_matching_blocks() is handy:

    >>> for block in s.get_matching_blocks():
    ...     print "a[%d] and b[%d] match for %d elements" % block
    a[0] and b[0] match for 8 elements
    a[8] and b[17] match for 6 elements
    a[14] and b[23] match for 15 elements
    a[29] and b[38] match for 0 elements

    Note that the last tuple returned by .get_matching_blocks() is always a
    dummy, (len(a), len(b), 0), and this is the only case in which the last
    tuple element (number of elements matched) is 0.

    If you want to know how to change the first sequence into the second,
    use .get_opcodes():

    >>> for opcode in s.get_opcodes():
    ...     print "%6s a[%d:%d] b[%d:%d]" % opcode
     equal a[0:8] b[0:8]
    insert a[8:8] b[8:17]
     equal a[8:14] b[17:23]
     equal a[14:29] b[23:38]

    See the Differ class for a fancy human-friendly file differencer, which
    uses SequenceMatcher both to compare sequences of lines, and to compare
    sequences of characters within similar (near-matching) lines.

    See also function get_close_matches() in this module, which shows how
    simple code building on SequenceMatcher can be used to do useful work.

    Timing:  Basic R-O is cubic time worst case and quadratic time expected
    case.  SequenceMatcher is quadratic time for the worst case and has
    expected-case behavior dependent in a complicated way on how many
    elements the sequences have in common; best case time is linear.

    Methods:

    __init__(isjunk=None, a='', b='')
        Construct a SequenceMatcher.

    set_seqs(a, b)
        Set the two sequences to be compared.

    set_seq1(a)
        Set the first sequence to be compared.

    set_seq2(b)
        Set the second sequence to be compared.

    find_longest_match(alo, ahi, blo, bhi)
        Find longest matching block in a[alo:ahi] and b[blo:bhi].

    get_matching_blocks()
        Return list of triples describing matching subsequences.

    get_opcodes()
        Return list of 5-tuples describing how to turn a into b.

    ratio()
        Return a measure of the sequences' similarity (float in [0,1]).

    quick_ratio()
        Return an upper bound on .ratio() relatively quickly.

    real_quick_ratio()
        Return an upper bound on ratio() very quickly.
    """

    def __init__(self, isjunk=None, a='', b=''):
        """Construct a SequenceMatcher.

        Optional arg isjunk is None (the default), or a one-argument
        function that takes a sequence element and returns true iff the
        element is junk.  None is equivalent to passing "lambda x: 0", i.e.
        no elements are considered to be junk.  For example, pass
            lambda x: x in " \\t"
        if you're comparing lines as sequences of characters, and don't
        want to synch up on blanks or hard tabs.

        Optional arg a is the first of two sequences to be compared.  By
        default, an empty string.  The elements of a must be hashable.  See
        also .set_seqs() and .set_seq1().

        Optional arg b is the second of two sequences to be compared.  By
        default, an empty string.  The elements of b must be hashable. See
        also .set_seqs() and .set_seq2().
        """

        # Members:
        # a
        #      first sequence
        # b
        #      second sequence; differences are computed as "what do
        #      we need to do to 'a' to change it into 'b'?"
        # b2j
        #      for x in b, b2j[x] is a list of the indices (into b)
        #      at which x appears; junk elements do not appear
        # fullbcount
        #      for x in b, fullbcount[x] == the number of times x
        #      appears in b; only materialized if really needed (used
        #      only for computing quick_ratio())
        # matching_blocks
        #      a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k];
        #      ascending & non-overlapping in i and in j; terminated by
        #      a dummy (len(a), len(b), 0) sentinel
        # opcodes
        #      a list of (tag, i1, i2, j1, j2) tuples, where tag is
        #      one of
        #          'replace'   a[i1:i2] should be replaced by b[j1:j2]
        #          'delete'    a[i1:i2] should be deleted
        #          'insert'    b[j1:j2] should be inserted
        #          'equal'     a[i1:i2] == b[j1:j2]
        # isjunk
        #      a user-supplied function taking a sequence element and
        #      returning true iff the element is "junk" -- this has
        #      subtle but helpful effects on the algorithm, which I'll
        #      get around to writing up someday <0.9 wink>.
        #      DON'T USE!  Only __chain_b uses this.  Use isbjunk.
        # isbjunk
        #      for x in b, isbjunk(x) == isjunk(x) but much faster;
        #      it's really the has_key method of a hidden dict.
        #      DOES NOT WORK for x in a!
        # isbpopular
        #      for x in b, isbpopular(x) is true iff b is reasonably long
        #      (at least 200 elements) and x accounts for more than 1% of
        #      its elements.  DOES NOT WORK for x in a!

        self.isjunk = isjunk
        # Set both to None so the identity tests in set_seq1/set_seq2
        # can't be fooled on the very first call.
        self.a = self.b = None
        self.set_seqs(a, b)

    def set_seqs(self, a, b):
        """Set the two sequences to be compared.

        >>> s = SequenceMatcher()
        >>> s.set_seqs("abcd", "bcde")
        >>> s.ratio()
        0.75
        """

        self.set_seq1(a)
        self.set_seq2(b)

    def set_seq1(self, a):
        """Set the first sequence to be compared.

        The second sequence to be compared is not changed.

        >>> s = SequenceMatcher(None, "abcd", "bcde")
        >>> s.ratio()
        0.75
        >>> s.set_seq1("bcde")
        >>> s.ratio()
        1.0
        >>>

        SequenceMatcher computes and caches detailed information about the
        second sequence, so if you want to compare one sequence S against
        many sequences, use .set_seq2(S) once and call .set_seq1(x)
        repeatedly for each of the other sequences.

        See also set_seqs() and set_seq2().
        """

        # Identity (not equality) test: a cheap way to skip invalidating
        # the caches when the caller re-passes the very same object.
        if a is self.a:
            return
        self.a = a
        # Cached results depend on both sequences, so drop them.
        self.matching_blocks = self.opcodes = None

    def set_seq2(self, b):
        """Set the second sequence to be compared.

        The first sequence to be compared is not changed.

        >>> s = SequenceMatcher(None, "abcd", "bcde")
        >>> s.ratio()
        0.75
        >>> s.set_seq2("abcd")
        >>> s.ratio()
        1.0
        >>>

        SequenceMatcher computes and caches detailed information about the
        second sequence, so if you want to compare one sequence S against
        many sequences, use .set_seq2(S) once and call .set_seq1(x)
        repeatedly for each of the other sequences.

        See also set_seqs() and set_seq1().
        """

        if b is self.b:
            return
        self.b = b
        self.matching_blocks = self.opcodes = None
        # fullbcount is lazily rebuilt by quick_ratio() when needed.
        self.fullbcount = None
        self.__chain_b()

    # For each element x in b, set b2j[x] to a list of the indices in
    # b where x appears; the indices are in increasing order; note that
    # the number of times x appears in b is len(b2j[x]) ...
    # when self.isjunk is defined, junk elements don't show up in this
    # map at all, which stops the central find_longest_match method
    # from starting any matching block at a junk element ...
    # also creates the fast isbjunk function ...
    # b2j also does not contain entries for "popular" elements, meaning
    # elements that account for more than 1% of the total elements, and
    # when the sequence is reasonably large (>= 200 elements); this can
    # be viewed as an adaptive notion of semi-junk, and yields an enormous
    # speedup when, e.g., comparing program files with hundreds of
    # instances of "return NULL;" ...
    # note that this is only called when b changes; so for cross-product
    # kinds of matches, it's best to call set_seq2 once, then set_seq1
    # repeatedly

    def __chain_b(self):
        # Because isjunk is a user-defined (not C) function, and we test
        # for junk a LOT, it's important to minimize the number of calls.
        # Before the tricks described here, __chain_b was by far the most
        # time-consuming routine in the whole module!  If anyone sees
        # Jim Roskind, thank him again for profile.py -- I never would
        # have guessed that.
        # The first trick is to build b2j ignoring the possibility
        # of junk.  I.e., we don't call isjunk at all yet.  Throwing
        # out the junk later is much cheaper than building b2j "right"
        # from the start.
        b = self.b
        n = len(b)
        self.b2j = b2j = {}
        populardict = {}
        for i, elt in enumerate(b):
            if elt in b2j:
                indices = b2j[elt]
                # "Popular" test: more than 1% of a reasonably long b.
                # Once an element goes popular its index list is emptied
                # and never grows again.
                if n >= 200 and len(indices) * 100 > n:
                    populardict[elt] = 1
                    del indices[:]
                else:
                    indices.append(i)
            else:
                b2j[elt] = [i]

        # Purge leftover indices for popular elements.
        for elt in populardict:
            del b2j[elt]

        # Now b2j.keys() contains elements uniquely, and especially when
        # the sequence is a string, that's usually a good deal smaller
        # than len(string).  The difference is the number of isjunk calls
        # saved.
        isjunk = self.isjunk
        junkdict = {}
        if isjunk:
            for d in populardict, b2j:
                for elt in d.keys():
                    if isjunk(elt):
                        junkdict[elt] = 1
                        del d[elt]

        # Now for x in b, isjunk(x) == x in junkdict, but the
        # latter is much faster.  Note too that while there may be a
        # lot of junk in the sequence, the number of *unique* junk
        # elements is probably small.  So the memory burden of keeping
        # this dict alive is likely trivial compared to the size of b2j.
        # NOTE: dict.has_key is Python 2 only; the bound method is the
        # fastest membership test available here.
        self.isbjunk = junkdict.has_key
        self.isbpopular = populardict.has_key

    def find_longest_match(self, alo, ahi, blo, bhi):
        """Find longest matching block in a[alo:ahi] and b[blo:bhi].

        If isjunk is not defined:

        Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
            alo <= i <= i+k <= ahi
            blo <= j <= j+k <= bhi
        and for all (i',j',k') meeting those conditions,
            k >= k'
            i <= i'
            and if i == i', j <= j'

        In other words, of all maximal matching blocks, return one that
        starts earliest in a, and of all those maximal matching blocks that
        start earliest in a, return the one that starts earliest in b.

        >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (0, 4, 5)

        If isjunk is defined, first the longest matching block is
        determined as above, but with the additional restriction that no
        junk element appears in the block.  Then that block is extended as
        far as possible by matching (only) junk elements on both sides.  So
        the resulting block never matches on junk except as identical junk
        happens to be adjacent to an "interesting" match.

        Here's the same example as before, but considering blanks to be
        junk.  That prevents " abcd" from matching the " abcd" at the tail
        end of the second sequence directly.  Instead only the "abcd" can
        match, and matches the leftmost "abcd" in the second sequence:

        >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (1, 0, 4)

        If no blocks match, return (alo, blo, 0).

        >>> s = SequenceMatcher(None, "ab", "c")
        >>> s.find_longest_match(0, 2, 0, 1)
        (0, 0, 0)
        """

        # CAUTION:  stripping common prefix or suffix would be incorrect.
        # E.g.,
        #    ab
        #    acab
        # Longest matching block is "ab", but if common prefix is
        # stripped, it's "a" (tied with "b").  UNIX(tm) diff does so
        # strip, so ends up claiming that ab is changed to acab by
        # inserting "ca" in the middle.  That's minimal but unintuitive:
        # "it's obvious" that someone inserted "ac" at the front.
        # Windiff ends up at the same place as diff, but by pairing up
        # the unique 'b's and then matching the first two 'a's.

        a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
        besti, bestj, bestsize = alo, blo, 0
        # find longest junk-free match
        # during an iteration of the loop, j2len[j] = length of longest
        # junk-free match ending with a[i-1] and b[j]
        j2len = {}
        nothing = []
        for i in xrange(alo, ahi):
            # look at all instances of a[i] in b; note that because
            # b2j has no junk keys, the loop is skipped if a[i] is junk
            j2lenget = j2len.get
            newj2len = {}
            for j in b2j.get(a[i], nothing):
                # a[i] matches b[j]
                if j < blo:
                    continue
                if j >= bhi:
                    break
                # Dynamic programming step: a match ending at (i, j)
                # extends the best match ending at (i-1, j-1) by one.
                k = newj2len[j] = j2lenget(j-1, 0) + 1
                if k > bestsize:
                    besti, bestj, bestsize = i-k+1, j-k+1, k
            j2len = newj2len

        # Extend the best by non-junk elements on each end.  In particular,
        # "popular" non-junk elements aren't in b2j, which greatly speeds
        # the inner loop above, but also means "the best" match so far
        # doesn't contain any junk *or* popular non-junk elements.
        while besti > alo and bestj > blo and \
              not isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              not isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize += 1

        # Now that we have a wholly interesting match (albeit possibly
        # empty!), we may as well suck up the matching junk on each
        # side of it too.  Can't think of a good reason not to, and it
        # saves post-processing the (possibly considerable) expense of
        # figuring out what to do with it.  In the case of an empty
        # interesting match, this is clearly the right thing to do,
        # because no other kind of match is possible in the regions.
        while besti > alo and bestj > blo and \
              isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize = bestsize + 1

        return besti, bestj, bestsize

    def get_matching_blocks(self):
        """Return list of triples describing matching subsequences.

        Each triple is of the form (i, j, n), and means that
        a[i:i+n] == b[j:j+n].  The triples are monotonically increasing in
        i and in j.

        The last triple is a dummy, (len(a), len(b), 0), and is the only
        triple with n==0.

        >>> s = SequenceMatcher(None, "abxcd", "abcd")
        >>> s.get_matching_blocks()
        [(0, 0, 2), (3, 2, 2), (5, 4, 0)]
        """

        if self.matching_blocks is not None:
            return self.matching_blocks
        self.matching_blocks = []
        la, lb = len(self.a), len(self.b)
        self.__helper(0, la, 0, lb, self.matching_blocks)
        self.matching_blocks.append( (la, lb, 0) )
        return self.matching_blocks

    # builds list of matching blocks covering a[alo:ahi] and
    # b[blo:bhi], appending them in increasing order to answer

    def __helper(self, alo, ahi, blo, bhi, answer):
        # Recursive divide-and-conquer around the longest match:
        # recurse left of it, emit it, recurse right of it.
        i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi)
        # a[alo:i] vs b[blo:j] unknown
        # a[i:i+k] same as b[j:j+k]
        # a[i+k:ahi] vs b[j+k:bhi] unknown
        if k:
            if alo < i and blo < j:
                self.__helper(alo, i, blo, j, answer)
            answer.append(x)
            if i+k < ahi and j+k < bhi:
                self.__helper(i+k, ahi, j+k, bhi, answer)

    def get_opcodes(self):
        """Return list of 5-tuples describing how to turn a into b.

        Each tuple is of the form (tag, i1, i2, j1, j2).  The first tuple
        has i1 == j1 == 0, and remaining tuples have i1 == the i2 from the
        tuple preceding it, and likewise for j1 == the previous j2.

        The tags are strings, with these meanings:

        'replace':  a[i1:i2] should be replaced by b[j1:j2]
        'delete':   a[i1:i2] should be deleted.
                    Note that j1==j2 in this case.
        'insert':   b[j1:j2] should be inserted at a[i1:i1].
                    Note that i1==i2 in this case.
        'equal':    a[i1:i2] == b[j1:j2]

        >>> a = "qabxcd"
        >>> b = "abycdf"
        >>> s = SequenceMatcher(None, a, b)
        >>> for tag, i1, i2, j1, j2 in s.get_opcodes():
        ...    print ("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
        ...           (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2]))
         delete a[0:1] (q) b[0:0] ()
          equal a[1:3] (ab) b[0:2] (ab)
        replace a[3:4] (x) b[2:3] (y)
          equal a[4:6] (cd) b[3:5] (cd)
         insert a[6:6] () b[5:6] (f)
        """

        if self.opcodes is not None:
            return self.opcodes
        i = j = 0
        self.opcodes = answer = []
        for ai, bj, size in self.get_matching_blocks():
            # invariant:  we've pumped out correct diffs to change
            # a[:i] into b[:j], and the next matching block is
            # a[ai:ai+size] == b[bj:bj+size].  So we need to pump
            # out a diff to change a[i:ai] into b[j:bj], pump out
            # the matching block, and move (i,j) beyond the match
            tag = ''
            if i < ai and j < bj:
                tag = 'replace'
            elif i < ai:
                tag = 'delete'
            elif j < bj:
                tag = 'insert'
            if tag:
                answer.append( (tag, i, ai, j, bj) )
            i, j = ai+size, bj+size
            # the list of matching blocks is terminated by a
            # sentinel with size 0
            if size:
                answer.append( ('equal', ai, i, bj, j) )
        return answer

    def get_grouped_opcodes(self, n=3):
        """ Isolate change clusters by eliminating ranges with no changes.

        Return a generator of groups with upto n lines of context.
        Each group is in the same format as returned by get_opcodes().

        >>> from pprint import pprint
        >>> a = map(str, range(1,40))
        >>> b = a[:]
        >>> b[8:8] = ['i']     # Make an insertion
        >>> b[20] += 'x'       # Make a replacement
        >>> b[23:28] = []      # Make a deletion
        >>> b[30] += 'y'       # Make another replacement
        >>> pprint(list(SequenceMatcher(None,a,b).get_grouped_opcodes()))
        [[('equal', 5, 8, 5, 8), ('insert', 8, 8, 8, 9), ('equal', 8, 11, 9, 12)],
         [('equal', 16, 19, 17, 20),
          ('replace', 19, 20, 20, 21),
          ('equal', 20, 22, 21, 23),
          ('delete', 22, 27, 23, 23),
          ('equal', 27, 30, 23, 26)],
         [('equal', 31, 34, 27, 30),
          ('replace', 34, 35, 30, 31),
          ('equal', 35, 38, 31, 34)]]
        """

        codes = self.get_opcodes()
        # get_opcodes() is empty only when both sequences are empty;
        # substitute a dummy 'equal' so the grouping loop has input.
        if not codes:
            codes = [("equal", 0, 1, 0, 1)]
        # Fixup leading and trailing groups if they show no changes.
        if codes[0][0] == 'equal':
            tag, i1, i2, j1, j2 = codes[0]
            codes[0] = tag, max(i1, i2-n), i2, max(j1, j2-n), j2
        if codes[-1][0] == 'equal':
            tag, i1, i2, j1, j2 = codes[-1]
            codes[-1] = tag, i1, min(i2, i1+n), j1, min(j2, j1+n)

        nn = n + n
        group = []
        for tag, i1, i2, j1, j2 in codes:
            # End the current group and start a new one whenever
            # there is a large range with no changes.
            if tag == 'equal' and i2-i1 > nn:
                group.append((tag, i1, min(i2, i1+n), j1, min(j2, j1+n)))
                yield group
                group = []
                i1, j1 = max(i1, i2-n), max(j1, j2-n)
            group.append((tag, i1, i2, j1 ,j2))
        if group and not (len(group)==1 and group[0][0] == 'equal'):
            yield group

    def ratio(self):
        """Return a measure of the sequences' similarity (float in [0,1]).

        Where T is the total number of elements in both sequences, and
        M is the number of matches, this is 2.0*M / T.
        Note that this is 1 if the sequences are identical, and 0 if
        they have nothing in common.

        .ratio() is expensive to compute if you haven't already computed
        .get_matching_blocks() or .get_opcodes(), in which case you may
        want to try .quick_ratio() or .real_quick_ratio() first to get an
        upper bound.

        >>> s = SequenceMatcher(None, "abcd", "bcde")
        >>> s.ratio()
        0.75
        >>> s.quick_ratio()
        0.75
        >>> s.real_quick_ratio()
        1.0
        """

        # Sum the sizes (third element) of all matching blocks.
        matches = reduce(lambda sum, triple: sum + triple[-1],
                         self.get_matching_blocks(), 0)
        return _calculate_ratio(matches, len(self.a) + len(self.b))

    def quick_ratio(self):
        """Return an upper bound on ratio() relatively quickly.

        This isn't defined beyond that it is an upper bound on .ratio(), and
        is faster to compute.
        """

        # viewing a and b as multisets, set matches to the cardinality
        # of their intersection; this counts the number of matches
        # without regard to order, so is clearly an upper bound
        if self.fullbcount is None:
            self.fullbcount = fullbcount = {}
            for elt in self.b:
                fullbcount[elt] = fullbcount.get(elt, 0) + 1
        fullbcount = self.fullbcount
        # avail[x] is the number of times x appears in 'b' less the
        # number of times we've seen it in 'a' so far ... kinda
        avail = {}
        availhas, matches = avail.has_key, 0
        for elt in self.a:
            if availhas(elt):
                numb = avail[elt]
            else:
                numb = fullbcount.get(elt, 0)
            avail[elt] = numb - 1
            if numb > 0:
                matches = matches + 1
        return _calculate_ratio(matches, len(self.a) + len(self.b))

    def real_quick_ratio(self):
        """Return an upper bound on ratio() very quickly.

        This isn't defined beyond that it is an upper bound on .ratio(), and
        is faster to compute than either .ratio() or .quick_ratio().
        """

        la, lb = len(self.a), len(self.b)
        # can't have more matches than the number of elements in the
        # shorter sequence
        return _calculate_ratio(min(la, lb), la + lb)
0668 
def get_close_matches(word, possibilities, n=3, cutoff=0.6):
    """Use SequenceMatcher to return list of the best "good enough" matches.

    word is a sequence for which close matches are desired (typically a
    string).

    possibilities is a list of sequences against which to match word
    (typically a list of strings).

    Optional arg n (default 3) is the maximum number of close matches to
    return.  n must be > 0.

    Optional arg cutoff (default 0.6) is a float in [0, 1].  Possibilities
    that don't score at least that similar to word are ignored.

    The best (no more than n) matches among the possibilities are returned
    in a list, sorted by similarity score, most similar first.

    >>> get_close_matches("appel", ["ape", "apple", "peach", "puppy"])
    ['apple', 'ape']
    >>> import keyword as _keyword
    >>> get_close_matches("wheel", _keyword.kwlist)
    ['while']
    >>> get_close_matches("apple", _keyword.kwlist)
    []
    >>> get_close_matches("accept", _keyword.kwlist)
    ['except']
    """

    # Validate arguments before doing any work.
    if not n >  0:
        raise ValueError("n must be > 0: %r" % (n,))
    if not 0.0 <= cutoff <= 1.0:
        raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))

    # Cache the expensive per-sequence analysis of `word` once (seq2),
    # then swap each candidate in as seq1.
    matcher = SequenceMatcher()
    matcher.set_seq2(word)
    scored = []
    for candidate in possibilities:
        matcher.set_seq1(candidate)
        # Cheapest bound first: each test is a tighter (and costlier)
        # upper bound on the one after it, so most losers exit early.
        if matcher.real_quick_ratio() >= cutoff and \
           matcher.quick_ratio() >= cutoff and \
           matcher.ratio() >= cutoff:
            scored.append((matcher.ratio(), candidate))

    # Keep only the n best scorers, highest similarity first, then
    # drop the scores from the result.
    best = heapq.nlargest(n, scored)
    return [candidate for score, candidate in best]
0716 
0717 def _count_leading(line, ch):
0718     """
0719     Return number of `ch` characters at the start of `line`.
0720 
0721     Example:
0722 
0723     >>> _count_leading('   abc', ' ')
0724     3
0725     """
0726 
0727     i, n = 0, len(line)
0728     while i < n and line[i] == ch:
0729         i += 1
0730     return i
0731 
class Differ:
    r"""
    Differ is a class for comparing sequences of lines of text, and
    producing human-readable differences or deltas.  Differ uses
    SequenceMatcher both to compare sequences of lines, and to compare
    sequences of characters within similar (near-matching) lines.

    Each line of a Differ delta begins with a two-letter code:

        '- '    line unique to sequence 1
        '+ '    line unique to sequence 2
        '  '    line common to both sequences
        '? '    line not present in either input sequence

    Lines beginning with '? ' attempt to guide the eye to intraline
    differences, and were not present in either input sequence.  These lines
    can be confusing if the sequences contain tab characters.

    Note that Differ makes no claim to produce a *minimal* diff.  To the
    contrary, minimal diffs are often counter-intuitive, because they synch
    up anywhere possible, sometimes accidental matches 100 pages apart.
    Restricting synch points to contiguous matches preserves some notion of
    locality, at the occasional cost of producing a longer diff.

    Example: Comparing two texts.

    First we set up the texts, sequences of individual single-line strings
    ending with newlines (such sequences can also be obtained from the
    `readlines()` method of file-like objects):

    >>> text1 = '''  1. Beautiful is better than ugly.
    ...   2. Explicit is better than implicit.
    ...   3. Simple is better than complex.
    ...   4. Complex is better than complicated.
    ... '''.splitlines(1)
    >>> len(text1)
    4
    >>> text1[0][-1]
    '\n'
    >>> text2 = '''  1. Beautiful is better than ugly.
    ...   3.   Simple is better than complex.
    ...   4. Complicated is better than complex.
    ...   5. Flat is better than nested.
    ... '''.splitlines(1)

    Next we instantiate a Differ object:

    >>> d = Differ()

    Note that when instantiating a Differ object we may pass functions to
    filter out line and character 'junk'.  See Differ.__init__ for details.

    Finally, we compare the two:

    >>> result = list(d.compare(text1, text2))

    'result' is a list of strings, so let's pretty-print it:

    >>> from pprint import pprint as _pprint
    >>> _pprint(result)
    ['    1. Beautiful is better than ugly.\n',
     '-   2. Explicit is better than implicit.\n',
     '-   3. Simple is better than complex.\n',
     '+   3.   Simple is better than complex.\n',
     '?     ++\n',
     '-   4. Complex is better than complicated.\n',
     '?            ^                     ---- ^\n',
     '+   4. Complicated is better than complex.\n',
     '?           ++++ ^                      ^\n',
     '+   5. Flat is better than nested.\n']

    As a single multi-line string it looks like this:

    >>> print ''.join(result),
        1. Beautiful is better than ugly.
    -   2. Explicit is better than implicit.
    -   3. Simple is better than complex.
    +   3.   Simple is better than complex.
    ?     ++
    -   4. Complex is better than complicated.
    ?            ^                     ---- ^
    +   4. Complicated is better than complex.
    ?           ++++ ^                      ^
    +   5. Flat is better than nested.

    Methods:

    __init__(linejunk=None, charjunk=None)
        Construct a text differencer, with optional filters.

    compare(a, b)
        Compare two sequences of lines; generate the resulting delta.
    """

    def __init__(self, linejunk=None, charjunk=None):
        """
        Construct a text differencer, with optional filters.

        The two optional keyword parameters are for filter functions:

        - `linejunk`: A function that should accept a single string argument,
          and return true iff the string is junk. The module-level function
          `IS_LINE_JUNK` may be used to filter out lines without visible
          characters, except for at most one splat ('#').  It is recommended
          to leave linejunk None; as of Python 2.3, the underlying
          SequenceMatcher class has grown an adaptive notion of "noise" lines
          that's better than any static definition the author has ever been
          able to craft.

        - `charjunk`: A function that should accept a string of length 1. The
          module-level function `IS_CHARACTER_JUNK` may be used to filter out
          whitespace characters (a blank or tab; **note**: bad idea to include
          newline in this!).  Use of IS_CHARACTER_JUNK is recommended.
        """

        # Both filters may be None; they are handed directly to
        # SequenceMatcher in compare() / _fancy_replace().
        self.linejunk = linejunk
        self.charjunk = charjunk

    def compare(self, a, b):
        r"""
        Compare two sequences of lines; generate the resulting delta.

        Each sequence must contain individual single-line strings ending with
        newlines. Such sequences can be obtained from the `readlines()` method
        of file-like objects.  The delta generated also consists of newline-
        terminated strings, ready to be printed as-is via the writeline()
        method of a file-like object.

        Example:

        >>> print ''.join(Differ().compare('one\ntwo\nthree\n'.splitlines(1),
        ...                                'ore\ntree\nemu\n'.splitlines(1))),
        - one
        ?  ^
        + ore
        ?  ^
        - two
        - three
        ?  -
        + tree
        + emu
        """

        # Line-level comparison; each opcode describes how a[alo:ahi]
        # relates to b[blo:bhi].
        cruncher = SequenceMatcher(self.linejunk, a, b)
        for tag, alo, ahi, blo, bhi in cruncher.get_opcodes():
            if tag == 'replace':
                # Replaced ranges get the expensive intraline treatment.
                g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
            elif tag == 'delete':
                g = self._dump('-', a, alo, ahi)
            elif tag == 'insert':
                g = self._dump('+', b, blo, bhi)
            elif tag == 'equal':
                g = self._dump(' ', a, alo, ahi)
            else:
                raise ValueError, 'unknown tag %r' % (tag,)

            for line in g:
                yield line

    def _dump(self, tag, x, lo, hi):
        """Generate comparison results for a same-tagged range."""
        # Each line in x[lo:hi] is prefixed with the two-letter code.
        for i in xrange(lo, hi):
            yield '%s %s' % (tag, x[i])

    def _plain_replace(self, a, alo, ahi, b, blo, bhi):
        """
        Yield '-' lines for a[alo:ahi] and '+' lines for b[blo:bhi].

        Used when _fancy_replace finds no pair of lines similar enough
        to synch on; the shorter block is emitted first.
        """
        assert alo < ahi and blo < bhi
        # dump the shorter block first -- reduces the burden on short-term
        # memory if the blocks are of very different sizes
        if bhi - blo < ahi - alo:
            first  = self._dump('+', b, blo, bhi)
            second = self._dump('-', a, alo, ahi)
        else:
            first  = self._dump('-', a, alo, ahi)
            second = self._dump('+', b, blo, bhi)

        for g in first, second:
            for line in g:
                yield line

    def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
        r"""
        When replacing one block of lines with another, search the blocks
        for *similar* lines; the best-matching pair (if any) is used as a
        synch point, and intraline difference marking is done on the
        similar pair. Lots of work, but often worth it.

        Example:

        >>> d = Differ()
        >>> results = d._fancy_replace(['abcDefghiJkl\n'], 0, 1,
        ...                            ['abcdefGhijkl\n'], 0, 1)
        >>> print ''.join(results),
        - abcDefghiJkl
        ?    ^  ^  ^
        + abcdefGhijkl
        ?    ^  ^  ^
        """

        # don't synch up unless the lines have a similarity score of at
        # least cutoff; best_ratio tracks the best score seen so far
        best_ratio, cutoff = 0.74, 0.75
        cruncher = SequenceMatcher(self.charjunk)
        eqi, eqj = None, None   # 1st indices of equal lines (if any)

        # search for the pair that matches best without being identical
        # (identical lines must be junk lines, & we don't want to synch up
        # on junk -- unless we have to)
        for j in xrange(blo, bhi):
            bj = b[j]
            cruncher.set_seq2(bj)
            for i in xrange(alo, ahi):
                ai = a[i]
                if ai == bj:
                    if eqi is None:
                        eqi, eqj = i, j
                    continue
                cruncher.set_seq1(ai)
                # computing similarity is expensive, so use the quick
                # upper bounds first -- have seen this speed up messy
                # compares by a factor of 3.
                # note that ratio() is only expensive to compute the first
                # time it's called on a sequence pair; the expensive part
                # of the computation is cached by cruncher
                if cruncher.real_quick_ratio() > best_ratio and \
                      cruncher.quick_ratio() > best_ratio and \
                      cruncher.ratio() > best_ratio:
                    best_ratio, best_i, best_j = cruncher.ratio(), i, j
        if best_ratio < cutoff:
            # no non-identical "pretty close" pair
            if eqi is None:
                # no identical pair either -- treat it as a straight replace
                for line in self._plain_replace(a, alo, ahi, b, blo, bhi):
                    yield line
                return
            # no close pair, but an identical pair -- synch up on that
            best_i, best_j, best_ratio = eqi, eqj, 1.0
        else:
            # there's a close pair, so forget the identical pair (if any)
            eqi = None

        # a[best_i] very similar to b[best_j]; eqi is None iff they're not
        # identical

        # pump out diffs from before the synch point
        for line in self._fancy_helper(a, alo, best_i, b, blo, best_j):
            yield line

        # do intraline marking on the synch pair
        aelt, belt = a[best_i], b[best_j]
        if eqi is None:
            # pump out a '-', '?', '+', '?' quad for the synched lines
            atags = btags = ""
            cruncher.set_seqs(aelt, belt)
            for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
                la, lb = ai2 - ai1, bj2 - bj1
                if tag == 'replace':
                    atags += '^' * la
                    btags += '^' * lb
                elif tag == 'delete':
                    atags += '-' * la
                elif tag == 'insert':
                    btags += '+' * lb
                elif tag == 'equal':
                    atags += ' ' * la
                    btags += ' ' * lb
                else:
                    raise ValueError, 'unknown tag %r' % (tag,)
            for line in self._qformat(aelt, belt, atags, btags):
                yield line
        else:
            # the synch pair is identical
            yield '  ' + aelt

        # pump out diffs from after the synch point
        for line in self._fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi):
            yield line

    def _fancy_helper(self, a, alo, ahi, b, blo, bhi):
        """
        Diff a[alo:ahi] against b[blo:bhi], tolerating empty ranges.

        Dispatches to _fancy_replace when both ranges are non-empty;
        otherwise dumps whichever side remains (or yields nothing when
        both ranges are empty).
        """
        g = []
        if alo < ahi:
            if blo < bhi:
                g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
            else:
                g = self._dump('-', a, alo, ahi)
        elif blo < bhi:
            g = self._dump('+', b, blo, bhi)

        for line in g:
            yield line

    def _qformat(self, aline, bline, atags, btags):
        r"""
        Format "?" output and deal with leading tabs.

        Example:

        >>> d = Differ()
        >>> results = d._qformat('\tabcDefghiJkl\n', '\t\tabcdefGhijkl\n',
        ...                      '  ^ ^  ^      ', '+  ^ ^  ^      ')
        >>> for line in results: print repr(line)
        ...
        '- \tabcDefghiJkl\n'
        '? \t ^ ^  ^\n'
        '+ \t\tabcdefGhijkl\n'
        '? \t  ^ ^  ^\n'
        """

        # Can hurt, but will probably help most of the time.
        # Drop tabs common to both lines so the '?' guide lines can use
        # real tabs and still line up under the text.
        common = min(_count_leading(aline, "\t"),
                     _count_leading(bline, "\t"))
        common = min(common, _count_leading(atags[:common], " "))
        atags = atags[common:].rstrip()
        btags = btags[common:].rstrip()

        yield "- " + aline
        if atags:
            yield "? %s%s\n" % ("\t" * common, atags)

        yield "+ " + bline
        if btags:
            yield "? %s%s\n" % ("\t" * common, btags)
1053 
1054 # With respect to junk, an earlier version of ndiff simply refused to
1055 # *start* a match with a junk element.  The result was cases like this:
1056 #     before: private Thread currentThread;
1057 #     after:  private volatile Thread currentThread;
1058 # If you consider whitespace to be junk, the longest contiguous match
1059 # not starting with junk is "e Thread currentThread".  So ndiff reported
1060 # that "e volatil" was inserted between the 't' and the 'e' in "private".
1061 # While an accurate view, to people that's absurd.  The current version
1062 # looks for matching blocks that are entirely junk-free, then extends the
1063 # longest one of those as far as possible but only with matching junk.
1064 # So now "currentThread" is matched, then extended to suck up the
1065 # preceding blank; then "private" is matched, and extended to suck up the
1066 # following blank; then "Thread" is matched; and finally ndiff reports
1067 # that "volatile " was inserted before "Thread".  The only quibble
1068 # remaining is that perhaps it was really the case that " volatile"
1069 # was inserted after "private".  I can live with that <wink>.
1070 
1071 import re
1072 
def IS_LINE_JUNK(line, pat=re.compile(r"\s*#?\s*$").match):
    r"""
    Return True for an ignorable line: iff `line` is blank or contains
    nothing but optional whitespace around at most a single '#'.

    Examples:

    >>> IS_LINE_JUNK('\n')
    True
    >>> IS_LINE_JUNK('  #   \n')
    True
    >>> IS_LINE_JUNK('hello\n')
    False
    """

    # `pat` is bound once at definition time to the compiled matcher.
    match = pat(line)
    return match is not None
1088 
def IS_CHARACTER_JUNK(ch, ws=" \t"):
    r"""
    Return True for an ignorable character: iff `ch` is a space or tab.

    Examples:

    >>> IS_CHARACTER_JUNK(' ')
    True
    >>> IS_CHARACTER_JUNK('\t')
    True
    >>> IS_CHARACTER_JUNK('\n')
    False
    >>> IS_CHARACTER_JUNK('x')
    False
    """

    # Membership in the default whitespace string `ws` (blank and tab;
    # deliberately NOT newline -- see Differ.__init__ docs).
    return ws.find(ch) >= 0
1106 
1107 
def unified_diff(a, b, fromfile='', tofile='', fromfiledate='',
                 tofiledate='', n=3, lineterm='\n'):
    r"""
    Compare two sequences of lines; generate the delta as a unified diff.

    Unified diffs are a compact way of showing line changes plus a few
    lines of context; `n` (default 3) sets the amount of context.

    The diff control lines (those with ---, +++, or @@) end with
    `lineterm` (default '\n'), so that inputs coming from file.readlines()
    yield output suitable for file.writelines().  For inputs without
    trailing newlines, pass lineterm="" to keep the output newline free.

    `fromfile`, `tofile`, `fromfiledate`, and `tofiledate` fill in the
    unidiff filename/modification-time header; any may be left as ''.
    The dates are normally in the format returned by time.ctime().

    Example:

    >>> for line in unified_diff('one two three four'.split(),
    ...             'zero one tree four'.split(), 'Original', 'Current',
    ...             'Sat Jan 26 23:30:50 1991', 'Fri Jun 06 10:20:52 2003',
    ...             lineterm=''):
    ...     print line
    --- Original Sat Jan 26 23:30:50 1991
    +++ Current Fri Jun 06 10:20:52 2003
    @@ -1,4 +1,4 @@
    +zero
     one
    -two
    -three
    +tree
     four
    """

    header_done = False
    for group in SequenceMatcher(None, a, b).get_grouped_opcodes(n):
        # Emit the two-line file header once, ahead of the first hunk.
        if not header_done:
            header_done = True
            yield '--- %s %s%s' % (fromfile, fromfiledate, lineterm)
            yield '+++ %s %s%s' % (tofile, tofiledate, lineterm)
        # The hunk covers the full span of this opcode group on each side.
        first, last = group[0], group[-1]
        yield "@@ -%d,%d +%d,%d @@%s" % (first[1] + 1, last[2] - first[1],
                                         first[3] + 1, last[4] - first[3],
                                         lineterm)
        for tag, i1, i2, j1, j2 in group:
            if tag == 'equal':
                # Context lines come from the `a` side only.
                for line in a[i1:i2]:
                    yield ' ' + line
            else:
                if tag in ('replace', 'delete'):
                    for line in a[i1:i2]:
                        yield '-' + line
                if tag in ('replace', 'insert'):
                    for line in b[j1:j2]:
                        yield '+' + line
1168 
1169 # See http://www.unix.org/single_unix_specification/
# See http://www.unix.org/single_unix_specification/
def context_diff(a, b, fromfile='', tofile='',
                 fromfiledate='', tofiledate='', n=3, lineterm='\n'):
    r"""
    Compare two sequences of lines; generate the delta as a context diff.

    Context diffs show line changes plus a few lines of context; `n`
    (default 3) sets the amount of context.

    The diff control lines (those with *** or ---) end with `lineterm`
    (default '\n'), so that inputs coming from file.readlines() yield
    output suitable for file.writelines().  For inputs without trailing
    newlines, pass lineterm="" to keep the output newline free.

    `fromfile`, `tofile`, `fromfiledate`, and `tofiledate` fill in the
    filename/modification-time header; unspecified strings default to
    blanks.  The dates are normally in the format returned by
    time.ctime().

    Example:

    >>> print ''.join(context_diff('one\ntwo\nthree\nfour\n'.splitlines(1),
    ...       'zero\none\ntree\nfour\n'.splitlines(1), 'Original', 'Current',
    ...       'Sat Jan 26 23:30:50 1991', 'Fri Jun 06 10:22:46 2003')),
    *** Original Sat Jan 26 23:30:50 1991
    --- Current Fri Jun 06 10:22:46 2003
    ***************
    *** 1,4 ****
      one
    ! two
    ! three
      four
    --- 1,4 ----
    + zero
      one
    ! tree
      four
    """

    header_done = False
    prefix = {'insert':'+ ', 'delete':'- ', 'replace':'! ', 'equal':'  '}
    for group in SequenceMatcher(None, a, b).get_grouped_opcodes(n):
        # Emit the two-line file header once, ahead of the first hunk.
        if not header_done:
            header_done = True
            yield '*** %s %s%s' % (fromfile, fromfiledate, lineterm)
            yield '--- %s %s%s' % (tofile, tofiledate, lineterm)

        yield '***************%s' % (lineterm,)

        # "From" half of the hunk; span over the whole opcode group.
        a_lo, a_hi = group[0][1], group[-1][2]
        if a_hi - a_lo >= 2:
            yield '*** %d,%d ****%s' % (a_lo + 1, a_hi, lineterm)
        else:
            yield '*** %d ****%s' % (a_hi, lineterm)
        # Only print the body when something on this side changed.
        changed = [op for op in group if op[0] in ('replace', 'delete')]
        if changed:
            for tag, i1, i2, _, _ in group:
                if tag != 'insert':
                    for line in a[i1:i2]:
                        yield prefix[tag] + line

        # "To" half of the hunk.
        b_lo, b_hi = group[0][3], group[-1][4]
        if b_hi - b_lo >= 2:
            yield '--- %d,%d ----%s' % (b_lo + 1, b_hi, lineterm)
        else:
            yield '--- %d ----%s' % (b_hi, lineterm)
        changed = [op for op in group if op[0] in ('replace', 'insert')]
        if changed:
            for tag, _, _, j1, j2 in group:
                if tag != 'delete':
                    for line in b[j1:j2]:
                        yield prefix[tag] + line
1244 
def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK):
    r"""
    Compare `a` and `b` (lists of strings); return a `Differ`-style delta.

    The optional keyword parameters `linejunk` and `charjunk` are filter
    functions (or None), passed straight to the Differ constructor:

    - linejunk: accepts a single string argument and returns true iff the
      string is junk.  The default None is recommended; as of Python 2.3
      an adaptive notion of "noise" lines is used that does a good job on
      its own.

    - charjunk: accepts a string of length 1.  The default is the
      module-level function IS_CHARACTER_JUNK, which filters out
      whitespace characters (a blank or tab; note: bad idea to include
      newline in this!).

    Tools/scripts/ndiff.py is a command-line front-end to this function.

    Example:

    >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
    ...              'ore\ntree\nemu\n'.splitlines(1))
    >>> print ''.join(diff),
    - one
    ?  ^
    + ore
    ?  ^
    - two
    - three
    ?  -
    + tree
    + emu
    """
    differ = Differ(linejunk, charjunk)
    return differ.compare(a, b)
1280 
1281 def _mdiff(fromlines, tolines, context=None, linejunk=None,
1282            charjunk=IS_CHARACTER_JUNK):
1283     """Returns generator yielding marked up from/to side by side differences.
1284 
1285     Arguments:
1286     fromlines -- list of text lines to compared to tolines
1287     tolines -- list of text lines to be compared to fromlines
1288     context -- number of context lines to display on each side of difference,
1289                if None, all from/to text lines will be generated.
1290     linejunk -- passed on to ndiff (see ndiff documentation)
1291     charjunk -- passed on to ndiff (see ndiff documentation)
1292 
1293     This function returns an interator which returns a tuple:
1294     (from line tuple, to line tuple, boolean flag)
1295 
1296     from/to line tuple -- (line num, line text)
1297         line num -- integer or None (to indicate a context seperation)
1298         line text -- original line text with following markers inserted:
1299             '\0+' -- marks start of added text
1300             '\0-' -- marks start of deleted text
1301             '\0^' -- marks start of changed text
1302             '\1' -- marks end of added/deleted/changed text
1303 
1304     boolean flag -- None indicates context separation, True indicates
1305         either "from" or "to" line contains a change, otherwise False.
1306 
1307     This function/iterator was originally developed to generate side by side
1308     file difference for making HTML pages (see HtmlDiff class for example
1309     usage).
1310 
1311     Note, this function utilizes the ndiff function to generate the side by
1312     side difference markup.  Optional ndiff arguments may be passed to this
1313     function and they in turn will be passed to ndiff.
1314     """
1315     import re
1316 
1317     # regular expression for finding intraline change indices
1318     change_re = re.compile('(\++|\-+|\^+)')
1319 
1320     # create the difference iterator to generate the differences
1321     diff_lines_iterator = ndiff(fromlines,tolines,linejunk,charjunk)
1322 
1323     def _make_line(lines, format_key, side, num_lines=[0,0]):
1324         """Returns line of text with user's change markup and line formatting.
1325 
1326         lines -- list of lines from the ndiff generator to produce a line of
1327                  text from.  When producing the line of text to return, the
1328                  lines used are removed from this list.
1329         format_key -- '+' return first line in list with "add" markup around
1330                           the entire line.
1331                       '-' return first line in list with "delete" markup around
1332                           the entire line.
1333                       '?' return first line in list with add/delete/change
1334                           intraline markup (indices obtained from second line)
1335                       None return first line in list with no markup
1336         side -- indice into the num_lines list (0=from,1=to)
1337         num_lines -- from/to current line number.  This is NOT intended to be a
1338                      passed parameter.  It is present as a keyword argument to
1339                      maintain memory of the current line numbers between calls
1340                      of this function.
1341 
1342         Note, this function is purposefully not defined at the module scope so
1343         that data it needs from its parent function (within whose context it
1344         is defined) does not need to be of module scope.
1345         """
1346         num_lines[side] += 1
1347         # Handle case where no user markup is to be added, just return line of
1348         # text with user's line format to allow for usage of the line number.
1349         if format_key is None:
1350             return (num_lines[side],lines.pop(0)[2:])
1351         # Handle case of intraline changes
1352         if format_key == '?':
1353             text, markers = lines.pop(0), lines.pop(0)
1354             # find intraline changes (store change type and indices in tuples)
1355             sub_info = []
1356             def record_sub_info(match_object,sub_info=sub_info):
1357                 sub_info.append([match_object.group(1)[0],match_object.span()])
1358                 return match_object.group(1)
1359             change_re.sub(record_sub_info,markers)
1360             # process each tuple inserting our special marks that won't be
1361             # noticed by an xml/html escaper.
1362             for key,(begin,end) in sub_info[::-1]:
1363                 text = text[0:begin]+'\0'+key+text[begin:end]+'\1'+text[end:]
1364             text = text[2:]
1365         # Handle case of add/delete entire line
1366         else:
1367             text = lines.pop(0)[2:]
1368             # if line of text is just a newline, insert a space so there is
1369             # something for the user to highlight and see.
1370             if len(text) <= 1:
1371                 text = ' '+text
1372             # insert marks that won't be noticed by an xml/html escaper.
1373             text = '\0' + format_key + text + '\1'
1374         # Return line of text, first allow user's line formatter to do it's
1375         # thing (such as adding the line number) then replace the special
1376         # marks with what the user's change markup.
1377         return (num_lines[side],text)
1378 
1379     def _line_iterator():
1380         """Yields from/to lines of text with a change indication.
1381 
1382         This function is an iterator.  It itself pulls lines from a
1383         differencing iterator, processes them and yields them.  When it can
1384         it yields both a "from" and a "to" line, otherwise it will yield one
1385         or the other.  In addition to yielding the lines of from/to text, a
1386         boolean flag is yielded to indicate if the text line(s) have
1387         differences in them.
1388 
1389         Note, this function is purposefully not defined at the module scope so
1390         that data it needs from its parent function (within whose context it
1391         is defined) does not need to be of module scope.
1392         """
1393         lines = []
1394         num_blanks_pending, num_blanks_to_yield = 0, 0
1395         while True:
1396             # Load up next 4 lines so we can look ahead, create strings which
1397             # are a concatenation of the first character of each of the 4 lines
1398             # so we can do some very readable comparisons.
1399             while len(lines) < 4:
1400                 try:
1401                     lines.append(diff_lines_iterator.next())
1402                 except StopIteration:
1403                     lines.append('X')
1404             s = ''.join([line[0] for line in lines])
1405             if s.startswith('X'):
1406                 # When no more lines, pump out any remaining blank lines so the
1407                 # corresponding add/delete lines get a matching blank line so
1408                 # all line pairs get yielded at the next level.
1409                 num_blanks_to_yield = num_blanks_pending
1410             elif s.startswith('-?+?'):
1411                 # simple intraline change
1412                 yield _make_line(lines,'?',0), _make_line(lines,'?',1), True
1413                 continue
1414             elif s.startswith('--++'):
1415                 # in delete block, add block coming: we do NOT want to get
1416                 # caught up on blank lines yet, just process the delete line
1417                 num_blanks_pending -= 1
1418                 yield _make_line(lines,'-',0), None, True
1419                 continue
1420             elif s.startswith('--?+') or s.startswith('--+') or \
1421                  s.startswith('- '):
1422                 # in delete block and see a intraline change or unchanged line
1423                 # coming: yield the delete line and then blanks
1424                 from_line,to_line = _make_line(lines,'-',0), None
1425                 num_blanks_to_yield,num_blanks_pending = num_blanks_pending-1,0
1426             elif s.startswith('-+?'):
1427                 # intraline change
1428                 yield _make_line(lines,None,0), _make_line(lines,'?',1), True
1429                 continue
1430             elif s.startswith('-?+'):
1431                 # intraline change
1432                 yield _make_line(lines,'?',0), _make_line(lines,None,1), True
1433                 continue
1434             elif s.startswith('-'):
1435                 # delete FROM line
1436                 num_blanks_pending -= 1
1437                 yield _make_line(lines,'-',0), None, True
1438                 continue
1439             elif s.startswith('+--'):
1440                 # in add block, delete block coming: we do NOT want to get
1441                 # caught up on blank lines yet, just process the add line
1442                 num_blanks_pending += 1
1443                 yield None, _make_line(lines,'+',1), True
1444                 continue
1445             elif s.startswith('+ ') or s.startswith('+-'):
1446                 # will be leaving an add block: yield blanks then add line
1447                 from_line, to_line = None, _make_line(lines,'+',1)
1448                 num_blanks_to_yield,num_blanks_pending = num_blanks_pending+1,0
1449             elif s.startswith('+'):
1450                 # inside an add block, yield the add line
1451                 num_blanks_pending += 1
1452                 yield None, _make_line(lines,'+',1), True
1453                 continue
1454             elif s.startswith(' '):
1455                 # unchanged text, yield it to both sides
1456                 yield _make_line(lines[:],None,0),_make_line(lines,None,1),False
1457                 continue
1458             # Catch up on the blank lines so when we yield the next from/to
1459             # pair, they are lined up.
1460             while(num_blanks_to_yield < 0):
1461                 num_blanks_to_yield += 1
1462                 yield None,('','\n'),True
1463             while(num_blanks_to_yield > 0):
1464                 num_blanks_to_yield -= 1
1465                 yield ('','\n'),None,True
1466             if s.startswith('X'):
1467                 raise StopIteration
1468             else:
1469                 yield from_line,to_line,True
1470 
1471     def _line_pair_iterator():
1472         """Yields from/to lines of text with a change indication.
1473 
1474         This function is an iterator.  It itself pulls lines from the line
1475         iterator.  It's difference from that iterator is that this function
1476         always yields a pair of from/to text lines (with the change
1477         indication).  If necessary it will collect single from/to lines
1478         until it has a matching pair from/to pair to yield.
1479 
1480         Note, this function is purposefully not defined at the module scope so
1481         that data it needs from its parent function (within whose context it
1482         is defined) does not need to be of module scope.
1483         """
1484         line_iterator = _line_iterator()
1485         fromlines,tolines=[],[]
1486         while True:
1487             # Collecting lines of text until we have a from/to pair
1488             while (len(fromlines)==0 or len(tolines)==0):
1489                 from_line, to_line, found_diff =line_iterator.next()
1490                 if from_line is not None:
1491                     fromlines.append((from_line,found_diff))
1492                 if to_line is not None:
1493                     tolines.append((to_line,found_diff))
1494             # Once we have a pair, remove them from the collection and yield it
1495             from_line, fromDiff = fromlines.pop(0)
1496             to_line, to_diff = tolines.pop(0)
1497             yield (from_line,to_line,fromDiff or to_diff)
1498 
1499     # Handle case where user does not want context differencing, just yield
1500     # them up without doing anything else with them.
1501     line_pair_iterator = _line_pair_iterator()
1502     if context is None:
1503         while True:
1504             yield line_pair_iterator.next()
1505     # Handle case where user wants context differencing.  We must do some
1506     # storage of lines until we know for sure that they are to be yielded.
1507     else:
1508         context += 1
1509         lines_to_write = 0
1510         while True:
1511             # Store lines up until we find a difference, note use of a
1512             # circular queue because we only need to keep around what
1513             # we need for context.
1514             index, contextLines = 0, [None]*(context)
1515             found_diff = False
1516             while(found_diff is False):
1517                 from_line, to_line, found_diff = line_pair_iterator.next()
1518                 i = index % context
1519                 contextLines[i] = (from_line, to_line, found_diff)
1520                 index += 1
1521             # Yield lines that we have collected so far, but first yield
1522             # the user's separator.
1523             if index > context:
1524                 yield None, None, None
1525                 lines_to_write = context
1526             else:
1527                 lines_to_write = index
1528                 index = 0
1529             while(lines_to_write):
1530                 i = index % context
1531                 index += 1
1532                 yield contextLines[i]
1533                 lines_to_write -= 1
1534             # Now yield the context lines after the change
1535             lines_to_write = context-1
1536             while(lines_to_write):
1537                 from_line, to_line, found_diff = line_pair_iterator.next()
1538                 # If another change within the context, extend the context
1539                 if found_diff:
1540                     lines_to_write = context-1
1541                 else:
1542                     lines_to_write -= 1
1543                 yield from_line, to_line, found_diff
1544 
1545 
# Template for a complete standalone XHTML 1.0 Transitional document.
# The %(styles)s, %(table)s and %(legend)s slots are filled in by
# HtmlDiff.make_file() below.
_file_template = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
          "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html>

<head>
    <meta http-equiv="Content-Type"
          content="text/html; charset=ISO-8859-1" />
    <title></title>
    <style type="text/css">%(styles)s
    </style>
</head>

<body>
    %(table)s%(legend)s
</body>

</html>"""

# Default CSS injected into the <style> element of _file_template; the
# class names (diff_add, diff_chg, diff_sub, ...) match the markup
# emitted by HtmlDiff._format_line() and HtmlDiff.make_table().
_styles = """
        table.diff {font-family:Courier; border:medium;}
        .diff_header {background-color:#e0e0e0}
        td.diff_header {text-align:right}
        .diff_next {background-color:#c0c0c0}
        .diff_add {background-color:#aaffaa}
        .diff_chg {background-color:#ffff77}
        .diff_sub {background-color:#ffaaaa}"""

# Skeleton for the side-by-side comparison table; %(prefix)s is the
# unique anchor prefix, %(header_row)s the optional <thead> row and
# %(data_rows)s the formatted <tr> rows built by HtmlDiff.make_table().
_table_template = """
    <table class="diff" id="difflib_chg_%(prefix)s_top"
           cellspacing="0" cellpadding="0" rules="groups" >
        <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup>
        <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup>
        %(header_row)s
        <tbody>
%(data_rows)s        </tbody>
    </table>"""

# Static legend table explaining the highlight colors and the
# (f)irst / (n)ext / (t)op navigation links.
_legend = """
    <table class="diff" summary="Legends">
        <tr> <th colspan="2"> Legends </th> </tr>
        <tr> <td> <table border="" summary="Colors">
                      <tr><th> Colors </th> </tr>
                      <tr><td class="diff_add">&nbsp;Added&nbsp;</td></tr>
                      <tr><td class="diff_chg">Changed</td> </tr>
                      <tr><td class="diff_sub">Deleted</td> </tr>
                  </table></td>
             <td> <table border="" summary="Links">
                      <tr><th colspan="2"> Links </th> </tr>
                      <tr><td>(f)irst change</td> </tr>
                      <tr><td>(n)ext change</td> </tr>
                      <tr><td>(t)op</td> </tr>
                  </table></td> </tr>
    </table>"""
1601 
class HtmlDiff(object):
    """For producing HTML side by side comparison with change highlights.

    This class can be used to create an HTML table (or a complete HTML file
    containing the table) showing a side by side, line by line comparison
    of text with inter-line and intra-line change highlights.  The table can
    be generated in either full or contextual difference mode.

    The following methods are provided for HTML generation:

    make_table -- generates HTML for a single side by side table
    make_file -- generates complete HTML file with a single side by side table

    See tools/scripts/diff.py for an example usage of this class.
    """

    # Module-level templates are bound as class attributes so subclasses
    # (or individual instances) can override them.
    _file_template = _file_template
    _styles = _styles
    _table_template = _table_template
    _legend = _legend
    # Class-wide counter used to build unique anchor prefixes so several
    # generated tables can coexist on one page (see _make_prefix()).
    _default_prefix = 0

    def __init__(self,tabsize=8,wrapcolumn=None,linejunk=None,
                 charjunk=IS_CHARACTER_JUNK):
        """HtmlDiff instance initializer

        Arguments:
        tabsize -- tab stop spacing, defaults to 8.
        wrapcolumn -- column number where lines are broken and wrapped,
            defaults to None where lines are not wrapped.
        linejunk,charjunk -- keyword arguments passed into ndiff() (used by
            HtmlDiff() to generate the side by side HTML differences).  See
            ndiff() documentation for argument default values and descriptions.
        """
        self._tabsize = tabsize
        self._wrapcolumn = wrapcolumn
        self._linejunk = linejunk
        self._charjunk = charjunk

    def make_file(self,fromlines,tolines,fromdesc='',todesc='',context=False,
                  numlines=5):
        """Returns HTML file of side by side comparison with change highlights

        Arguments:
        fromlines -- list of "from" lines
        tolines -- list of "to" lines
        fromdesc -- "from" file column header string
        todesc -- "to" file column header string
        context -- set to True for contextual differences (defaults to False
            which shows full differences).
        numlines -- number of context lines.  When context is set True,
            controls number of lines displayed before and after the change.
            When context is False, controls the number of lines to place
            the "next" link anchors before the next change (so click of
            "next" link jumps to just before the change).
        """

        return self._file_template % dict(
            styles = self._styles,
            legend = self._legend,
            table = self.make_table(fromlines,tolines,fromdesc,todesc,
                                    context=context,numlines=numlines))

    def _tab_newline_replace(self,fromlines,tolines):
        """Returns from/to line lists with tabs expanded and newlines removed.

        Instead of tab characters being replaced by the number of spaces
        needed to fill in to the next tab stop, this function will fill
        the space with tab characters.  This is done so that the difference
        algorithms can identify changes in a file when tabs are replaced by
        spaces and vice versa.  At the end of the HTML generation, the tab
        characters will be replaced with a nonbreakable space.
        """
        def expand_tabs(line):
            # hide real spaces
            line = line.replace(' ','\0')
            # expand tabs into spaces
            line = line.expandtabs(self._tabsize)
            # replace spaces from expanded tabs back into tab characters
            # (we'll replace them with markup after we do differencing)
            line = line.replace(' ','\t')
            return line.replace('\0',' ').rstrip('\n')
        fromlines = [expand_tabs(line) for line in fromlines]
        tolines = [expand_tabs(line) for line in tolines]
        return fromlines,tolines

    def _split_line(self,data_list,line_num,text):
        """Builds list of text lines by splitting text lines at wrap point

        This function will determine if the input text line needs to be
        wrapped (split) into separate lines.  If so, the first wrap point
        will be determined and the first line appended to the output
        text line list.  This function is used recursively to handle
        the second part of the split line to further split it.
        """
        # if blank line or context separator, just add it to the output list
        if not line_num:
            data_list.append((line_num,text))
            return

        # if line text doesn't need wrapping, just add it to the output list
        size = len(text)
        # renamed from "max" so the builtin max() is not shadowed
        max_width = self._wrapcolumn
        if (size <= max_width) or ((size -(text.count('\0')*3)) <= max_width):
            data_list.append((line_num,text))
            return

        # scan text looking for the wrap point, keeping track if the wrap
        # point is inside markers ('\0' starts a markup region, followed by
        # the change character '+', '-' or '^'; '\1' ends it -- see the
        # replace() calls in make_table())
        i = 0
        n = 0
        mark = ''
        while n < max_width and i < size:
            if text[i] == '\0':
                i += 1
                mark = text[i]
                i += 1
            elif text[i] == '\1':
                i += 1
                mark = ''
            else:
                i += 1
                n += 1

        # wrap point is inside text, break it up into separate lines
        line1 = text[:i]
        line2 = text[i:]

        # if wrap point is inside markers, place end marker at end of first
        # line and start marker at beginning of second line because each
        # line will have its own table tag markup around it.
        if mark:
            line1 = line1 + '\1'
            line2 = '\0' + mark + line2

        # tack on first line onto the output list
        data_list.append((line_num,line1))

        # use this routine again to wrap the remaining text
        self._split_line(data_list,'>',line2)

    def _line_wrapper(self,diffs):
        """Returns iterator that splits (wraps) mdiff text lines"""

        # pull from/to data and flags from mdiff iterator
        for fromdata,todata,flag in diffs:
            # check for context separators and pass them through
            if flag is None:
                yield fromdata,todata,flag
                continue
            (fromline,fromtext),(toline,totext) = fromdata,todata
            # for each from/to line split it at the wrap column to form
            # list of text lines.
            fromlist,tolist = [],[]
            self._split_line(fromlist,fromline,fromtext)
            self._split_line(tolist,toline,totext)
            # yield from/to line in pairs inserting blank lines as
            # necessary when one side has more wrapped lines
            while fromlist or tolist:
                if fromlist:
                    fromdata = fromlist.pop(0)
                else:
                    fromdata = ('',' ')
                if tolist:
                    todata = tolist.pop(0)
                else:
                    todata = ('',' ')
                yield fromdata,todata,flag

    def _collect_lines(self,diffs):
        """Collects mdiff output into separate lists

        Before storing the mdiff from/to data into a list, it is converted
        into a single line of text with HTML markup.
        """

        fromlist,tolist,flaglist = [],[],[]
        # pull from/to data and flags from mdiff style iterator
        for fromdata,todata,flag in diffs:
            try:
                # store HTML markup of the lines into the lists
                fromlist.append(self._format_line(0,flag,*fromdata))
                tolist.append(self._format_line(1,flag,*todata))
            except TypeError:
                # exceptions occur for lines where context separators go
                # (fromdata/todata are None there, so the * unpack fails)
                fromlist.append(None)
                tolist.append(None)
            flaglist.append(flag)
        return fromlist,tolist,flaglist

    def _format_line(self,side,flag,linenum,text):
        """Returns HTML markup of "from" / "to" text lines

        side -- 0 or 1 indicating "from" or "to" text
        flag -- indicates if difference on line
        linenum -- line number (used for line number column)
        text -- line text to be marked up
        """
        try:
            linenum = '%d' % linenum
            # renamed from "id" so the builtin id() is not shadowed
            line_id = ' id="%s%s"' % (self._prefix[side],linenum)
        except TypeError:
            # handle blank lines where linenum is '>' or ''
            line_id = ''
        # replace those things that would get confused with HTML symbols
        text=text.replace("&","&amp;").replace(">","&gt;").replace("<","&lt;")

        # make space non-breakable so they don't get compressed or line wrapped
        text = text.replace(' ','&nbsp;').rstrip()

        return '<td class="diff_header"%s>%s</td><td nowrap="nowrap">%s</td>' \
               % (line_id,linenum,text)

    def _make_prefix(self):
        """Create unique anchor prefixes"""

        # Generate a unique anchor prefix so multiple tables
        # can exist on the same HTML page without conflicts.
        fromprefix = "from%d_" % HtmlDiff._default_prefix
        toprefix = "to%d_" % HtmlDiff._default_prefix
        HtmlDiff._default_prefix += 1
        # store prefixes so line format method has access
        self._prefix = [fromprefix,toprefix]

    def _convert_flags(self,fromlist,tolist,flaglist,context,numlines):
        """Makes list of "next" links"""

        # all anchor names will be generated using the unique "to" prefix
        toprefix = self._prefix[1]

        # process change flags, generating middle column of next anchors/links
        next_id = ['']*len(flaglist)
        next_href = ['']*len(flaglist)
        num_chg, in_change = 0, False
        last = 0
        for i,flag in enumerate(flaglist):
            if flag:
                if not in_change:
                    in_change = True
                    last = i
                    # at the beginning of a change, drop an anchor a few lines
                    # (the context lines) before the change for the previous
                    # link
                    i = max(0, i-numlines)
                    next_id[i] = ' id="difflib_chg_%s_%d"' % (toprefix,num_chg)
                    # at the beginning of a change, drop a link to the next
                    # change
                    num_chg += 1
                    next_href[last] = '<a href="#difflib_chg_%s_%d">n</a>' % (
                         toprefix,num_chg)
            else:
                in_change = False
        # check for cases where there is no content to avoid exceptions
        if not flaglist:
            flaglist = [False]
            next_id = ['']
            next_href = ['']
            last = 0
            if context:
                fromlist = ['<td></td><td>&nbsp;No Differences Found&nbsp;</td>']
                tolist = fromlist
            else:
                fromlist = tolist = ['<td></td><td>&nbsp;Empty File&nbsp;</td>']
        # if not a change on first line, drop a link
        if not flaglist[0]:
            next_href[0] = '<a href="#difflib_chg_%s_0">f</a>' % toprefix
        # redo the last link to link to the top
        next_href[last] = '<a href="#difflib_chg_%s_top">t</a>' % (toprefix)

        return fromlist,tolist,flaglist,next_href,next_id

    def make_table(self,fromlines,tolines,fromdesc='',todesc='',context=False,
                   numlines=5):
        """Returns HTML table of side by side comparison with change highlights

        Arguments:
        fromlines -- list of "from" lines
        tolines -- list of "to" lines
        fromdesc -- "from" file column header string
        todesc -- "to" file column header string
        context -- set to True for contextual differences (defaults to False
            which shows full differences).
        numlines -- number of context lines.  When context is set True,
            controls number of lines displayed before and after the change.
            When context is False, controls the number of lines to place
            the "next" link anchors before the next change (so click of
            "next" link jumps to just before the change).
        """

        # make unique anchor prefixes so that multiple tables may exist
        # on the same page without conflict.
        self._make_prefix()

        # change tabs to spaces before it gets more difficult after we insert
        # markup
        fromlines,tolines = self._tab_newline_replace(fromlines,tolines)

        # create diffs iterator which generates side by side from/to data
        if context:
            context_lines = numlines
        else:
            context_lines = None
        diffs = _mdiff(fromlines,tolines,context_lines,linejunk=self._linejunk,
                      charjunk=self._charjunk)

        # set up iterator to wrap lines that exceed desired width
        if self._wrapcolumn:
            diffs = self._line_wrapper(diffs)

        # collect up from/to lines and flags into lists (also format the lines)
        fromlist,tolist,flaglist = self._collect_lines(diffs)

        # process change flags, generating middle column of next anchors/links
        fromlist,tolist,flaglist,next_href,next_id = self._convert_flags(
            fromlist,tolist,flaglist,context,numlines)

        # cStringIO is Python 2's C-accelerated string buffer; imported
        # locally, as in the original, to keep the module namespace clean
        import cStringIO
        s = cStringIO.StringIO()
        fmt = '            <tr><td class="diff_next"%s>%s</td>%s' + \
              '<td class="diff_next">%s</td>%s</tr>\n'
        for i in range(len(flaglist)):
            if flaglist[i] is None:
                # mdiff yields None on separator lines skip the bogus ones
                # generated for the first line
                if i > 0:
                    s.write('        </tbody>        \n        <tbody>\n')
            else:
                s.write( fmt % (next_id[i],next_href[i],fromlist[i],
                                           next_href[i],tolist[i]))
        if fromdesc or todesc:
            header_row = '<thead><tr>%s%s%s%s</tr></thead>' % (
                '<th class="diff_next"><br /></th>',
                '<th colspan="2" class="diff_header">%s</th>' % fromdesc,
                '<th class="diff_next"><br /></th>',
                '<th colspan="2" class="diff_header">%s</th>' % todesc)
        else:
            header_row = ''

        table = self._table_template % dict(
            data_rows=s.getvalue(),
            header_row=header_row,
            prefix=self._prefix[1])

        # translate the intermediate markup ('\0'+change-char ... '\1') and
        # hidden tab characters into the final HTML span/nbsp markup
        return table.replace('\0+','<span class="diff_add">'). \
                     replace('\0-','<span class="diff_sub">'). \
                     replace('\0^','<span class="diff_chg">'). \
                     replace('\1','</span>'). \
                     replace('\t','&nbsp;')
1950 
# Drop "re" from the module namespace -- presumably it was imported
# earlier in the file (not visible in this chunk) only to build
# module-level helpers; verify against the top of the file.
del re
1952 
def restore(delta, which):
    r"""
    Generate one of the two sequences that generated a delta.

    Given a `delta` produced by `Differ.compare()` or `ndiff()`, extract
    lines originating from file 1 or 2 (parameter `which`), stripping off line
    prefixes.

    Examples:

    >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
    ...              'ore\ntree\nemu\n'.splitlines(1))
    >>> diff = list(diff)
    >>> print ''.join(restore(diff, 1)),
    one
    two
    three
    >>> print ''.join(restore(diff, 2)),
    ore
    tree
    emu
    """
    # Map the requested side to its delta prefix; int() may itself raise
    # ValueError for non-numeric input, which propagates unchanged.
    choice = int(which)
    if choice == 1:
        tag = "- "
    elif choice == 2:
        tag = "+ "
    else:
        raise ValueError('unknown delta choice (must be 1 or 2): %r'
                         % which)
    # Keep common lines plus the lines unique to the chosen side.
    wanted = ("  ", tag)
    for line in delta:
        if line[:2] in wanted:
            yield line[2:]
1984 
1985 def _test():
1986     import doctest, difflib
1987     return doctest.testmod(difflib)
1988 
# Running this module as a script executes its doctests.
if __name__ == "__main__":
    _test()
1991 

# Generated by PyXR 0.9.4
# SourceForge.net Logo