PyXR

c:\python24\lib\textwrap.py


0001 """Text wrapping and filling.
0002 """
0003 
0004 # Copyright (C) 1999-2001 Gregory P. Ward.
0005 # Copyright (C) 2002, 2003 Python Software Foundation.
0006 # Written by Greg Ward <gward@python.net>
0007 
0008 __revision__ = "$Id: textwrap.py,v 1.35 2004/06/03 01:59:40 gward Exp $"
0009 
0010 import string, re
0011 
0012 # Do the right thing with boolean values for all known Python versions
0013 # (so this module can be copied to projects that don't depend on Python
0014 # 2.3, e.g. Optik and Docutils).
0015 try:
0016     True, False
0017 except NameError:
0018     (True, False) = (1, 0)
0019 
0020 __all__ = ['TextWrapper', 'wrap', 'fill']
0021 
0022 # Hardcode the recognized whitespace characters to the US-ASCII
0023 # whitespace characters.  The main reason for doing this is that in
0024 # ISO-8859-1, 0xa0 is non-breaking whitespace, so in certain locales
0025 # that character winds up in string.whitespace.  Respecting
0026 # string.whitespace in those cases would 1) make textwrap treat 0xa0 the
0027 # same as any other whitespace char, which is clearly wrong (it's a
0028 # *non-breaking* space), 2) possibly cause problems with Unicode,
0029 # since 0xa0 is not in range(128).
0030 _whitespace = '\t\n\x0b\x0c\r '
0031 
class TextWrapper:
    """
    Object for wrapping/filling text.  The public interface consists of
    the wrap() and fill() methods; the other methods are just there for
    subclasses to override in order to tweak the default behaviour.
    If you want to completely replace the main wrapping algorithm,
    you'll probably have to override _wrap_chunks().

    Several instance attributes control various aspects of wrapping:
      width (default: 70)
        the maximum width of wrapped lines (unless break_long_words
        is false)
      initial_indent (default: "")
        string that will be prepended to the first line of wrapped
        output.  Counts towards the line's width.
      subsequent_indent (default: "")
        string that will be prepended to all lines save the first
        of wrapped output; also counts towards each line's width.
      expand_tabs (default: true)
        Expand tabs in input text to spaces before further processing.
        Each tab will become 1 .. 8 spaces, depending on its position in
        its line.  If false, each tab is treated as a single character.
      replace_whitespace (default: true)
        Replace all whitespace characters in the input text by spaces
        after tab expansion.  Note that if expand_tabs is false and
        replace_whitespace is true, every tab will be converted to a
        single space!
      fix_sentence_endings (default: false)
        Ensure that sentence-ending punctuation is always followed
        by two spaces.  Off by default because the algorithm is
        (unavoidably) imperfect.
      break_long_words (default: true)
        Break words longer than 'width'.  If false, those words will not
        be broken, and some lines might be longer than 'width'.
    """

    # Translation table for 8-bit strings: maps every character in
    # _whitespace to a plain space.
    whitespace_trans = string.maketrans(_whitespace, ' ' * len(_whitespace))

    # Same mapping for unicode strings, keyed by code point as required
    # by unicode.translate().
    unicode_whitespace_trans = {}
    uspace = ord(u' ')
    for x in map(ord, _whitespace):
        unicode_whitespace_trans[x] = uspace

    # This funky little regex is just the trick for splitting
    # text up into word-wrappable chunks.  E.g.
    #   "Hello there -- you goof-ball, use the -b option!"
    # splits into
    #   Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!
    # (after stripping out empty strings).
    wordsep_re = re.compile(r'(\s+|'                  # any whitespace
                            r'[^\s\w]*\w{2,}-(?=\w{2,})|' # hyphenated words
                            r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))')   # em-dash

    # XXX this is not locale- or charset-aware -- string.lowercase
    # is US-ASCII only (and therefore English-only)
    #
    # The pattern is anchored to the end of the chunk: it is applied
    # with search(), and without the anchor a chunk such as
    # "www.python.org" would be mistaken for a sentence ending because
    # of the "w." in its middle.
    sentence_end_re = re.compile(r'[%s]'              # lowercase letter
                                 r'[\.\!\?]'          # sentence-ending punct.
                                 r'[\"\']?'           # optional end-of-quote
                                 r'\Z'                # end of chunk
                                 % string.lowercase)


    def __init__(self,
                 width=70,
                 initial_indent="",
                 subsequent_indent="",
                 expand_tabs=True,
                 replace_whitespace=True,
                 fix_sentence_endings=False,
                 break_long_words=True):
        """Store the wrapping options as same-named instance attributes.

        See the class docstring for the meaning of each option.
        """
        self.width = width
        self.initial_indent = initial_indent
        self.subsequent_indent = subsequent_indent
        self.expand_tabs = expand_tabs
        self.replace_whitespace = replace_whitespace
        self.fix_sentence_endings = fix_sentence_endings
        self.break_long_words = break_long_words


    # -- Private methods -----------------------------------------------
    # (possibly useful for subclasses to override)

    def _munge_whitespace(self, text):
        """_munge_whitespace(text : string) -> string

        Munge whitespace in text: expand tabs and convert all other
        whitespace characters to spaces.  Eg. " foo\\tbar\\n\\nbaz"
        becomes " foo    bar  baz".

        Each step is only performed when the corresponding option
        (expand_tabs, replace_whitespace) is true.
        """
        if self.expand_tabs:
            text = text.expandtabs()
        if self.replace_whitespace:
            # str and unicode need different kinds of translation table.
            if isinstance(text, str):
                text = text.translate(self.whitespace_trans)
            elif isinstance(text, unicode):
                text = text.translate(self.unicode_whitespace_trans)
        return text


    def _split(self, text):
        """_split(text : string) -> [string]

        Split the text to wrap into indivisible chunks.  Chunks are
        not quite the same as words; see _wrap_chunks() for full
        details.  As an example, the text
          Look, goof-ball -- use the -b option!
        breaks into the following chunks:
          'Look,', ' ', 'goof-', 'ball', ' ', '--', ' ',
          'use', ' ', 'the', ' ', '-b', ' ', 'option!'
        """
        # re.split() with a capturing group yields empty strings between
        # adjacent separators; drop them.
        return [chunk for chunk in self.wordsep_re.split(text) if chunk]

    def _fix_sentence_endings(self, chunks):
        """_fix_sentence_endings(chunks : [string])

        Correct for sentence endings buried in 'chunks'.  Eg. when the
        original text contains "... foo.\\nBar ...", _munge_whitespace()
        and _split() will convert that to [..., "foo.", " ", "Bar", ...]
        which has one too few spaces; this method simply changes the one
        space to two.

        'chunks' is modified in place; nothing is returned.
        """
        i = 0
        pat = self.sentence_end_re
        while i < len(chunks)-1:
            if chunks[i+1] == " " and pat.search(chunks[i]):
                chunks[i+1] = "  "
                # Skip the space chunk we just widened.
                i += 2
            else:
                i += 1

    def _handle_long_word(self, chunks, cur_line, cur_len, width):
        """_handle_long_word(chunks : [string],
                             cur_line : [string],
                             cur_len : int, width : int)

        Handle a chunk of text (most likely a word, not whitespace) that
        is too long to fit in any line.

        'chunks' and 'cur_line' are modified in place; nothing is
        returned.
        """
        # Always make room for at least one character so that every call
        # consumes some input, even if 'width' is not positive.
        space_left = max(width - cur_len, 1)

        # If we're allowed to break long words, then do so: put as much
        # of the next chunk onto the current line as will fit.
        if self.break_long_words:
            cur_line.append(chunks[0][0:space_left])
            chunks[0] = chunks[0][space_left:]

        # Otherwise, we have to preserve the long word intact.  Only add
        # it to the current line if there's nothing already there --
        # that minimizes how much we violate the width constraint.
        elif not cur_line:
            cur_line.append(chunks.pop(0))

        # If we're not allowed to break long words, and there's already
        # text on the current line, do nothing.  Next time through the
        # main loop of _wrap_chunks(), we'll wind up here again, but
        # cur_len will be zero, so the next line will be entirely
        # devoted to the long word that we can't handle right now.

    def _wrap_chunks(self, chunks):
        """_wrap_chunks(chunks : [string]) -> [string]

        Wrap a sequence of text chunks and return a list of lines of
        length 'self.width' or less.  (If 'break_long_words' is false,
        some lines may be longer than this.)  Chunks correspond roughly
        to words and the whitespace between them: each chunk is
        indivisible (modulo 'break_long_words'), but a line break can
        come between any two chunks.  Chunks should not have internal
        whitespace; ie. a chunk is either all whitespace or a "word".
        Whitespace chunks will be removed from the beginning and end of
        lines, but apart from that whitespace is preserved.

        The 'chunks' list is consumed (emptied) in the process.  Raises
        ValueError if 'self.width' is not greater than zero.
        """
        lines = []
        if self.width <= 0:
            raise ValueError("invalid width %r (must be > 0)" % self.width)

        while chunks:

            # Start the list of chunks that will make up the current line.
            # cur_len is just the length of all the chunks in cur_line.
            cur_line = []
            cur_len = 0

            # Figure out which static string will prefix this line.
            if lines:
                indent = self.subsequent_indent
            else:
                indent = self.initial_indent

            # Maximum width for this line.
            width = self.width - len(indent)

            # First chunk on line is whitespace -- drop it, unless this
            # is the very beginning of the text (ie. no lines started yet).
            if chunks[0].strip() == '' and lines:
                del chunks[0]

            while chunks:
                l = len(chunks[0])

                # Can at least squeeze this chunk onto the current line.
                if cur_len + l <= width:
                    cur_line.append(chunks.pop(0))
                    cur_len += l

                # Nope, this line is full.
                else:
                    break

            # The current line is full, and the next chunk is too big to
            # fit on *any* line (not just this one).
            if chunks and len(chunks[0]) > width:
                self._handle_long_word(chunks, cur_line, cur_len, width)

            # If the last chunk on this line is all whitespace, drop it.
            if cur_line and cur_line[-1].strip() == '':
                del cur_line[-1]

            # Convert current line back to a string and store it in list
            # of all lines (return value).
            if cur_line:
                lines.append(indent + ''.join(cur_line))

        return lines


    # -- Public interface ----------------------------------------------

    def wrap(self, text):
        """wrap(text : string) -> [string]

        Reformat the single paragraph in 'text' so it fits in lines of
        no more than 'self.width' columns, and return a list of wrapped
        lines.  Tabs in 'text' are expanded with string.expandtabs(),
        and all other whitespace characters (including newline) are
        converted to space.
        """
        text = self._munge_whitespace(text)
        chunks = self._split(text)
        if self.fix_sentence_endings:
            self._fix_sentence_endings(chunks)
        return self._wrap_chunks(chunks)

    def fill(self, text):
        """fill(text : string) -> string

        Reformat the single paragraph in 'text' to fit in lines of no
        more than 'self.width' columns, and return a new string
        containing the entire wrapped paragraph.
        """
        return "\n".join(self.wrap(text))
0284 
0285 
0286 # -- Convenience interface ---------------------------------------------
0287 
def wrap(text, width=70, **kwargs):
    """Wrap one paragraph and return the resulting lines as a list.

    'text' is treated as a single paragraph and reflowed so that no
    line is wider than 'width' columns.  With the default options, tabs
    are expanded and every other whitespace character (newlines
    included) becomes a space.  Any extra keyword arguments are handed
    to the TextWrapper constructor; see that class for the options
    that customize wrapping behaviour.
    """
    return TextWrapper(width=width, **kwargs).wrap(text)
0300 
def fill(text, width=70, **kwargs):
    """Fill one paragraph and return it as a single new string.

    'text' is treated as a single paragraph and reflowed so that no
    line is wider than 'width' columns; the wrapped lines are joined
    with newlines.  As with wrap(), tabs are expanded and other
    whitespace characters are converted to spaces by default.  Any
    extra keyword arguments are handed to the TextWrapper constructor;
    see that class for the options that customize wrapping behaviour.
    """
    return TextWrapper(width=width, **kwargs).fill(text)
0312 
0313 
0314 # -- Loosely related functionality -------------------------------------
0315 
def dedent(text):
    r"""dedent(text : string) -> string

    Remove any whitespace that can be uniformly removed from the left
    of every line in `text`.

    Tabs are expanded first.  Lines that are empty or consist solely of
    whitespace do not influence how much is removed, but they are
    shortened by the same amount as every other line.

    This can be used e.g. to make triple-quoted strings line up with
    the left edge of screen/whatever, while still presenting it in the
    source code in indented form.

    For example:

        def test():
            # end first line with \ to avoid the empty line!
            s = '''\
            hello
              world
            '''
            print repr(s)          # prints '    hello\n      world\n    '
            print repr(dedent(s))  # prints 'hello\n  world\n'
    """
    rows = text.expandtabs().split('\n')

    # Width of the leading whitespace of every line with real content;
    # blank and whitespace-only lines never constrain the margin.
    widths = [len(row) - len(row.lstrip()) for row in rows if row.lstrip()]

    if widths:
        margin = min(widths)
        if margin > 0:
            rows = [row[margin:] for row in rows]

    return '\n'.join(rows)
0354
Generated by PyXR 0.9.4