PyXR

c:\python24\lib\email\FeedParser.py


0001 # Copyright (C) 2004 Python Software Foundation
0002 # Authors: Baxter, Wouters and Warsaw
0003 # Contact: email-sig@python.org
0004 
0005 """FeedParser - An email feed parser.
0006 
0007 The feed parser implements an interface for incrementally parsing an email
0008 message, line by line.  This has advantages for certain applications, such as
0009 those reading email messages off a socket.
0010 
0011 FeedParser.feed() is the primary interface for pushing new data into the
0012 parser.  It returns when there's nothing more it can do with the available
0013 data.  When you have no more data to push into the parser, call .close().
0014 This completes the parsing and returns the root message object.
0015 
0016 The other advantage of this parser is that it will never throw a parsing
0017 exception.  Instead, when it finds something unexpected, it adds a 'defect' to
0018 the current message.  Defects are just instances that live on the message
0019 object's .defects attribute.
0020 """
0021 
0022 import re
0023 from email import Errors
0024 from email import Message
0025 
# Source text shared by every newline regex below.  CRLF is listed first so
# that '\r\n' is consumed as one terminator rather than as '\r' then '\n'.
_EOL = '\r\n|\r|\n'
_EOL_GROUP = '(' + _EOL + ')'

# A line terminator (no capturing group).
NLCRE = re.compile(_EOL)
# A captured line terminator; used with .match() to test the start of a line.
NLCRE_bol = re.compile(_EOL_GROUP)
# A captured line terminator anchored at the end of the string.
NLCRE_eol = re.compile(_EOL_GROUP + '$')
# A captured line terminator; the group makes re.split() keep the terminators.
NLCRE_crack = re.compile(_EOL_GROUP)
# Lines that belong to the header block: a "From " envelope line, a field
# name (two or more characters) followed by a colon, or a line starting with
# whitespace (continuation).
headerRE = re.compile(r'^(From |[-\w]{2,}:|[\t ])')
EMPTYSTRING = ''
NL = '\n'

# Unique sentinel meaning "no complete line is available yet".
NeedMoreData = object()
0035 
0036 
0037 
class BufferedSubFile(object):
    """A file-ish object that can have new data loaded into it.

    Data is pushed in arbitrary chunks with push() and read back one line at
    a time with readline().  When no complete line is buffered and the object
    has not been closed, readline() returns the NeedMoreData sentinel.

    You can also push and pop line-matching predicates onto a stack.  When
    any predicate on the stack matches the current line, a false EOF response
    (i.e. empty string) is returned instead and the line stays buffered.
    This lets the parser adhere to a simple abstraction -- it parses until
    EOF closes the current message.
    """
    def __init__(self):
        # The last partial line pushed into this object.
        self._partial = ''
        # The list of full, pushed lines, in reverse order: the next line to
        # be read is at the END of the list, the newest at index 0.
        self._lines = []
        # The stack of false-EOF checking predicates.
        self._eofstack = []
        # A flag indicating whether the file has been closed or not.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Push `pred`, a callable taking a line, onto the false-EOF stack."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the most recently pushed false-EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Mark the object closed, flushing any trailing partial line."""
        # Don't forget any trailing partial line.  It is the newest data, so
        # it must be read AFTER every line already buffered; since _lines is
        # kept in reverse order that means inserting at the front.  An empty
        # partial is not queued at all: it would otherwise be returned as a
        # premature EOF ahead of lines that are still waiting to be read.
        if self._partial:
            self._lines.insert(0, self._partial)
            self._partial = ''
        self._closed = True

    def readline(self):
        """Return the next line, '' at (false) EOF, or NeedMoreData."""
        if not self._lines:
            if self._closed:
                return ''
            return NeedMoreData
        # Pop the line off the stack and see if it matches the current
        # false-EOF predicate.
        line = self._lines.pop()
        # RFC 2046, section 5.1.2 requires us to recognize outer level
        # boundaries at any level of inner nesting.  Do this, but be sure it's
        # in the order of most to least nested.
        for ateof in self._eofstack[::-1]:
            if ateof(line):
                # We're at the false EOF.  But push the last line back first.
                self._lines.append(line)
                return ''
        return line

    def unreadline(self, line):
        """Push `line` back so that it is the next one readline() sees."""
        # Let the consumer push a line back into the buffer.
        self._lines.append(line)

    def push(self, data):
        """Push some new data into this object."""
        # Handle any previous leftovers
        data, self._partial = self._partial + data, ''
        # Crack into lines, but preserve the newlines on the end of each
        parts = NLCRE_crack.split(data)
        # The *ahem* interesting behaviour of re.split when supplied grouping
        # parentheses is that the last element of the resulting list is the
        # data after the final RE.  In the case of a NL/CR terminated string,
        # this is the empty string.
        self._partial = parts.pop()
        # A chunk that ends in a bare '\r' may be the first half of a '\r\n'
        # whose '\n' arrives with the next push().  Hold that line back as the
        # partial so the pair is not reported as two separate line endings.
        if not self._partial and parts and parts[-1] == '\r':
            eol = parts.pop()
            self._partial = parts.pop() + eol
        # parts is a list of strings, alternating between the line contents
        # and the eol character(s).  Gather up a list of lines after
        # re-attaching the newlines.
        lines = [parts[i] + parts[i + 1] for i in range(0, len(parts), 2)]
        self.pushlines(lines)

    def pushlines(self, lines):
        """Queue complete `lines` behind everything already buffered."""
        # Reverse and insert at the front of the lines.
        self._lines[:0] = lines[::-1]

    def is_closed(self):
        """Return True once close() has been called."""
        return self._closed

    def __iter__(self):
        return self

    def next(self):
        """Iterator step: stop at (false) EOF, may yield NeedMoreData."""
        line = self.readline()
        if line == '':
            raise StopIteration
        return line
0123 
0124 
0125 
class FeedParser:
    """A feed-style parser of email.

    Push data with feed() as it becomes available, then call close() to
    finish parsing and obtain the root message object.  Problems found in the
    input are recorded as defect instances appended to the affected message's
    .defects list.
    """

    def __init__(self, _factory=Message.Message):
        """_factory is called with no arguments to create a new message obj"""
        self._factory = _factory
        self._input = BufferedSubFile()
        # Stack of messages currently being parsed; the root is at index 0.
        self._msgstack = []
        # Bound .next of the parsing generator; each call resumes parsing
        # until the generator needs more data or finishes.
        self._parse = self._parsegen().next
        # The message on top of the stack, and the most recently created or
        # completed message (used for trailing-newline cleanup).
        self._cur = None
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        """Resume the parsing generator, ignoring its normal completion."""
        try:
            self._parse()
        except StopIteration:
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        # Look for final set of defects
        if root.get_content_maintype() == 'multipart' \
               and not root.is_multipart():
            root.defects.append(Errors.MultipartInvariantViolationDefect())
        return root

    def _new_message(self):
        """Create a message, attach it to its parent and make it current."""
        msg = self._factory()
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        """Pop and return the current message; its parent becomes current."""
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        """Generator that parses one message (recursing for subparts).

        Yields NeedMoreData whenever the input buffer has no complete line
        available; the caller resumes it after more data has been pushed or
        the input has been closed.
        """
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away. Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could throw errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages.  A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                msg = self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.  Either read may find the
                # buffer empty when data is fed incrementally, so wait for
                # more data rather than treating the NeedMoreData sentinel as
                # a line (pushing it back would later hand it to the
                # NLCRE.match EOF predicate, which only accepts strings).
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                self._cur.defects.append(Errors.NoBoundaryInMultipartDefect())
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)$')
            capturing_preamble = True
            preamble = []
            linesep = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        # Re-read the boundary line on the next iteration,
                        # now in "between parts" mode.
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Consume any
                    # multiple boundary lines that may be following.  Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last.get_payload()
                        if isinstance(payload, basestring):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last.set_payload(payload)
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            # Everything from here to the EOF is epilogue.
            if capturing_preamble:
                self._cur.defects.append(Errors.StartBoundaryNotFoundDefect())
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                epilogue = []
                # NOTE(review): the remaining lines are consumed but never
                # appended, so the epilogue set below is always the empty
                # string -- confirm whether discarding them is intentional.
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If the end boundary ended in a newline, we'll need to make sure
            # the epilogue isn't None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        """Store the header `lines` on the current message.

        Continuation lines are joined to the preceding field.  Irregular
        lines are recorded as defects on the current message and otherwise
        ignored.
        """
        # Passed a list of lines that make up the headers for the current msg
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation.  This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = Errors.FirstHeaderLineIsContinuationDefect(line)
                    self._cur.defects.append(defect)
                    continue
                lastvalue.append(line)
                continue
            # A new field (or envelope line) starts here, so flush the field
            # collected so far.
            if lastheader:
                # XXX reconsider the joining of folded lines
                lhdr = EMPTYSTRING.join(lastvalue)[:-1].rstrip('\r\n')
                self._cur[lastheader] = lhdr
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Note this as a defect
                    # and ignore it.
                    defect = Errors.MisplacedEnvelopeHeaderDefect(line)
                    self._cur.defects.append(defect)
                    continue
            # Split the line on the colon separating field name from value.
            i = line.find(':')
            if i < 0:
                defect = Errors.MalformedHeaderDefect(line)
                self._cur.defects.append(defect)
                continue
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Done with all the lines, so handle the last header.
        if lastheader:
            # XXX reconsider the joining of folded lines
            self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip('\r\n')
0464
Generated by PyXR 0.9.4