# Copyright (C) 2004 Python Software Foundation
# Authors: Baxter, Wouters and Warsaw
# Contact: email-sig@python.org

"""FeedParser - An email feed parser.

The feed parser implements an interface for incrementally parsing an email
message, line by line.  This has advantages for certain applications, such as
those reading email messages off a socket.

FeedParser.feed() is the primary interface for pushing new data into the
parser.  It returns when there's nothing more it can do with the available
data.  When you have no more data to push into the parser, call .close().
This completes the parsing and returns the root message object.

The other advantage of this parser is that it will never throw a parsing
exception.  Instead, when it finds something unexpected, it adds a 'defect' to
the current message.  Defects are just instances that live on the message
object's .defects attribute.
"""

import re
from email import Errors
from email import Message

NLCRE = re.compile('\r\n|\r|\n')
NLCRE_bol = re.compile('(\r\n|\r|\n)')
NLCRE_eol = re.compile('(\r\n|\r|\n)$')
NLCRE_crack = re.compile('(\r\n|\r|\n)')
headerRE = re.compile(r'^(From |[-\w]{2,}:|[\t ])')
EMPTYSTRING = ''
NL = '\n'

# Sentinel returned by BufferedSubFile.readline() (and yielded by the parser
# generator) when no complete line is buffered and the input is still open.
NeedMoreData = object()



class BufferedSubFile(object):
    """A file-ish object that can have new data loaded into it.

    You can also push and pop line-matching predicates onto a stack.  When the
    current predicate matches the current line, a false EOF response
    (i.e. empty string) is returned instead.  This lets the parser adhere to a
    simple abstraction -- it parses until EOF closes the current message.
    """
    def __init__(self):
        # The last partial line pushed into this object.
        self._partial = ''
        # The list of full, pushed lines, in reverse order
        self._lines = []
        # The stack of false-EOF checking predicates.
        self._eofstack = []
        # A flag indicating whether the file has been closed or not.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Push a predicate; a line for which it is true reads as EOF."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the most recently pushed false-EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Mark the input as complete, queueing any trailing partial line."""
        # Don't forget any trailing partial line.  self._lines is kept in
        # reverse order (readline() pops from the end), so the partial line,
        # being the newest data, has to go in at the front via pushlines() in
        # order to be read last.
        if self._partial:
            self.pushlines([self._partial])
            self._partial = ''
        self._closed = True

    def readline(self):
        """Return the next line, '' at (possibly false) EOF, or NeedMoreData.

        NeedMoreData is returned when nothing is buffered and the object has
        not been closed yet.
        """
        if not self._lines:
            if self._closed:
                return ''
            return NeedMoreData
        # Pop the line off the stack and see if it matches the current
        # false-EOF predicate.
        line = self._lines.pop()
        # RFC 2046, section 5.1.2 requires us to recognize outer level
        # boundaries at any level of inner nesting.  Do this, but be sure it's
        # in the order of most to least nested.
        for ateof in self._eofstack[::-1]:
            if ateof(line):
                # We're at the false EOF.  But push the last line back first.
                self._lines.append(line)
                return ''
        return line

    def unreadline(self, line):
        """Push `line` back so that it is the next line readline() sees."""
        # Let the consumer push a line back into the buffer.
        self._lines.append(line)

    def push(self, data):
        """Push some new data into this object."""
        # Handle any previous leftovers
        data, self._partial = self._partial + data, ''
        # Crack into lines, but preserve the newlines on the end of each
        parts = NLCRE_crack.split(data)
        # The *ahem* interesting behaviour of re.split when supplied grouping
        # parentheses is that the last element of the resulting list is the
        # data after the final RE.  In the case of a NL/CR terminated string,
        # this is the empty string.
        self._partial = parts.pop()
        # If the data ends in a bare CR we cannot yet tell whether it is a
        # complete line ending or the first half of a CRLF whose LF arrives
        # with the next push.  Hold that last line back as the partial line so
        # a CRLF straddling two pushes is not seen as two line endings.
        if not self._partial and parts and parts[-1].endswith('\r'):
            self._partial = parts.pop(-2) + parts.pop()
        # parts is a list of strings, alternating between the line contents
        # and the eol character(s).  Gather up a list of lines after
        # re-attaching the newlines.
        lines = [text + eol for text, eol in zip(parts[::2], parts[1::2])]
        self.pushlines(lines)

    def pushlines(self, lines):
        """Queue complete `lines` (oldest first) behind those already held."""
        # Reverse and insert at the front of the lines.
        self._lines[:0] = lines[::-1]

    def is_closed(self):
        """Return True once close() has been called."""
        return self._closed

    def __iter__(self):
        return self

    def next(self):
        """Iterator protocol: stop at EOF; NeedMoreData is passed through."""
        line = self.readline()
        if line == '':
            raise StopIteration
        return line



class FeedParser:
    """A feed-style parser of email."""

    def __init__(self, _factory=Message.Message):
        """_factory is called with no arguments to create a new message obj"""
        self._factory = _factory
        self._input = BufferedSubFile()
        self._msgstack = []
        # Each call to self._parse() resumes the parsing generator until it
        # next runs out of buffered input.
        self._parse = self._parsegen().next
        self._cur = None
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        # Resume the generator; StopIteration just means parsing has finished.
        try:
            self._parse()
        except StopIteration:
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        # Look for final set of defects
        if root.get_content_maintype() == 'multipart' \
               and not root.is_multipart():
            root.defects.append(Errors.MultipartInvariantViolationDefect())
        return root

    def _new_message(self):
        # Create a message with the factory, attach it to the message on top
        # of the stack (if any) and make it the current message.
        msg = self._factory()
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        # Pop and return the top of the message stack; the message beneath it
        # (or None) becomes the current message again.
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        # Generator that parses one message (recursing for subparts).  It
        # yields NeedMoreData whenever the input buffer runs dry.
        #
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away.  Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could throw errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages.  A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.  Either read may find the
                # buffer empty, in which case we must wait for more data
                # rather than treat the NeedMoreData sentinel as a line.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                self._cur.defects.append(Errors.NoBoundaryInMultipartDefect())
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.  The line separator is optional so that a boundary on
            # an unterminated final line is still recognized.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            linesep = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Consume any
                    # multiple boundary lines that may be following.  Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last.get_payload()
                        if isinstance(payload, basestring):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last.set_payload(payload)
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            # Everything from here to the EOF is epilogue.
            if capturing_preamble:
                self._cur.defects.append(Errors.StartBoundaryNotFoundDefect())
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                # Drain whatever input is left.  The lines are not collected,
                # so the epilogue is always set to the empty string here.
                epilogue = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If the end boundary ended in a newline, we'll need to make sure
            # the epilogue isn't None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        # Passed a list of lines that make up the headers for the current msg
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation.  This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = Errors.FirstHeaderLineIsContinuationDefect(line)
                    self._cur.defects.append(defect)
                    continue
                lastvalue.append(line)
                continue
            if lastheader:
                # XXX reconsider the joining of folded lines
                lhdr = EMPTYSTRING.join(lastvalue)[:-1].rstrip('\r\n')
                self._cur[lastheader] = lhdr
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Note this as a defect
                    # and ignore it.
                    defect = Errors.MisplacedEnvelopeHeaderDefect(line)
                    self._cur.defects.append(defect)
                    continue
            # Split the line on the colon separating field name from value.
            i = line.find(':')
            if i < 0:
                defect = Errors.MalformedHeaderDefect(line)
                self._cur.defects.append(defect)
                continue
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Done with all the lines, so handle the last header.
        if lastheader:
            # XXX reconsider the joining of folded lines
            self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip('\r\n')

# Generated by PyXR 0.9.4