"""
csv.py - read/write/investigate CSV files
"""

import re
from _csv import Error, __version__, writer, reader, register_dialect, \
                 unregister_dialect, get_dialect, list_dialects, \
                 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
                 __doc__

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
            "Error", "Dialect", "excel", "excel_tab", "reader", "writer",
            "register_dialect", "get_dialect", "list_dialects", "Sniffer",
            "unregister_dialect", "__version__", "DictReader", "DictWriter" ]

class Dialect:
    _name = ""
    _valid = False
    # placeholders
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        if self.__class__ != Dialect:
            self._valid = True
        errors = self._validate()
        if errors != []:
            raise Error, "Dialect did not validate: %s" % ", ".join(errors)

    def _validate(self):
        errors = []
        if not self._valid:
            errors.append("can't directly instantiate Dialect class")

        if self.delimiter is None:
            errors.append("delimiter character not set")
        elif (not isinstance(self.delimiter, str) or
              len(self.delimiter) > 1):
            errors.append("delimiter must be one-character string")

        if self.quotechar is None:
            if self.quoting != QUOTE_NONE:
                errors.append("quotechar not set")
        elif (not isinstance(self.quotechar, str) or
              len(self.quotechar) > 1):
            errors.append("quotechar must be one-character string")

        if self.lineterminator is None:
            errors.append("lineterminator not set")
        elif not isinstance(self.lineterminator, str):
            errors.append("lineterminator must be a string")

        if self.doublequote not in (True, False):
            errors.append("doublequote parameter must be True or False")

        if self.skipinitialspace not in (True, False):
            errors.append("skipinitialspace parameter must be True or False")

        if self.quoting is None:
            errors.append("quoting parameter not set")

        if self.quoting is QUOTE_NONE:
            if (not isinstance(self.escapechar, (unicode, str)) or
                len(self.escapechar) > 1):
                errors.append("escapechar must be a one-character string or unicode object")

        return errors

class excel(Dialect):
    delimiter = ','
    quotechar = '"'
    doublequote = True
    skipinitialspace = False
    lineterminator = '\r\n'
    quoting = QUOTE_MINIMAL
register_dialect("excel", excel)

class excel_tab(excel):
    delimiter = '\t'
register_dialect("excel-tab", excel_tab)
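
# Usage sketch (illustrative; the dialect name below is arbitrary): additional
# dialects are defined the same way excel_tab is defined above, by subclassing
# an existing dialect, overriding the attributes that differ, and registering
# the result under a name:
#
#     class semicolon(excel):
#         delimiter = ';'
#     register_dialect("semicolon", semicolon)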

class DictReader:
    def __init__(self, f, fieldnames=None, restkey=None, restval=None,
                 dialect="excel", *args, **kwds):
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restkey = restkey          # key to catch long rows
        self.restval = restval          # default value for short rows
        self.reader = reader(f, dialect, *args, **kwds)

    def __iter__(self):
        return self

    def next(self):
        row = self.reader.next()
        if self.fieldnames is None:
            self.fieldnames = row
            row = self.reader.next()

        # unlike the basic reader, we prefer not to return blanks,
        # because we will typically wind up with a dict full of None
        # values
        while row == []:
            row = self.reader.next()
        d = dict(zip(self.fieldnames, row))
        lf = len(self.fieldnames)
        lr = len(row)
        if lf < lr:
            d[self.restkey] = row[lf:]
        elif lf > lr:
            for key in self.fieldnames[lr:]:
                d[key] = self.restval
        return d
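
# Illustrative note (example values are arbitrary): with fieldnames
# ["a", "b"], DictReader maps a long row ["1", "2", "3"] to
# {"a": "1", "b": "2", None: ["3"]} (the extra fields land under restkey,
# which defaults to None), and a short row ["1"] to {"a": "1", "b": None}
# (missing fields get restval, also None by default).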

class DictWriter:
    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args, **kwds):
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restval = restval          # for writing short dicts
        if extrasaction.lower() not in ("raise", "ignore"):
            raise ValueError, \
                  ("extrasaction (%s) must be 'raise' or 'ignore'" %
                   extrasaction)
        self.extrasaction = extrasaction
        self.writer = writer(f, dialect, *args, **kwds)

    def _dict_to_list(self, rowdict):
        if self.extrasaction == "raise":
            for k in rowdict.keys():
                if k not in self.fieldnames:
                    raise ValueError, "dict contains fields not in fieldnames"
        return [rowdict.get(key, self.restval) for key in self.fieldnames]

    def writerow(self, rowdict):
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        rows = []
        for rowdict in rowdicts:
            rows.append(self._dict_to_list(rowdict))
        return self.writer.writerows(rows)

# Guard Sniffer's type checking against builds that exclude complex()
try:
    complex
except NameError:
    complex = float

class Sniffer:
    '''
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
    Returns a Dialect object.
    '''
    def __init__(self):
        # in case there is more than one possible delimiter
        self.preferred = [',', '\t', ';', ' ', ':']


    def sniff(self, sample, delimiters=None):
        """
        Returns a dialect (or None) corresponding to the sample
        """

        quotechar, delimiter, skipinitialspace = \
                   self._guess_quote_and_delimiter(sample, delimiters)
        if delimiter is None:
            delimiter, skipinitialspace = self._guess_delimiter(sample,
                                                                delimiters)

        class dialect(Dialect):
            _name = "sniffed"
            lineterminator = '\r\n'
            quoting = QUOTE_MINIMAL
            # escapechar = ''
            doublequote = False

        dialect.delimiter = delimiter
        # _csv.reader won't accept a quotechar of ''
        dialect.quotechar = quotechar or '"'
        dialect.skipinitialspace = skipinitialspace

        return dialect


    def _guess_quote_and_delimiter(self, data, delimiters):
        """
        Looks for text enclosed between two identical quotes
        (the probable quotechar) which are preceded and followed
        by the same character (the probable delimiter).
        For example:
                         ,'some text',
        The quote character with the most matches wins, and likewise for
        the delimiter.  If there is no quotechar the delimiter can't be
        determined this way.
        """

        matches = []
        # the last pattern defines only the 'quote' group; the missing
        # 'delim' and 'space' groups are why the group lookups below are
        # wrapped in try/except KeyError
        for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
                      '(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',   # ,".*?"
                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
            matches = regexp.findall(data)
            if matches:
                break

        if not matches:
            return ('', None, 0) # (quotechar, delimiter, skipinitialspace)

        quotes = {}
        delims = {}
        spaces = 0
        for m in matches:
            n = regexp.groupindex['quote'] - 1
            key = m[n]
            if key:
                quotes[key] = quotes.get(key, 0) + 1
            try:
                n = regexp.groupindex['delim'] - 1
                key = m[n]
            except KeyError:
                continue
            if key and (delimiters is None or key in delimiters):
                delims[key] = delims.get(key, 0) + 1
            try:
                n = regexp.groupindex['space'] - 1
            except KeyError:
                continue
            if m[n]:
                spaces += 1

        quotechar = reduce(lambda a, b, quotes = quotes:
                           (quotes[a] > quotes[b]) and a or b, quotes.keys())

        if delims:
            delim = reduce(lambda a, b, delims = delims:
                           (delims[a] > delims[b]) and a or b, delims.keys())
            skipinitialspace = delims[delim] == spaces
            if delim == '\n': # most likely a file with a single column
                delim = ''
        else:
            # there is *no* delimiter, it's a single column of quoted data
            delim = ''
            skipinitialspace = 0

        return (quotechar, delim, skipinitialspace)


    def _guess_delimiter(self, data, delimiters):
        """
        The delimiter /should/ occur the same number of times on
        each row.  However, due to malformed data, it may not.  We don't want
        an all or nothing approach, so we allow for small variations in this
        number:
          1) build a table of the frequency of each character on every line.
          2) build a table of frequencies of this frequency (meta-frequency?),
             e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
             7 times in 2 rows'
          3) use the mode of the meta-frequency to determine the /expected/
             frequency for that character
          4) find out how often the character actually meets that goal
          5) the character that best meets its goal is the delimiter
        For performance reasons, the data is evaluated in chunks, so it can
        try to evaluate the smallest portion of the data possible, evaluating
        additional chunks as necessary.
        """
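        # Worked example (illustrative numbers): if ',' appears 3 times on
        # each of 9 lines in a 10-line chunk and twice on the remaining
        # line, charFrequency[','] becomes {3: 9, 2: 1}; the mode is (3, 9),
        # adjusted below to (3, 8), and 8/10 falls short of the 0.9
        # consistency threshold, so ',' is not accepted from this chunk
        # alone and further lines are examined if any remain.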

        data = filter(None, data.split('\n'))

        ascii = [chr(c) for c in range(127)] # 7-bit ASCII

        # build frequency tables
        chunkLength = min(10, len(data))
        iteration = 0
        charFrequency = {}
        modes = {}
        delims = {}
        start, end = 0, min(chunkLength, len(data))
        while start < len(data):
            iteration += 1
            for line in data[start:end]:
                for char in ascii:
                    metaFrequency = charFrequency.get(char, {})
                    # must count even if frequency is 0
                    freq = line.strip().count(char)
                    # value is the mode
                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
                    charFrequency[char] = metaFrequency

            for char in charFrequency.keys():
                items = charFrequency[char].items()
                if len(items) == 1 and items[0][0] == 0:
                    continue
                # get the mode of the frequencies
                if len(items) > 1:
                    modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
                                         items)
                    # adjust the mode - subtract the sum of all
                    # other frequencies
                    items.remove(modes[char])
                    modes[char] = (modes[char][0], modes[char][1]
                                   - reduce(lambda a, b: (0, a[1] + b[1]),
                                            items)[1])
                else:
                    modes[char] = items[0]

            # build a list of possible delimiters
            modeList = modes.items()
            total = float(chunkLength * iteration)
            # (rows of consistent data) / (number of rows) = 100%
            consistency = 1.0
            # minimum consistency threshold
            threshold = 0.9
            while len(delims) == 0 and consistency >= threshold:
                for k, v in modeList:
                    if v[0] > 0 and v[1] > 0:
                        if ((v[1]/total) >= consistency and
                            (delimiters is None or k in delimiters)):
                            delims[k] = v
                consistency -= 0.01

            if len(delims) == 1:
                delim = delims.keys()[0]
                skipinitialspace = (data[0].count(delim) ==
                                    data[0].count("%c " % delim))
                return (delim, skipinitialspace)

            # analyze another chunkLength lines
            start = end
            end += chunkLength

        if not delims:
            return ('', 0)

        # if there's more than one, fall back to a 'preferred' list
        if len(delims) > 1:
            for d in self.preferred:
                if d in delims.keys():
                    skipinitialspace = (data[0].count(d) ==
                                        data[0].count("%c " % d))
                    return (d, skipinitialspace)

        # finally, just return the first damn character in the list
        delim = delims.keys()[0]
        skipinitialspace = (data[0].count(delim) ==
                            data[0].count("%c " % delim))
        return (delim, skipinitialspace)


    def has_header(self, sample):
        # Creates a dictionary of types of data in each column. If any
        # column is of a single type (say, integers), *except* for the first
        # row, then the first row is presumed to be labels. If the type
        # can't be determined, it is assumed to be a string in which case
        # the length of the string is the determining factor: if all of the
        # rows except for the first are the same length, it's a header.
        # Finally, a 'vote' is taken at the end for each column, adding or
        # subtracting from the likelihood of the first row being a header.
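        # Concrete illustration (made-up sample): given the rows "name,age",
        # "alice,32" and "bob,45", the age column parses as int everywhere
        # except the header cell "age", which fails int() and so votes +1
        # for a header, while the name column's varying string lengths
        # remove it from consideration altogether.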

        rdr = reader(StringIO(sample), self.sniff(sample))

        header = rdr.next() # assume first row is header

        columns = len(header)
        columnTypes = {}
        for i in range(columns): columnTypes[i] = None

        checked = 0
        for row in rdr:
            # arbitrary number of rows to check, to keep it sane
            if checked > 20:
                break
            checked += 1

            if len(row) != columns:
                continue # skip rows that have an irregular number of columns

            for col in columnTypes.keys():

                for thisType in [int, long, float, complex]:
                    try:
                        thisType(row[col])
                        break
                    except (ValueError, OverflowError):
                        pass
                else:
                    # fallback to length of string
                    thisType = len(row[col])

                # treat longs as ints
                if thisType == long:
                    thisType = int

                if thisType != columnTypes[col]:
                    if columnTypes[col] is None: # add new column type
                        columnTypes[col] = thisType
                    else:
                        # type is inconsistent, remove column from
                        # consideration
                        del columnTypes[col]

        # finally, compare results against first row and "vote"
        # on whether it's a header
        hasHeader = 0
        for col, colType in columnTypes.items():
            if type(colType) == type(0): # it's a length
                if len(header[col]) != colType:
                    hasHeader += 1
                else:
                    hasHeader -= 1
            else: # attempt typecast
                try:
                    colType(header[col])
                except (ValueError, TypeError):
                    hasHeader += 1
                else:
                    hasHeader -= 1

        return hasHeader > 0
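

# Usage sketch (illustrative): a round trip through DictWriter, the Sniffer
# and DictReader.  The file name "example.csv" and the field values are
# arbitrary choices for this demo; the block runs only when the module is
# executed directly.
if __name__ == '__main__':
    fieldnames = ["name", "age"]

    # write a small file with the default "excel" dialect
    f = open("example.csv", "wb")
    try:
        dw = DictWriter(f, fieldnames)
        dw.writerow(dict(zip(fieldnames, fieldnames)))  # header row
        dw.writerow({"name": "alice", "age": 32})
        dw.writerow({"name": "bob", "age": 45})
    finally:
        f.close()

    # guess the dialect and check for a header row
    sample = open("example.csv", "rb").read()
    sniffer = Sniffer()
    dialect = sniffer.sniff(sample)
    print "delimiter: %r, has_header: %s" % (dialect.delimiter,
                                             sniffer.has_header(sample))

    # read the rows back as dictionaries keyed by the header row
    f = open("example.csv", "rb")
    try:
        for row in DictReader(f, dialect=dialect):
            print row
    finally:
        f.close()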