"""
csv.py - read/write/investigate CSV files
"""

import re
from _csv import Error, __version__, writer, reader, register_dialect, \
                 unregister_dialect, get_dialect, list_dialects, \
                 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
                 __doc__

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
            "Error", "Dialect", "excel", "excel_tab", "reader", "writer",
            "register_dialect", "get_dialect", "list_dialects", "Sniffer",
            "unregister_dialect", "__version__", "DictReader", "DictWriter" ]

class Dialect:
    _name = ""
    _valid = False
    # placeholders
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        if self.__class__ != Dialect:
            self._valid = True
        errors = self._validate()
        if errors != []:
            raise Error, "Dialect did not validate: %s" % ", ".join(errors)

    def _validate(self):
        errors = []
        if not self._valid:
            errors.append("can't directly instantiate Dialect class")

        if self.delimiter is None:
            errors.append("delimiter character not set")
        elif (not isinstance(self.delimiter, str) or
              len(self.delimiter) > 1):
            errors.append("delimiter must be one-character string")

        if self.quotechar is None:
            if self.quoting != QUOTE_NONE:
                errors.append("quotechar not set")
        elif (not isinstance(self.quotechar, str) or
              len(self.quotechar) > 1):
            errors.append("quotechar must be one-character string")

        if self.lineterminator is None:
            errors.append("lineterminator not set")
        elif not isinstance(self.lineterminator, str):
            errors.append("lineterminator must be a string")

        if self.doublequote not in (True, False):
            errors.append("doublequote parameter must be True or False")

        if self.skipinitialspace not in (True, False):
            errors.append("skipinitialspace parameter must be True or False")

        if self.quoting is None:
            errors.append("quoting parameter not set")

        if self.quoting is QUOTE_NONE:
            if (not isinstance(self.escapechar, (unicode, str)) or
                len(self.escapechar) > 1):
                errors.append("escapechar must be a one-character string or unicode object")

        return errors

class excel(Dialect):
    delimiter = ','
    quotechar = '"'
    doublequote = True
    skipinitialspace = False
    lineterminator = '\r\n'
    quoting = QUOTE_MINIMAL
register_dialect("excel", excel)

class excel_tab(excel):
    delimiter = '\t'
register_dialect("excel-tab", excel_tab)
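
# Usage sketch (illustrative; the dialect name below is arbitrary): additional
# dialects are defined the same way excel_tab is defined above, by subclassing
# an existing dialect, overriding the attributes that differ, and registering
# the result under a name:
#
#     class semicolon(excel):
#         delimiter = ';'
#     register_dialect("semicolon", semicolon)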

class DictReader:
    def __init__(self, f, fieldnames=None, restkey=None, restval=None,
                 dialect="excel", *args, **kwds):
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restkey = restkey          # key to catch long rows
        self.restval = restval          # default value for short rows
        self.reader = reader(f, dialect, *args, **kwds)

    def __iter__(self):
        return self

    def next(self):
        row = self.reader.next()
        if self.fieldnames is None:
            self.fieldnames = row
            row = self.reader.next()

        # unlike the basic reader, we prefer not to return blanks,
        # because we will typically wind up with a dict full of None
        # values
        while row == []:
            row = self.reader.next()
        d = dict(zip(self.fieldnames, row))
        lf = len(self.fieldnames)
        lr = len(row)
        if lf < lr:
            d[self.restkey] = row[lf:]
        elif lf > lr:
            for key in self.fieldnames[lr:]:
                d[key] = self.restval
        return d
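
# Illustrative note (example values are arbitrary): with fieldnames
# ["a", "b"], DictReader maps a long row ["1", "2", "3"] to
# {"a": "1", "b": "2", None: ["3"]} (the extra fields land under restkey,
# which defaults to None), and a short row ["1"] to {"a": "1", "b": None}
# (missing fields get restval, also None by default).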

class DictWriter:
    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args, **kwds):
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restval = restval          # for writing short dicts
        if extrasaction.lower() not in ("raise", "ignore"):
            raise ValueError, \
                  ("extrasaction (%s) must be 'raise' or 'ignore'" %
                   extrasaction)
        self.extrasaction = extrasaction
        self.writer = writer(f, dialect, *args, **kwds)

    def _dict_to_list(self, rowdict):
        if self.extrasaction == "raise":
            for k in rowdict.keys():
                if k not in self.fieldnames:
                    raise ValueError, "dict contains fields not in fieldnames"
        return [rowdict.get(key, self.restval) for key in self.fieldnames]

    def writerow(self, rowdict):
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        rows = []
        for rowdict in rowdicts:
            rows.append(self._dict_to_list(rowdict))
        return self.writer.writerows(rows)

# Guard Sniffer's type checking against builds that exclude complex()
try:
    complex
except NameError:
    complex = float

class Sniffer:
    '''
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
    Returns a Dialect object.
    '''
    def __init__(self):
        # in case there is more than one possible delimiter
        self.preferred = [',', '\t', ';', ' ', ':']


    def sniff(self, sample, delimiters=None):
        """
        Returns a dialect (or None) corresponding to the sample
        """

        quotechar, delimiter, skipinitialspace = \
                   self._guess_quote_and_delimiter(sample, delimiters)
        if delimiter is None:
            delimiter, skipinitialspace = self._guess_delimiter(sample,
                                                                delimiters)

        class dialect(Dialect):
            _name = "sniffed"
            lineterminator = '\r\n'
            quoting = QUOTE_MINIMAL
            # escapechar = ''
            doublequote = False

        dialect.delimiter = delimiter
        # _csv.reader won't accept a quotechar of ''
        dialect.quotechar = quotechar or '"'
        dialect.skipinitialspace = skipinitialspace

        return dialect


    def _guess_quote_and_delimiter(self, data, delimiters):
        """
        Looks for text enclosed between two identical quotes
        (the probable quotechar) which are preceded and followed
        by the same character (the probable delimiter).
        For example:
                         ,'some text',
        The quote character with the most matches wins, and likewise for
        the delimiter.  If there is no quotechar the delimiter can't be
        determined this way.
        """

        matches = []
        # the last pattern defines only the 'quote' group; the missing
        # 'delim' and 'space' groups are why the group lookups below are
        # wrapped in try/except KeyError
        for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
                      '(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',   # ,".*?"
                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
            matches = regexp.findall(data)
            if matches:
                break

        if not matches:
            return ('', None, 0) # (quotechar, delimiter, skipinitialspace)

        quotes = {}
        delims = {}
        spaces = 0
        for m in matches:
            n = regexp.groupindex['quote'] - 1
            key = m[n]
            if key:
                quotes[key] = quotes.get(key, 0) + 1
            try:
                n = regexp.groupindex['delim'] - 1
                key = m[n]
            except KeyError:
                continue
            if key and (delimiters is None or key in delimiters):
                delims[key] = delims.get(key, 0) + 1
            try:
                n = regexp.groupindex['space'] - 1
            except KeyError:
                continue
            if m[n]:
                spaces += 1

        quotechar = reduce(lambda a, b, quotes = quotes:
                           (quotes[a] > quotes[b]) and a or b, quotes.keys())

        if delims:
            delim = reduce(lambda a, b, delims = delims:
                           (delims[a] > delims[b]) and a or b, delims.keys())
            skipinitialspace = delims[delim] == spaces
            if delim == '\n': # most likely a file with a single column
                delim = ''
        else:
            # there is *no* delimiter, it's a single column of quoted data
            delim = ''
            skipinitialspace = 0

        return (quotechar, delim, skipinitialspace)


    def _guess_delimiter(self, data, delimiters):
        """
        The delimiter /should/ occur the same number of times on
        each row.  However, due to malformed data, it may not.  We don't want
        an all or nothing approach, so we allow for small variations in this
        number:
          1) build a table of the frequency of each character on every line.
          2) build a table of frequencies of this frequency (meta-frequency?),
             e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
             7 times in 2 rows'
          3) use the mode of the meta-frequency to determine the /expected/
             frequency for that character
          4) find out how often the character actually meets that goal
          5) the character that best meets its goal is the delimiter
        For performance reasons, the data is evaluated in chunks, so it can
        try to evaluate the smallest portion of the data possible, evaluating
        additional chunks as necessary.
        """
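        # Worked example (illustrative numbers): if ',' appears 3 times on
        # each of 9 lines in a 10-line chunk and twice on the remaining
        # line, charFrequency[','] becomes {3: 9, 2: 1}; the mode is (3, 9),
        # adjusted below to (3, 8), and 8/10 falls short of the 0.9
        # consistency threshold, so ',' is not accepted from this chunk
        # alone and further lines are examined if any remain.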

        data = filter(None, data.split('\n'))

        ascii = [chr(c) for c in range(127)] # 7-bit ASCII

        # build frequency tables
        chunkLength = min(10, len(data))
        iteration = 0
        charFrequency = {}
        modes = {}
        delims = {}
        start, end = 0, min(chunkLength, len(data))
        while start < len(data):
            iteration += 1
            for line in data[start:end]:
                for char in ascii:
                    metaFrequency = charFrequency.get(char, {})
                    # must count even if frequency is 0
                    freq = line.strip().count(char)
                    # value is the mode
                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
                    charFrequency[char] = metaFrequency

            for char in charFrequency.keys():
                items = charFrequency[char].items()
                if len(items) == 1 and items[0][0] == 0:
                    continue
                # get the mode of the frequencies
                if len(items) > 1:
                    modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
                                         items)
                    # adjust the mode - subtract the sum of all
                    # other frequencies
                    items.remove(modes[char])
                    modes[char] = (modes[char][0], modes[char][1]
                                   - reduce(lambda a, b: (0, a[1] + b[1]),
                                            items)[1])
                else:
                    modes[char] = items[0]

            # build a list of possible delimiters
            modeList = modes.items()
            total = float(chunkLength * iteration)
            # (rows of consistent data) / (number of rows) = 100%
            consistency = 1.0
            # minimum consistency threshold
            threshold = 0.9
            while len(delims) == 0 and consistency >= threshold:
                for k, v in modeList:
                    if v[0] > 0 and v[1] > 0:
                        if ((v[1]/total) >= consistency and
                            (delimiters is None or k in delimiters)):
                            delims[k] = v
                consistency -= 0.01

            if len(delims) == 1:
                delim = delims.keys()[0]
                skipinitialspace = (data[0].count(delim) ==
                                    data[0].count("%c " % delim))
                return (delim, skipinitialspace)

            # analyze another chunkLength lines
            start = end
            end += chunkLength

        if not delims:
            return ('', 0)

        # if there's more than one, fall back to a 'preferred' list
        if len(delims) > 1:
            for d in self.preferred:
                if d in delims.keys():
                    skipinitialspace = (data[0].count(d) ==
                                        data[0].count("%c " % d))
                    return (d, skipinitialspace)

        # finally, just return the first damn character in the list
        delim = delims.keys()[0]
        skipinitialspace = (data[0].count(delim) ==
                            data[0].count("%c " % delim))
        return (delim, skipinitialspace)


    def has_header(self, sample):
        # Creates a dictionary of types of data in each column. If any
        # column is of a single type (say, integers), *except* for the first
        # row, then the first row is presumed to be labels. If the type
        # can't be determined, it is assumed to be a string in which case
        # the length of the string is the determining factor: if all of the
        # rows except for the first are the same length, it's a header.
        # Finally, a 'vote' is taken at the end for each column, adding or
        # subtracting from the likelihood of the first row being a header.
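        # Concrete illustration (made-up sample): given the rows "name,age",
        # "alice,32" and "bob,45", the age column parses as int everywhere
        # except the header cell "age", which fails int() and so votes +1
        # for a header, while the name column's varying string lengths
        # remove it from consideration altogether.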

        rdr = reader(StringIO(sample), self.sniff(sample))

        header = rdr.next() # assume first row is header

        columns = len(header)
        columnTypes = {}
        for i in range(columns): columnTypes[i] = None

        checked = 0
        for row in rdr:
            # arbitrary number of rows to check, to keep it sane
            if checked > 20:
                break
            checked += 1

            if len(row) != columns:
                continue # skip rows that have an irregular number of columns

            for col in columnTypes.keys():

                for thisType in [int, long, float, complex]:
                    try:
                        thisType(row[col])
                        break
                    except (ValueError, OverflowError):
                        pass
                else:
                    # fallback to length of string
                    thisType = len(row[col])

                # treat longs as ints
                if thisType == long:
                    thisType = int

                if thisType != columnTypes[col]:
                    if columnTypes[col] is None: # add new column type
                        columnTypes[col] = thisType
                    else:
                        # type is inconsistent, remove column from
                        # consideration
                        del columnTypes[col]

        # finally, compare results against first row and "vote"
        # on whether it's a header
        hasHeader = 0
        for col, colType in columnTypes.items():
            if type(colType) == type(0): # it's a length
                if len(header[col]) != colType:
                    hasHeader += 1
                else:
                    hasHeader -= 1
            else: # attempt typecast
                try:
                    colType(header[col])
                except (ValueError, TypeError):
                    hasHeader += 1
                else:
                    hasHeader -= 1

        return hasHeader > 0
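

# Usage sketch (illustrative): a round trip through DictWriter, the Sniffer
# and DictReader.  The file name "example.csv" and the field values are
# arbitrary choices for this demo; the block runs only when the module is
# executed directly.
if __name__ == '__main__':
    fieldnames = ["name", "age"]

    # write a small file with the default "excel" dialect
    f = open("example.csv", "wb")
    try:
        dw = DictWriter(f, fieldnames)
        dw.writerow(dict(zip(fieldnames, fieldnames)))  # header row
        dw.writerow({"name": "alice", "age": 32})
        dw.writerow({"name": "bob", "age": 45})
    finally:
        f.close()

    # guess the dialect and check for a header row
    sample = open("example.csv", "rb").read()
    sniffer = Sniffer()
    dialect = sniffer.sniff(sample)
    print "delimiter: %r, has_header: %s" % (dialect.delimiter,
                                             sniffer.has_header(sample))

    # read the rows back as dictionaries keyed by the header row
    f = open("example.csv", "rb")
    try:
        for row in DictReader(f, dialect=dialect):
            print row
    finally:
        f.close()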