"""Easily import a CSV file with headers on the top row.
The most important things here are:
- The :py:func:`csv_with_headers_reader` generator
- The :py:class:`DecodingCsvWithHeaders` class
"""
from codecs import BOM_UTF8, BOM_UTF16
import csv
from io import StringIO

from . import (
    get_corresponding_variable_names,
    raise_if_missing_required_headers,
    raise_if_forbidden_headers,
)


def decoding(stream, encoding="utf8"):
    """Wrap a stream that yields bytes in order to decode it.

    If you have a stream that yields bytes, use this wrapper to decode them
    into str objects. Example::

        f = open('filepath.csv', 'rb')
        for line in decoding(f, encoding='utf8'):
            print(line)
        f.close()

    This generator removes the UTF8 BOM if the file contains it.
    http://en.wikipedia.org/wiki/Byte_order_mark
    """
    line = stream.readline()
    # Python's utf8 codec does not strip the BOM (only "utf-8-sig" does),
    # so we detect and remove it ourselves.
    if line:
        if line.startswith(BOM_UTF8):
            line = line[len(BOM_UTF8):]
            encoding = "utf8"
        elif line.startswith(BOM_UTF16):
            encoding = "utf16"
        yield line.decode(encoding)
    while True:  # readline() returns b'' at EOF, which ends the loop below
        line = stream.readline()
        if not line:
            return
        yield line.decode(encoding)
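

# A minimal sketch of *decoding* with an in-memory stream; io.BytesIO
# stands in for a binary file opened with open(path, 'rb'):
def _decoding_example():  # illustrative only, not part of the public API
    from io import BytesIO
    raw = BytesIO(BOM_UTF8 + "name,email\nAna,a@example.com\n".encode("utf8"))
    for line in decoding(raw):
        print(line, end="")  # yields str lines, with the BOM already stripped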


def setup_reader(stream, required_headers=(), forbidden_headers=(), **k):
    """Read the header line and build the machinery for the data lines.

    Return a tuple ``(c, readline, var_names, CsvRow)``: the underlying
    csv reader, a function that reads one CSV line, the variable names
    derived from the headers, and a class whose instances store one row each.
    """
    c = csv.reader(stream, **k)

    def readline():
        return next(c)

    headers = [h.strip() if isinstance(h, str) else h for h in readline()]
    raise_if_missing_required_headers(headers, required_headers)
    raise_if_forbidden_headers(headers, forbidden_headers)
    var_names = get_corresponding_variable_names(headers, required_headers)

    class CsvRow:
        __slots__ = var_names

        def __init__(self, vals):
            for i, h in enumerate(var_names):
                setattr(self, h, vals[i].strip())

    return c, readline, var_names, CsvRow
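

# A sketch of how the pieces returned by setup_reader() fit together. The
# header names below are made up; the attribute names come from
# get_corresponding_variable_names(), so we read them back through
# *var_names* instead of guessing the exact naming scheme:
def _setup_reader_example():  # illustrative only, not part of the public API
    stream = StringIO("Name,Email\nAna,a@example.com\n")
    c, readline, var_names, CsvRow = setup_reader(
        stream, required_headers=["Name", "Email"])
    for values in c:  # the header line was already consumed by setup_reader()
        row = CsvRow(values)  # one instance per remaining CSV line
        print([getattr(row, v) for v in var_names])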


def buffered_csv_writing(rows, encoding="utf8", headers=None, buffer_rows=50):
    """Generate CSV lines using a buffer of size *buffer_rows*.

    The values for the first CSV line may be provided as *headers*, and
    the remaining ones as *rows*, which is preferably another generator.

    For instance, in Pyramid you might have a view like this::

        return Response(content_type='text/csv', app_iter=buffered_csv_writing(
            rows=my_generator, headers=['name', 'email'], buffer_rows=50))
    """
    buf = StringIO()
    writer = csv.writer(buf)
    if headers:
        writer.writerow(headers)
    for i, row in enumerate(rows):
        writer.writerow(row)
        if (i + 1) % buffer_rows == 0:  # yield a full buffer, not row one
            yield buf.getvalue().encode(encoding)
            buf.truncate(0)  # But in Python 3, truncate() does not move
            buf.seek(0)      # the file pointer, so we seek(0) explicitly.
    yield buf.getvalue().encode(encoding)
    buf.close()
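

# A short sketch of consuming buffered_csv_writing() outside a web
# framework, streaming encoded chunks into a file (the path is hypothetical):
def _buffered_csv_example():  # illustrative only, not part of the public API
    rows = (["row", str(n)] for n in range(1000))
    with open("/tmp/example.csv", "wb") as f:
        for chunk in buffered_csv_writing(rows, headers=["kind", "n"]):
            f.write(chunk)  # each chunk is bytes, already encoded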


def pyramid_download_csv(response, file_title, rows, encoding="utf8", **k):
    """Set up *response* to stream *rows* as a downloadable CSV file."""
    response.headers["Content-Type"] = "text/csv"
    response.headers["Content-Disposition"] = content_disposition_value(
        "{}.{}.csv".format(file_title, encoding)
    )
    response.app_iter = buffered_csv_writing(rows, encoding=encoding, **k)
    return response
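

# A hedged sketch of a Pyramid view built on pyramid_download_csv(). The
# view name and the row generator are made up for illustration:
def _csv_download_view(request):  # hypothetical view, not part of this API
    rows = (["fake", "csv", "line", str(n)] for n in range(1000))
    return pyramid_download_csv(
        request.response, "report", rows,
        headers=["a", "b", "c", "n"], buffer_rows=50)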


def content_disposition_value(file_name):
    """Return the value of a Content-Disposition HTTP header."""
    return 'attachment;filename="{}"'.format(file_name.replace('"', "_"))

"""
Downloading a large CSV file in a web app
=========================================
::
<mgedmin> I wonder if using app_iter results in less work
(no need to ''.join() the stringio's internal buffer), or *more* work
(it ''.joins() and then splits by '\n')
<mgedmin> a microbenchmark of app_iter=buf versus body=buf.getvalue()
would be interesting to see
<nandoflorestan> mgedmin, the second option wouldn't make sense, I think
<mgedmin> nandoflorestan, I find your faith in Python's stdlib charming
<mgedmin> /usr/lib/python2.7/StringIO.py: __iter__() returns self,
next() returns self.readline(),
self.readline does self.buf += ''.join(self.buflist)
<mgedmin> maybe cStringIO is more optimized
<mgedmin> cStringIO in py2.6 doesn't have a buflist, afaics, just a buf
<mgedmin> so it's just creating many substrings unnecessarily
<mgedmin> verdict: app_iter creates unnecessary work
<nandoflorestan> no, the fault isn't with app_iter
<mgedmin> hm, a proper app-iter based solution for
dynamically generating gigs of CSV data would be interesting
<nandoflorestan> well
<mgedmin> would it be possible to use the stdlib csv module in any way,
I wonder?
<nandoflorestan> yes
<nandoflorestan> it supports file-like objects,
<mgedmin> but app-iter is pull-based, and csv is push-based
<nandoflorestan> of which file and StringIO are "subclasses"
<mgedmin> well, duh, worst case you can treat each set of,
say, 100 lines, as a separate csv and just let app_iter concatenate those
<nandoflorestan> The problem you have found is really with
using StringIO at all
<nandoflorestan> Just use generators all the way instead
<nandoflorestan> this make sense?
<mgedmin> so the question is, I suppose: is it possible to make use of
the stdlib csv modules knowledge of various CSV dialects and
CSV escaping rules, when you're rolling your own csv generator?
* mgedmin isn't solving a real problem, just having thought experiments, BTW
<mgedmin> mnemoc is the one who needs to generate CSV in a Pyramid view
and was asking about the best way to do that
<mgedmin> there were no mentions of the multi-gigabyte data sets
<nandoflorestan> yes, it is possible, because csv supports a file-like
interface which must give it one CSV line at a time.
<nandoflorestan> the csv module does not require that one use StringIO.
<mgedmin> so?
<mgedmin> pyramid doesn't have anything like response.write(bunch_of_data)
<mgedmin> and you can't yield across multiple functions
<nandoflorestan> 1)that's what app_iter is for
<nandoflorestan> 2) don't understand
<mgedmin> I'm curious how you would use csv with app_iter, that's all
<nandoflorestan> buffer = StringIO()
<nandoflorestan> writer = CsvWriter(buffer)
<nandoflorestan> response.headers["Content-Type"] = "text/plain"
<nandoflorestan> response.headers["Content-Disposition"] = \
"attachment;filename=" + blahblah
* mgedmin comes up with this: http://pastie.org/3471360
<nandoflorestan> mgedmin, exactly. truncate(0)
<mgedmin> ok
<mgedmin> I hoped maybe you knew a better solution
<nandoflorestan> sorry :)
<mgedmin> actually something like http://pastie.org/3471360 might be
better -- larger buffers, don't yield after every single line
<mgedmin> 100 is probably too small but whatever, it's proof of concept
<nandoflorestan> very nice.
from io import StringIO
import csv

from pyramid.response import Response


def csv_view(request):  # by mgedmin
    buf = StringIO()
    writer = csv.writer(buf)

    def csvdata():
        for n in range(100000):
            writer.writerow(['fake', 'csv', 'line', str(n)])
            if n % 100 == 99:
                yield buf.getvalue().encode('utf8')  # app_iter wants bytes
                buf.truncate(0)
                buf.seek(0)  # truncate() does not move the file pointer
        yield buf.getvalue().encode('utf8')
    return Response(content_type='text/csv', app_iter=csvdata())
"""