Source code for bag.spreadsheet.excel

"""Easily import an Excel spreadsheet with headers on the top row."""

from zipfile import BadZipFile
from openpyxl import load_workbook  # pip install openpyxl
from openpyxl.utils.exceptions import InvalidFileException
from bag.web.exceptions import Problem

try:
    from bag.web.pyramid import _
except ImportError:
    _ = str  # and i18n is disabled.
from . import (
    get_corresponding_variable_names,
    raise_if_missing_required_headers,
    raise_if_forbidden_headers,
)


[docs]def excel_reader(
    stream,
    worksheet_name=None,
    required_headers=[],
    forbidden_headers=[],
):
    """Read an XLSX file (from ``stream``) and yield objects.

    Objects? Yes, so you can access the values conveniently.

    You can pass in the ``worksheet_name`` to be read.  If not passed in or
    not present in the file, the first worksheet will be read.

    In addition, you may pass a sequence of *required_headers*, and if they
    aren't all present, KeyError is raised.

    Let's see an example. Suppose you are reading some Excel file and
    all you know is it contains the columns "E-mail", "Full Name" and
    "Gender", not necessarily in that order::

        reader = excel_reader(
            open('contacts.xlsx', mode='rb'),
            worksheet_name='Mailing',
            required_headers=['E-mail', 'Full Name', 'Gender'])
        for o in reader:
            print(o.full_name, o.e_mail, o.gender)
    """
    try:
        wb = load_workbook(stream, data_only=True)
    except (BadZipFile, InvalidFileException, KeyError) as e:
        raise Problem(
            _("That is not an XLSX file."),
            error_title=_("Unable to read the XLSX file"),
            error_debug=str(e),
        )

    # Grab either the worksheet named "Assets", or simply the first one
    if worksheet_name and worksheet_name in wb:
        sheet = wb[worksheet_name]
    else:
        sheet = wb[wb.sheetnames[0]]

    this_is_the_first_row = True
    for row in sheet.rows:
        if this_is_the_first_row:  # Read and validate the headers
            this_is_the_first_row = False
            headers = [cell.value for cell in row]
            headers = [h.strip() if isinstance(h, str) else h for h in headers]
            raise_if_missing_required_headers(headers, required_headers)
            raise_if_forbidden_headers(headers, forbidden_headers)
            vars = get_corresponding_variable_names(headers, required_headers)
            index_of_var = {var: i for i, var in enumerate(vars)}

            class SpreadsheetRow:
                """A view on a spreadsheet row.

                You can access data as if they were instance variables.
                """

                __slots__ = ("__cells",)

                def __init__(self, cells):
                    self.__cells = cells

                def __getattr__(self, attr):
                    content = self.__cells[index_of_var[attr]].value
                    return content

        else:
            yield SpreadsheetRow(row)