Source code for pyjams.fsread

#!/usr/bin/env python
"""
Read numbers and strings from a file into 2D float and string arrays

This module was written by Matthias Cuntz while at Department of
Computational Hydrosystems, Helmholtz Centre for Environmental
Research - UFZ, Leipzig, Germany, and continued while at Institut
National de Recherche pour l'Agriculture, l'Alimentation et
l'Environnement (INRAE), Nancy, France.

:copyright: Copyright 2009-2022 Matthias Cuntz, see AUTHORS.rst for details.
:license: MIT License, see LICENSE for details.

.. moduleauthor:: Matthias Cuntz

The following functions are provided

.. autosummary::
   fsread
   fread
   sread
   xread
   xlsread
   xlsxread

History
    * Written fread and sread Jul 2009 by
      Matthias Cuntz (mc (at) macu (dot) de)
    * Keyword transpose, Feb 2012, Matthias Cuntz
    * Ported to Python 3, Feb 2013, Matthias Cuntz
    * Removed bug when nc is list and contains 0, Nov 2014, Matthias Cuntz
    * Keyword hskip, Nov 2014, Matthias Cuntz
    * Do not use function lif, Feb 2015, Matthias Cuntz
    * nc can be tuple, Feb 2015, Matthias Cuntz
    * Large rewrite of code to improve speed: keep everything list until
      the very end, Feb 2015, Matthias Cuntz
    * Written fsread Feb 2015 by
      Matthias Cuntz (mc (at) macu (dot) de)
    * nc<=-1 removed in case of nc is list, Nov 2016, Matthias Cuntz
    * Added xread from modifying fsread, Feb 2017, Matthias Cuntz
    * range instead of np.arange, Nov 2017, Matthias Cuntz
    * Keywords cname, sname, hstrip, rename file to infile,
      Nov 2017, Matthias Cuntz
    * full_header=True returns vector of strings, Nov 2017, Matthias Cuntz
    * NA -> NaN, i.e. R to Python convention in xread, Feb 2019, Matthias Cuntz
    * Ignore unicode characters on read, Jun 2019, Matthias Cuntz
    * Make ignoring unicode characters campatible with Python 2 and Python 3,
      Jul 2019, Matthias Cuntz
    * Keywords encoding, errors with codecs module, Aug 2019, Matthias Cuntz
    * Return as list keyword, Dec 2019, Stephan Thober
    * Return as array as default, Jan 2020, Matthias Cuntz
    * Using numpy docstring format, May 2020, Matthias Cuntz
    * Use openpyxl for xlsx files in xread, Jul 2020, Matthias Cuntz
    * flake8 compatible xread, Jul 2020, Matthias Cuntz
    * flake8 compatible fsread, Mar 2021, Matthias Cuntz
    * Preserve trailing whitespace in column delimiters,
      Mar 2021, Matthias Cuntz
    * Code refactoring, Sep 2021, Matthias Cuntz
    * Cleaner code by using local functions, Dec 2021, Matthias Cuntz
    * Make float and string code symmetric in behaviour,
      Dec 2021, Matthias Cuntz
    * Always return float and string in fsread, Dec 2021, Matthias Cuntz
    * Removed reform option, Dec 2021, Matthias Cuntz
    * Return always string array if not return as list option is set;
      strarr is only used with header=True now, Dec 2021, Matthias Cuntz
    * fread and sread are simple calls of fsread, Dec 2021, Matthias Cuntz
    * header returns also 2D arrays by default, Dec 2021, Matthias Cuntz
    * More consistent docstrings, Jan 2022, Matthias Cuntz
    * Merged xread into module, Jan 2022, Matthias Cuntz
    * Use iterators to read rows in Excel file, Jan 2022, Matthias Cuntz
    * Always close open files, Jan 2022, Matthias Cuntz
    * Default fill_value is NaN, Jan 2022, Matthias Cuntz
    * Remove read_only mode for openpyxl because closing is disabled
      in this case, Jan 2022, Matthias Cuntz
    * NA -> NaN, i.e. R to Python convention in fsread,
      Aug 2022, Matthias Cuntz
    * Correct docstring of strip keyword, Mar 2023, Matthias Cuntz

"""
import codecs
import numpy as np


__all__ = ['fsread', 'fread', 'sread',
           'xread', 'xlsread', 'xlsxread']


# --------------------------------------------------------------------


def _read_head(f, skip=0, hskip=0):
    '''
    Return the *skip-hskip* lines after the first *hskip* lines as header
    from text file

    Parameters
    ----------
    f : file handle
        Open file handle such as codecs.StreamReaderWriter
    skip : int, optional
        Number of lines to skip at the beginning of file (default: 0)
    hskip : int, optional
        Number of lines in skip that do not belong to header (default: 0)

    Returns
    -------
    list
        List with strings of file header

    '''
    head = []
    # Skip lines
    if hskip > 0:
        ihskip = 0
        while ihskip < hskip:
            tmp = f.readline()
            ihskip += 1
    # Read header
    if skip > 0:
        head = [''] * (skip - hskip)
        iskip = 0
        while iskip < (skip - hskip):
            head[iskip] = str(f.readline().rstrip('\r\n'))
            iskip += 1
    return head


def _xread_get_iter_rows(sh, ixls=False):
    '''
    Get iterator for rows in Excel sheet

    Parameters
    ----------
    sh : handle of Excel sheet
        Handle to sheet in open Excel file
    ixls : bool, optional
        Use xlrd if True, otherwise use openpyxl (default)

    Returns
    -------
    iterator

    '''
    if ixls:
        rows = sh.get_rows()
    else:
        rows = sh.rows
    return rows


def _xread_next_row(rows):
    '''
    Get next row in Excel sheet

    Parameters
    ----------
    rows : iterator, generator
        Iterator of rows in Excel sheet

    Returns
    -------
    list
        List with strings of values in next row

    '''
    row = rows.__next__()
    return [ str(cc.value) for cc in row ]


def _xread_head(rows, skip=0, hskip=0):
    '''
    Return the *skip-hskip* lines after the first *hskip* lines as header
    from Excel sheet

    Parameters
    ----------
    sh : Excel sheet handle
        Handle to sheet in open Excel file
    skip : int, optional
        Number of lines to skip at the beginning of file (default: 0)
    hskip : int, optional
        Number of lines in skip that do not belong to header (default: 0)

    Returns
    -------
    list
        List with strings of file header

    '''
    head = []
    # Read header
    if skip > 0:
        for iskip in range(hskip):
            _ = _xread_next_row(rows)
        for iskip in range(skip - hskip):
            head.append(_xread_next_row(rows))
    return head


def _close_file(f, ixls=False):
    '''
    Closes file

    Parameters
    ----------
    f : file handle
        Open file handle such as codecs.StreamReaderWriter
    ixls : bool, optional
        Use xlrd if True, otherwise use openpyxl (default)

    Returns
    -------
    None

    '''
    if ixls:
        f.release_resources()
    else:
        f.close()
    return


def _determine_indices(f, head, nres,
                       nc=0, cname=None,
                       snc=0, sname=None,
                       skip=0, cskip=0, hskip=0,
                       hstrip=True, sep=None,
                       ixls=False):
    '''
    Determine the indices to be read from lines as floats and as strings

    Parameters
    ----------
    f : file handle
        Open file handle will be closed if error
    head : list
        List with header lines read with _read_head.
    nres : int
        Number of columns in first non-header line
    nc : int or iterable, optional
        Number of columns to be read as floats [default: all (*nc<=0*)]. *nc*
        can be an int or a vector of column indexes, starting with 0. If
        *snc!=0*, then *nc* must be iterable, or -1 to read all other columns
        as floats.
    cname : iterable of str, optional
        Columns can be chosen by the values in the first header line;
        must be an iterable with strings.
    snc : int or iterable, optional
        Number of columns to be read as strings [default: none (*snc=0*)].
        *snc* can be an int or a vector of column indexes, starting with 0. If
        *nc!=0*, then *snc* must be an iterable, or -1 to read all other
        columns as strings.
    sname : iterable of str, optional
        Columns can be chosen by the values in the first header line;
        must be iterable with strings.
    skip : int, optional
        Number of lines to skip at the beginning of file (default: 0)
    cskip : int, optional
        Number of columns to skip at the beginning of each line (default: 0)
    hskip : int, optional
        Number of lines in skip that do not belong to header (default: 0)
    hstrip : bool, optional
        Strip header cells to match *cname* if True (default), else
        take the header cells literally.
    sep : str, optional
        Column separator. Whitespace is used if not given.
    ixls : bool, optional
        Use xlrd if True, otherwise use openpyxl (default)

    Returns
    -------
    list, list
        list of indices (int) to be read as floats,
        list of indices (int) to be read as strings

    '''
    # Determine indices
    if nc != 0 and cname is not None:
        _close_file(f, ixls=ixls)
        raise ValueError('nc and cname are mutually exclusive.')
    if snc != 0 and sname is not None:
        _close_file(f, ixls=ixls)
        raise ValueError('snc and sname are mutually exclusive.')
    # cname or sname
    if (cname is not None) or (sname is not None):
        # from first header line
        if (skip - hskip) <= 0:
            _close_file(f, ixls=ixls)
            raise ValueError('No header line left for choosing'
                             ' columns by name.')
        if isinstance(head[0], (tuple, list)):
            # from _xread_head
            hres = head[0]
        else:
            # from _read_head
            hres = head[0].split(sep)
        if hstrip:
            hres = [ h.strip() for h in hres ]
    if cname is not None:
        if not isinstance(cname, (list, tuple, np.ndarray)):
            cname = [cname]
        if hstrip:
            cname = [ h.strip() for h in cname ]
        nc = []
        for k in range(len(hres)):
            if hres[k] in cname:
                nc.append(k)
    if sname is not None:
        if not isinstance(sname, (list, tuple, np.ndarray)):
            sname = [sname]
        if hstrip:
            sname = [ h.strip() for h in sname ]
        snc = []
        for k in range(len(hres)):
            if hres[k] in sname:
                snc.append(k)
    if ( isinstance(nc, (list, tuple, np.ndarray)) and
         isinstance(snc, (list, tuple, np.ndarray)) ):
        # both indices
        if np.in1d(nc, snc, assume_unique=True).any():
            _close_file(f, ixls=ixls)
            raise ValueError('float and string indices overlap.')
        iinc  = nc
        iisnc = snc
    elif isinstance(nc, (list, tuple, np.ndarray)):
        # float indices
        iinc   = nc
        iirest = list(range(nres))
        for ii in iinc[::-1]:
            del iirest[ii]
        if snc <= -1:
            iisnc = iirest
        else:
            iisnc = iirest[:snc]
    elif isinstance(snc, (list, tuple, np.ndarray)):
        # string indices
        iisnc  = snc
        iirest = list(range(nres))
        for ii in iisnc[::-1]:
            del iirest[ii]
        if nc <= -1:
            iinc = iirest
        else:
            iinc = iirest[:nc]
    else:
        # no indices
        # cannot be nc=-1 and snc=-1
        if nc <= -1:
            if snc:
                iisnc = list(range(snc))
                iinc  = list(range(snc, nres))
            else:
                iisnc = []
                iinc  = list(range(cskip, nres))
        else:
            if snc <= -1:
                if nc:
                    iinc  = list(range(nc))
                    iisnc = list(range(nc, nres))
                else:
                    iinc  = []
                    iisnc = list(range(cskip, nres))
            else:
                # red snc first then nc
                iisnc = list(range(cskip, cskip + snc))
                iinc  = list(range(cskip + snc, cskip + snc + nc))
    return iinc, iisnc


def _line2var(res, var, iinc, strip=None):
    '''
    Append output list with selected elements from input list

    Parameters
    ----------
    res : list
        Line split by separator
    var : list
        List to append selected elements of *res*
    iinc : int, optional
        Indices in *res* to select

    Returns
    -------
    list
        *var* append by selected elements of *res*

    '''
    # Helper for append var with current line already splitted into list
    nres = len(res)
    if strip is None:
        tmp = [ res[i].strip('"').strip("'") for i in iinc if i < nres ]
    elif not strip:
        tmp = [ res[i] for i in iinc if i < nres ]
    else:
        tmp = [ res[i].strip(strip) for i in iinc if i < nres ]
    rest = len([ i for i in iinc if i >= nres ])
    if rest > 0:
        tmp.extend([''] * rest)
    var.append(tmp)
    return var


def _get_header(f, head, sep, iinc, iisnc,
                squeeze=False,
                fill=False, fill_value='NaN', sfill_value='',
                strip=None, full_header=False,
                transpose=False, strarr=False,
                ixls=False):
    '''
    Return header for float and string arrays

    Parameters
    ----------
    f : file handle
        Open file handle will be closed if error
    head : list
        List of input header files
    sep : str
        Column separator.
    iinc : list
        List of column indices for float array
    iisnc : list
        List of column indices for string array
    squeeze : bool, optional
        If set to *True*, the 2-dim array will be cleaned of degenerated
        dimension, possibly resulting in a vector, otherwise output is always
        2-dimensional.
    fill : bool, optional
        Fills in `fill_value` if True and not enough columns in input line,
        otherwise raises ValueError (default).
    fill_value : float, optional
         Value to fill in float array in empty cells or if not enough columns
         in line and *fill==True* (default: 'NaN').
    sfill_value : str, optional
         Value to fill in string array in empty cells or if not enough columns
         in line and *fill==True* (default: '').
    strip : str, optional
        Strip strings with *str.strip(strip)*. If *strip* is *None*, quotes "
        and ' are stripped from input fields (default), otherwise the character
        in *strip* is stripped from the input fields.
        If *strip* is set to *False* then nothing is stripped and reading is
        about 30% faster.
    full_header : bool, optional
        Header will be a list of the header lines if set.
    transpose : bool, optional
        `fsread` reads in row-major format, i.e. the first dimension are the
        rows and second dimension are the columns *out(:nrow, :ncol)*. This
        will be transposed to column-major format *out(:ncol, :nrow)* if
        *transpose* is set.
    strarr : bool, optional
        Return header as numpy array rather than list.
    ixls : bool, optional
        Use xlrd if True, otherwise use openpyxl (default)

    Returns
    -------
    list or array, list or array
        Header of float columns, header of string columns

    '''
    var  = list()
    svar = list()
    nhead = len(head)
    if nhead == 0:
        return var, svar
    if full_header:
        var = head
        if strarr:
            var = np.array(var, dtype=str)
        return var, svar
    else:
        k = 0
        while k < nhead:
            if isinstance(head[k], (tuple, list)):
                # from _xread_head
                hres = head[k]
            else:
                # from _read_head
                hres = head[k].split(sep)
            nhres = len(hres)
            miianc = -1
            if iinc:
                miianc = max(miianc, max(iinc))
            if iisnc:
                miianc = max(miianc, max(iisnc))
            if (miianc >= nhres) and (not fill):
                _close_file(f, ixls=ixls)
                raise ValueError(f'Line has not enough columns to index:'
                                 f' {head[k]}')
            if iinc:
                null = _line2var(hres, var, iinc, strip)
                var[-1] = [ 'NaN' if iv == 'NA' else iv for iv in var[-1] ]
            if iisnc:
                null = _line2var(hres, svar, iisnc,
                                 False if strip is None else strip)
            k += 1
    if strarr:
        if var:
            var = np.array(var, dtype=str)
            if fill:
                var = np.where(var == '', fill_value, var)
            if squeeze:
                var = var.squeeze()
            if transpose:
                var = var.T
        if svar:
            svar = np.array(svar, dtype=str)
            if fill:
                svar = np.where(svar == '', sfill_value, svar)
            if squeeze:
                svar = svar.squeeze()
            if transpose:
                svar = svar.T
    else:
        if var:
            if fill:
                var = [ [ fill_value if i == '' else i for i in row ]
                        for row in var ]
            if squeeze:
                if len(var) == 1:
                    var = var[0]
                else:
                    maxi = max([ len(i) for i in var ])
                    if maxi == 1:
                        var = [ i[0] for i in var ]
            if transpose and isinstance(var[0], list):
                var = [ list(i) for i in zip(*var) ]  # transpose
        if svar:
            if fill:
                svar = [ [ sfill_value if i == '' else i for i in row ]
                         for row in svar ]
            if squeeze:
                if len(svar) == 1:
                    svar = svar[0]
                else:
                    maxi = max([ len(i) for i in svar ])
                    if maxi == 1:
                        svar = [ i[0] for i in svar ]
            if transpose and isinstance(svar[0], list):
                svar = [list(i) for i in zip(*svar)]  # transpose

    return var, svar


def _get_separator(f, separator=None, skip_blank=False, comment=None):
    '''
    Return the *skip-hskip* lines after the first *hskip* lines as header

    Parameters
    ----------
    f : file handle
        Open file handle such as codecs.StreamReaderWriter
    separator : str, optional
        Column separator. If not given, columns separators are (in order):
        comma (','), semicolon (';'), whitespace.
    comment : iterable, optional
         Line gets excluded if the first non-white character is in comment
         sequence. Sequence must be iterable such as string, list and tuple,
         such as '#' or ['#', '!'].
    skip_blank : bool, optional
        Continues reading after a blank line if True, else stops reading
        at the first blank line (default).

    Returns
    -------
    str, list
        Separator, split first line after header split with separator

    '''
    split = -1
    while True:
        s = f.readline().rstrip('\r\n')
        if len(s) == 0:
            if skip_blank:
                continue
            else:
                break
        if comment is not None:
            if (s[0] in comment):
                continue
        break
    if separator is None:
        sep = ','
        res = s.split(sep)
        nres = len(res)
        if nres == 1:
            sep = ';'
            res = s.split(sep)
            nres = len(res)
            if nres == 1:
                sep = None
                res = s.split(sep)
    else:
        sep = separator
        res = s.split(sep)
    return sep, res


# --------------------------------------------------------------------


[docs]def fsread(infile, nc=0, cname=None, snc=0, sname=None, skip=0, cskip=0, hskip=0, separator=None, squeeze=False, skip_blank=False, comment=None, fill=False, fill_value=np.nan, sfill_value='', strip=None, hstrip=True, encoding='ascii', errors='ignore', header=False, full_header=False, transpose=False, strarr=False, return_list=False): """ Read numbers and strings from a file into 2D float and string arrays Columns can be picked specifically by index or name. The header can be read separately with the (almost) same call as reading the numbers or string. Parameters ---------- infile : str Source file name nc : int or iterable, optional Number of columns to be read as floats [default: none (*nc=0*)]. *nc* can be an int or a vector of column indexes, starting with 0. If *snc!=0*, then *nc* must be iterable, or -1 to read all other columns as floats. If both *nc* and *snc* are int, then first *snc* string columns will be read and then *nc* float columns will be read. cname : iterable of str, optional Columns for floats can be chosen by the values in the first header line; must be an iterable with strings. snc : int or iterable, optional Number of columns to be read as strings [default: none (*snc=0*)]. *snc* can be an int or a vector of column indexes, starting with 0. If *nc!=0*, then *snc* must be iterable, or -1 to read all other columns as strings. If both *nc* and *snc* are int, then first *snc* string columns will be read and then *nc* float columns will be read. sname : iterable of str, optional Columns for strings can be chosen by the values in the first header line; must be an iterable with strings. skip : int, optional Number of lines to skip at the beginning of file (default: 0) cskip : int, optional Number of columns to skip at the beginning of each line (default: 0) hskip : int, optional Number of lines in skip that do not belong to header (default: 0) separator : str, optional Column separator. If not given, columns separators are (in order): comma (','), semicolon (';'), whitespace. squeeze : bool, optional If set to *True*, the 2-dim array will be cleaned of degenerated dimension, possibly resulting in a vector, otherwise output is always 2-dimensional. skip_blank : bool, optional Continues reading after a blank line if True, else stops reading at the first blank line (default). comment : iterable, optional Line gets excluded if the first character is in comment sequence. Sequence must be iterable such as string, list and tuple, .e.g '#' or ['#', '!']. fill : bool, optional Fills in `fill_value` if True and not enough columns in input line, else raises ValueError (default). fill_value : float, optional Value to fill in float array in empty cells or if not enough columns in line and *fill==True* (default: numpy.nan). sfill_value : str, optional Value to fill in string array in empty cells or if not enough columns in line and *fill==True* (default: ''). strip : str, optional Strip float columns with *str.strip(strip)*. If *strip* is *None*, quotes " and ' are stripped from input fields (default), otherwise the character in *strip* is stripped from the input fields. *strip* has to be set explicitly to also strip string columns. If *strip* is set to *False* then nothing is stripped and reading is about 30% faster. hstrip : bool, optional Strip header cells to match *cname* if True (default), else take header cells literally. encoding : str, optional Specifies the encoding which is to be used for the file (default: 'ascii'). Any encoding that encodes to and decodes from bytes is allowed. errors : str, optional Errors may be given to define the error handling during encoding of the file. Possible values are 'strict', 'replace', and 'ignore' (default). header : bool, optional Return header strings instead of numbers/strings in rest of file. This allows to use (almost) the same call to get values and header: .. code-block:: python head, shead = fsread(ifile, nc=1, snc=1, header=True) data, sdata = fsread(ifile, nc=1, snc=1) temp = data[:, head[0].index('temp')] full_header : bool, optional Header will be a list of the header lines if set. transpose : bool, optional `fsread` reads in row-major format, i.e. the first dimension are the rows and second dimension are the columns *out(:nrow, :ncol)*. This will be transposed to column-major format *out(:ncol, :nrow)* if *transpose* is set. strarr : bool, optional Return header as numpy array rather than list. return_list : bool, optional Return lists rather than arrays. Returns ------- array of floats, array of strings First array is also string if header. Array is replaced by an empty string if this output is not demanded such as with *nc=0*. Notes ----- If *header==True* then skip is counterintuitive because it is actually the number of header rows to be read. This is to be able to have the exact same call of the function, once with *header=False* and once with *header=True*. Blank lines are not filled but are taken as end of file if *fill=True*. Examples -------- Create some data >>> filename = 'test.dat' >>> with open(filename,'w') as ff: ... print('head1 head2 head3 head4', file=ff) ... print('1.1 1.2 1.3 1.4', file=ff) ... print('2.1 2.2 2.3 2.4', file=ff) Read sample with fread - see fread for more examples >>> a, sa = fsread(filename, nc=[1,3], skip=1) >>> print(a) [[1.2 1.4] [2.2 2.4]] >>> print(sa) [] >>> a, sa = fsread(filename, nc=2, skip=1, header=True) >>> print(a) [['head1', 'head2']] >>> print(sa) [] Read sample with sread - see sread for more examples >>> a, sa = fsread(filename, snc=[1,3], skip=1) >>> print(a) [] >>> print(sa) [['1.2' '1.4'] ['2.2' '2.4']] Create some mixed data >>> with open(filename,'w') as ff: ... print('head1 head2 head3 head4', file=ff) ... print('01.12.2012 1.2 name1 1.4', file=ff) ... print('01.01.2013 2.2 name2 2.4', file=ff) Read float and string columns in different ways >>> a, sa = fsread(filename, nc=[1,3], skip=1) >>> print(a) [[1.2 1.4] [2.2 2.4]] >>> print(sa) [] >>> a, sa = fsread(filename, nc=[1,3], snc=[0,2], skip=1) >>> print(a) [[1.2 1.4] [2.2 2.4]] >>> print(sa) [['01.12.2012' 'name1'] ['01.01.2013' 'name2']] >>> a, sa = fsread(filename, nc=[1,3], snc=-1, skip=1) >>> print(sa) [['01.12.2012' 'name1'] ['01.01.2013' 'name2']] >>> a, sa = fsread(filename, nc=-1, snc=[0,2], skip=1) >>> print(a) [[1.2 1.4] [2.2 2.4]] >>> a, sa = fsread(filename, nc=[1,3], snc=-1, skip=1, return_list=True) >>> print(a) [[1.2, 1.4], [2.2, 2.4]] >>> print(sa) [['01.12.2012', 'name1'], ['01.01.2013', 'name2']] Read header >>> a, sa = fsread(filename, nc=[1,3], snc=[0,2], skip=1, header=True) >>> print(a) [['head2', 'head4']] >>> print(sa) [['head1', 'head3']] >>> a, sa = fsread(filename, nc=[1,3], snc=[0,2], skip=1, header=True, ... squeeze=True) >>> print(a) ['head2', 'head4'] >>> print(sa) ['head1', 'head3'] Create some mixed data with missing values >>> with open(filename,'w') as ff: ... print('head1,head2,head3,head4', file=ff) ... print('01.12.2012,1.2,name1,1.4', file=ff) ... print('01.01.2013,,name2,2.4', file=ff) >>> a, sa = fsread(filename, nc=[1,3], skip=1, fill=True, fill_value=-1) >>> print(a) [[ 1.2 1.4] [-1. 2.4]] >>> print(sa) [] >>> a, sa = fsread(filename, nc=[1,3], skip=1, fill=True, fill_value=-1, ... strarr=True) >>> print(a) [[ 1.2 1.4] [-1. 2.4]] >>> print(sa) [] Read data using column names >>> a, sa = fsread(filename, cname='head2', snc=[0,2], skip=1, fill=True, ... fill_value=-1, squeeze=True) >>> print(a) [ 1.2 -1. ] >>> print(sa) [['01.12.2012' 'name1'] ['01.01.2013' 'name2']] >>> a, sa = fsread(filename, cname=['head2','head4'], snc=-1, skip=1, ... fill=True, fill_value=-1) >>> print(a) [[ 1.2 1.4] [-1. 2.4]] >>> print(sa) [['01.12.2012' 'name1'] ['01.01.2013' 'name2']] >>> # header >>> a, sa = fsread(filename, nc=[1,3], sname=['head1','head3'], skip=1, ... fill=True, fill_value=-1, strarr=True, header=True) >>> print(a) [['head2' 'head4']] >>> print(sa) [['head1' 'head3']] >>> a, sa = fsread(filename, cname=['head2','head4'], snc=-1, skip=1, ... header=True, full_header=True) >>> print(a) ['head1,head2,head3,head4'] >>> print(sa) [] >>> a, sa = fsread(filename, cname=['head2','head4'], snc=-1, skip=1, ... fill=True, fill_value=-1, header=True, full_header=True) >>> print(a) ['head1,head2,head3,head4'] >>> print(sa) [] >>> a, sa = fsread(filename, cname=[' head2','head4'], snc=-1, skip=1, ... fill=True, fill_value=-1, hstrip=False) >>> print(a) [[1.4] [2.4]] >>> print(sa) [['01.12.2012' '1.2' 'name1'] ['01.01.2013' '' 'name2']] Clean up doctest >>> import os >>> os.remove(filename) """ # Input error if isinstance(nc, int) and isinstance(snc, int): if (nc <= -1) and (snc <= -1): raise ValueError('nc and snc must be integer or list of indices;' ' < 0 means to read the rest of the columns.' ' nc and snc cannot both be < 0.') # Open file f = codecs.open(infile, 'r', encoding=encoding, errors=errors) # Read header and skip lines head = _read_head(f, skip, hskip) # Read first line to determine ncolumns and separator (if not set) sep, res = _get_separator(f, separator, skip_blank, comment) nres = len(res) if not nres: f.close() raise ValueError('No line to determine separator.') # Determine indices iinc, iisnc = _determine_indices(f, head, nres, nc=nc, cname=cname, snc=snc, sname=sname, skip=skip, cskip=cskip, hskip=hskip, hstrip=hstrip, sep=sep) aiinc = list(iinc) aiinc.extend(iisnc) miianc = max(aiinc) # Header if np.isfinite(fill_value): fval = str(fill_value) else: fval = 'NaN' if header: var, svar = _get_header( f, head, sep, iinc, iisnc, squeeze=squeeze, fill=fill, fill_value=fval, sfill_value=sfill_value, strip=strip, full_header=full_header, transpose=transpose, strarr=strarr) f.close() return var, svar # Values - first line if (miianc >= nres) and (not fill): f.close() if sep is None: sres = ' '.join(res) else: sres = sep.join(res) raise ValueError('Line has not enough columns to index: ' + sres) var = list() svar = list() if iinc: null = _line2var(res, var, iinc, strip) var[-1] = [ 'NaN' if iv == 'NA' else iv for iv in var[-1] ] if iisnc: null = _line2var(res, svar, iisnc, False if strip is None else strip) # Values - rest of file for line in f: s = str(line.rstrip('\r\n')) if len(s) == 0: if skip_blank: continue else: break if comment is not None: if (s[0] in comment): continue res = s.split(sep) nres = len(res) if (miianc >= nres) and (not fill): f.close() raise ValueError('Line has not enough columns to index: ' + s) if iinc: null = _line2var(res, var, iinc, strip) var[-1] = [ 'NaN' if iv == 'NA' else iv for iv in var[-1] ] if iisnc: null = _line2var(res, svar, iisnc, False if strip is None else strip) f.close() # Return correct shape and type if var: var = np.array(var, dtype=str) if fill: var = np.where(var == '', fval, var) var = np.array(var, dtype=float) if squeeze: var = var.squeeze() if transpose: var = var.T if return_list: if var.ndim == 1: var = [ i for i in var ] else: var = [ [ var[i, j] for j in range(var.shape[1]) ] for i in range(var.shape[0]) ] if svar: svar = np.array(svar, dtype=str) if fill: svar = np.where(svar == '', sfill_value, svar) if squeeze: svar = svar.squeeze() if transpose: svar = svar.T if return_list: if svar.ndim == 1: svar = [ i for i in svar ] else: svar = [ [ svar[i, j] for j in range(svar.shape[1]) ] for i in range(svar.shape[0]) ] return var, svar
# --------------------------------------------------------------------
[docs]def fread(infile, nc=0, cname=None, snc=0, sname=None, **kwargs): """ Read floats from a file into 2D float array Columns can be picked specifically by index or name. The header can be read separately with the (almost) same call as reading the floats. Parameters ---------- infile : str Source file name nc : int or iterable, optional Number of columns to be read as floats [default: all (*nc=0*)]. *nc* can be an int or a vector of column indexes, starting with 0. *nc<=0* reads all columns. cname : iterable of str, optional Columns for floats can be chosen by the values in the first header line; must be an iterable with strings. snc : int or iterable, optional Not used in fread; will be silently ignored. sname : iterable of str, optional Not used in fread; will be silently ignored. **kwargs : dict, optional All other keywords will be passed to `fsread`. Returns ------- array of floats Array of numbers in file, or header. Notes ----- If *header==True* then skip is counterintuitive because it is actually the number of header rows to be read. This is to be able to have the exact same call of the function, once with *header=False* and once with *header=True*. Blank lines are not filled but are taken as end of file if *fill=True*. Examples -------- Create some data >>> filename = 'test.dat' >>> with open(filename,'w') as ff: ... print('head1 head2 head3 head4', file=ff) ... print('1.1 1.2 1.3 1.4', file=ff) ... print('2.1 2.2 2.3 2.4', file=ff) Read sample file in different ways >>> # data >>> print(fread(filename, skip=1)) [[1.1 1.2 1.3 1.4] [2.1 2.2 2.3 2.4]] >>> print(fread(filename, skip=2)) [[2.1 2.2 2.3 2.4]] >>> print(fread(filename, skip=1, cskip=1)) [[1.2 1.3 1.4] [2.2 2.3 2.4]] >>> print(fread(filename, nc=2, skip=1, cskip=1)) [[1.2 1.3] [2.2 2.3]] >>> print(fread(filename, nc=[1,3], skip=1)) [[1.2 1.4] [2.2 2.4]] >>> print(fread(filename, nc=1, skip=1)) [[1.1] [2.1]] >>> print(fread(filename, nc=1, skip=1, squeeze=True)) [1.1 2.1] >>> # header >>> print(fread(filename, nc=2, skip=1, header=True)) [['head1', 'head2']] >>> print(fread(filename, nc=2, skip=1, header=True, full_header=True)) ['head1 head2 head3 head4'] >>> print(fread(filename, nc=1, skip=2, header=True)) [['head1'], ['1.1']] >>> print(fread(filename, nc=1, skip=2, header=True, squeeze=True)) ['head1', '1.1'] >>> print(fread(filename, nc=1, skip=2, header=True, strarr=True)) [['head1'] ['1.1']] Create data with blank lines >>> with open(filename, 'a') as ff: ... print('', file=ff) ... print('3.1 3.2 3.3 3.4', file=ff) >>> print(fread(filename, skip=1)) [[1.1 1.2 1.3 1.4] [2.1 2.2 2.3 2.4]] >>> print(fread(filename, skip=1, skip_blank=True, comment='#!')) [[1.1 1.2 1.3 1.4] [2.1 2.2 2.3 2.4] [3.1 3.2 3.3 3.4]] Create data with comment lines >>> with open(filename, 'a') as ff: ... print('# First comment', file=ff) ... print('! Second 2 comment', file=ff) ... print('4.1 4.2 4.3 4.4', file=ff) >>> print(fread(filename, skip=1)) [[1.1 1.2 1.3 1.4] [2.1 2.2 2.3 2.4]] >>> print(fread(filename, skip=1, nc=[2], skip_blank=True, comment='#')) [[1.3] [2.3] [3.3] [2. ] [4.3]] >>> print(fread(filename, skip=1, skip_blank=True, comment='#!')) [[1.1 1.2 1.3 1.4] [2.1 2.2 2.3 2.4] [3.1 3.2 3.3 3.4] [4.1 4.2 4.3 4.4]] >>> print(fread(filename, skip=1, skip_blank=True, comment=('#','!'))) [[1.1 1.2 1.3 1.4] [2.1 2.2 2.3 2.4] [3.1 3.2 3.3 3.4] [4.1 4.2 4.3 4.4]] >>> print(fread(filename, skip=1, skip_blank=True, comment=['#','!'])) [[1.1 1.2 1.3 1.4] [2.1 2.2 2.3 2.4] [3.1 3.2 3.3 3.4] [4.1 4.2 4.3 4.4]] Add a line with fewer columns >>> with open(filename, 'a') as ff: ... print('5.1 5.2', file=ff) >>> print(fread(filename, skip=1)) [[1.1 1.2 1.3 1.4] [2.1 2.2 2.3 2.4]] >>> print(fread(filename, skip=1, skip_blank=True, comment='#!', ... fill=True, fill_value=-1)) [[ 1.1 1.2 1.3 1.4] [ 2.1 2.2 2.3 2.4] [ 3.1 3.2 3.3 3.4] [ 4.1 4.2 4.3 4.4] [ 5.1 5.2 -1. -1. ]] >>> # transpose >>> print(fread(filename, skip=1)) [[1.1 1.2 1.3 1.4] [2.1 2.2 2.3 2.4]] >>> print(fread(filename, skip=1, transpose=True)) [[1.1 2.1] [1.2 2.2] [1.3 2.3] [1.4 2.4]] Create some more data with Nan and Inf >>> filename1 = 'test1.dat' >>> with open(filename1,'w') as ff: ... print('head1 head2 head3 head4', file=ff) ... print('1.1 1.2 1.3 1.4', file=ff) ... print('2.1 nan Inf "NaN"', file=ff) Treat Nan and Inf with automatic strip of " and ' >>> print(fread(filename1, skip=1, transpose=True)) [[1.1 2.1] [1.2 nan] [1.3 inf] [1.4 nan]] Create some more data with escaped numbers >>> filename2 = 'test2.dat' >>> with open(filename2,'w') as ff: ... print('head1 head2 head3 head4', file=ff) ... print('"1.1" "1.2" "1.3" "1.4"', file=ff) ... print('2.1 nan Inf "NaN"', file=ff) Strip >>> print(fread(filename2, skip=1, transpose=True, strip='"')) [[1.1 2.1] [1.2 nan] [1.3 inf] [1.4 nan]] Create more data with an extra (shorter) header line >>> filename3 = 'test3.dat' >>> with open(filename3,'w') as ff: ... print('Extra header', file=ff) ... print('head1 head2 head3 head4', file=ff) ... print('1.1 1.2 1.3 1.4', file=ff) ... print('2.1 2.2 2.3 2.4', file=ff) >>> print(fread(filename3, skip=2, hskip=1)) [[1.1 1.2 1.3 1.4] [2.1 2.2 2.3 2.4]] >>> print(fread(filename3, nc=2, skip=2, hskip=1, header=True)) [['head1', 'head2']] >>> # cname >>> print(fread(filename, cname='head2', skip=1, skip_blank=True, ... comment='#!', squeeze=True)) [1.2 2.2 3.2 4.2 5.2] >>> print(fread(filename, cname=['head1','head2'], skip=1, ... skip_blank=True, comment='#!')) [[1.1 1.2] [2.1 2.2] [3.1 3.2] [4.1 4.2] [5.1 5.2]] >>> print(fread(filename, cname=['head1','head2'], skip=1, skip_blank=True, ... comment='#!', header=True)) [['head1', 'head2']] >>> print(fread(filename, cname=['head1','head2'], skip=1, skip_blank=True, ... comment='#!', header=True, full_header=True)) ['head1 head2 head3 head4'] >>> print(fread(filename, cname=[' head1','head2'], skip=1, ... skip_blank=True, comment='#!', hstrip=False)) [[1.2] [2.2] [3.2] [4.2] [5.2]] Clean up doctest >>> import os >>> os.remove(filename) >>> os.remove(filename1) >>> os.remove(filename2) >>> os.remove(filename3) """ # nc=0 in fread and sread reads all columns if (nc == 0) and (cname is None): nc = -1 dat, sdat = fsread(infile, nc=nc, cname=cname, snc=0, sname=None, **kwargs) return dat
# --------------------------------------------------------------------
[docs]def sread(infile, nc=0, cname=None, snc=0, sname=None, fill_value='', sfill_value='', header=False, full_header=False, **kwargs): """ Read strings from a file into 2D string array Columns can be picked specifically by index or name. The header can be read separately with the (almost) same call as reading the strings. Parameters ---------- infile : str Source file name nc : int or iterable, optional Number of columns to be read as strings [default: all (*nc=0*)]. *nc* can be an int or a vector of column indexes, starting with 0. *nc<=0* reads all columns. *snc* takes precedence if *nc* and *snc* are set. cname : iterable of str, optional Columns for floats can be chosen by the values in the first header line; must be an iterable with strings. *sname* takes precedence if *cname* and *sname* are set. snc : int or iterable, optional Number of columns to be read as strings [default: all (*snc=0*)]. *snc* can be an int or a vector of column indexes, starting with 0. *snc<=0* reads all columns. *snc* takes precedence if *nc* and *snc* are set. sname : iterable of str, optional Columns for strings can be chosen by the values in the first header line; must be an iterable with strings. *sname* takes precedence if *cname* and *sname* are set. fill_value : str, optional Value to fill in string array in empty cells or if not enough columns in line and *fill==True* (default: ''). *sfill_value* takes precedence if *fill_value* and *sfill_value* are set. sfill_value : str, optional Value to fill in string array in empty cells or if not enough columns in line and *fill==True* (default: ''). *sfill_value* takes precedence if *fill_value* and *sfill_value* are set. fill_value : float, optional value to fill in array in empty cells or if not enough columns in line and `fill==True` (default: ''). header : bool, optional Return header strings instead of strings in rest of file. This allows to use (almost) the same call to get values and header: .. code-block:: python shead = sread(ifile, nc=2, header=True) sdata = sread(ifile, nc=2) date = sdata[:, head[0].index('Datetime')] full_header : bool, optional Header will be a list of the header lines if set. **kwargs : dict, optional All other keywords will be passed to `fsread`. Returns ------- array of strings Array of strings in file, or of header. Notes ----- If *header==True* then skip is counterintuitive because it is actually the number of header rows to be read. This is to be able to have the exact same call of the function, once with *header=False* and once with *header=True*. Blank lines are not filled but are taken as end of file if *fill=True*. Examples -------- Create some data >>> filename = 'test.dat' >>> with open(filename,'w') as ff: ... print('head1 head2 head3 head4', file=ff) ... print('1.1 1.2 1.3 1.4', file=ff) ... print('2.1 2.2 2.3 2.4', file=ff) Read sample file in different ways >>> # data >>> print(sread(filename, skip=1)) [['1.1' '1.2' '1.3' '1.4'] ['2.1' '2.2' '2.3' '2.4']] >>> print(sread(filename, skip=2, return_list=True)) [['2.1', '2.2', '2.3', '2.4']] >>> print(sread(filename, skip=2)) [['2.1' '2.2' '2.3' '2.4']] >>> print(sread(filename, skip=1, cskip=1)) [['1.2' '1.3' '1.4'] ['2.2' '2.3' '2.4']] >>> print(sread(filename, nc=2, skip=1, cskip=1)) [['1.2' '1.3'] ['2.2' '2.3']] >>> print(sread(filename, nc=[1,3], skip=1)) [['1.2' '1.4'] ['2.2' '2.4']] >>> print(sread(filename, nc=1, skip=1)) [['1.1'] ['2.1']] >>> print(sread(filename, nc=1, skip=1, squeeze=True)) ['1.1' '2.1'] >>> # header >>> print(sread(filename, nc=2, skip=1, header=True)) [['head1', 'head2']] >>> print(sread(filename, nc=2, skip=1, header=True, full_header=True)) ['head1 head2 head3 head4'] >>> print(sread(filename, nc=1, skip=2, header=True)) [['head1'], ['1.1']] >>> print(sread(filename, nc=1, skip=2, header=True, squeeze=True)) ['head1', '1.1'] >>> print(sread(filename, nc=1, skip=2, header=True, squeeze=True, ... strarr=True)) ['head1' '1.1'] >>> print(sread(filename, nc=1, skip=2, header=True, squeeze=True, ... transpose=True)) ['head1', '1.1'] Data with blank lines >>> with open(filename, 'a') as ff: ... print('', file=ff) ... print('3.1 3.2 3.3 3.4', file=ff) >>> print(sread(filename, skip=1)) [['1.1' '1.2' '1.3' '1.4'] ['2.1' '2.2' '2.3' '2.4']] >>> print(sread(filename, skip=1, skip_blank=True)) [['1.1' '1.2' '1.3' '1.4'] ['2.1' '2.2' '2.3' '2.4'] ['3.1' '3.2' '3.3' '3.4']] >>> print(sread(filename, skip=1)) [['1.1' '1.2' '1.3' '1.4'] ['2.1' '2.2' '2.3' '2.4']] >>> print(sread(filename, skip=1, transpose=True)) [['1.1' '2.1'] ['1.2' '2.2'] ['1.3' '2.3'] ['1.4' '2.4']] >>> print(sread(filename, skip=1, transpose=True)) [['1.1' '2.1'] ['1.2' '2.2'] ['1.3' '2.3'] ['1.4' '2.4']] Data with comment lines >>> with open(filename, 'a') as ff: ... print('# First comment', file=ff) ... print('! Second second comment', file=ff) ... print('4.1 4.2 4.3 4.4', file=ff) >>> print(sread(filename, skip=1)) [['1.1' '1.2' '1.3' '1.4'] ['2.1' '2.2' '2.3' '2.4']] >>> print(sread(filename, skip=1, skip_blank=True, comment='#')) [['1.1' '1.2' '1.3' '1.4'] ['2.1' '2.2' '2.3' '2.4'] ['3.1' '3.2' '3.3' '3.4'] ['!' 'Second' 'second' 'comment'] ['4.1' '4.2' '4.3' '4.4']] >>> print(sread(filename, skip=1, skip_blank=True, comment='#!')) [['1.1' '1.2' '1.3' '1.4'] ['2.1' '2.2' '2.3' '2.4'] ['3.1' '3.2' '3.3' '3.4'] ['4.1' '4.2' '4.3' '4.4']] >>> print(sread(filename, skip=1, skip_blank=True, comment=('#','!'))) [['1.1' '1.2' '1.3' '1.4'] ['2.1' '2.2' '2.3' '2.4'] ['3.1' '3.2' '3.3' '3.4'] ['4.1' '4.2' '4.3' '4.4']] >>> print(sread(filename, skip=1, skip_blank=True, comment=['#','!'])) [['1.1' '1.2' '1.3' '1.4'] ['2.1' '2.2' '2.3' '2.4'] ['3.1' '3.2' '3.3' '3.4'] ['4.1' '4.2' '4.3' '4.4']] Data with escaped numbers >>> filename2 = 'test2.dat' >>> with open(filename2,'w') as ff: ... print('"head1" "head2" "head3" "head4"', file=ff) ... print('"1.1" "1.2" "1.3" "1.4"', file=ff) ... print('2.1 nan Inf "NaN"', file=ff) >>> print(sread(filename2, skip=1, transpose=True, strip='"')) [['1.1' '2.1'] ['1.2' 'nan'] ['1.3' 'Inf'] ['1.4' 'NaN']] Data with an extra (shorter) header line >>> filename3 = 'test3.dat' >>> with open(filename3,'w') as ff: ... print('Extra header', file=ff) ... print('head1 head2 head3 head4', file=ff) ... print('1.1 1.2 1.3 1.4', file=ff) ... print('2.1 2.2 2.3 2.4', file=ff) >>> print(sread(filename3, skip=2, return_list=True)) [['1.1', '1.2', '1.3', '1.4'], ['2.1', '2.2', '2.3', '2.4']] >>> print(sread(filename3, skip=2, hskip=1)) [['1.1' '1.2' '1.3' '1.4'] ['2.1' '2.2' '2.3' '2.4']] >>> print(sread(filename3, nc=2, skip=2, hskip=1, header=True)) [['head1', 'head2']] Data with missing values >>> filename4 = 'test4.dat' >>> with open(filename4,'w') as ff: ... print('Extra header', file=ff) ... print('head1,head2,head3,head4', file=ff) ... print('1.1,1.2,1.3,1.4', file=ff) ... print('2.1,,2.3,2.4', file=ff) >>> print(sread(filename4, skip=2, return_list=True)) [['1.1', '1.2', '1.3', '1.4'], ['2.1', '', '2.3', '2.4']] >>> print(sread(filename4, skip=2, fill=True, fill_value='-1')) [['1.1' '1.2' '1.3' '1.4'] ['2.1' '-1' '2.3' '2.4']] >>> # cname >>> print(sread(filename, cname='head2', skip=1, skip_blank=True, ... comment='#!', squeeze=True)) ['1.2' '2.2' '3.2' '4.2'] >>> print(sread(filename, cname=['head1','head2'], skip=1, skip_blank=True, ... comment='#!')) [['1.1' '1.2'] ['2.1' '2.2'] ['3.1' '3.2'] ['4.1' '4.2']] >>> print(sread(filename, cname=['head1','head2'], skip=1, skip_blank=True, ... comment='#!', header=True)) [['head1', 'head2']] >>> print(sread(filename, cname=['head1','head2'], skip=1, skip_blank=True, ... comment='#!', header=True, full_header=True)) ['head1 head2 head3 head4'] >>> print(sread(filename, cname=[' head1','head2'], skip=1, ... skip_blank=True, comment='#!', hstrip=False)) [['1.2'] ['2.2'] ['3.2'] ['4.2']] Clean up doctest >>> import os >>> os.remove(filename) >>> os.remove(filename2) >>> os.remove(filename3) >>> os.remove(filename4) """ # string keywords overwrite float keywords if snc != 0: nc = snc if sname is not None: cname = sname if sfill_value: fill_value = sfill_value # nc=0 in fread and sread reads all columns if (nc == 0) and (cname is None): nc = -1 dat, sdat = fsread(infile, nc=0, cname=None, snc=nc, sname=cname, fill_value=np.nan, sfill_value=fill_value, header=header, full_header=full_header, **kwargs) if header and full_header: return dat else: return sdat
[docs]def xread(infile, sheet=None, nc=0, cname=None, snc=0, sname=None, skip=0, cskip=0, hskip=0, squeeze=False, fill=False, fill_value=np.nan, sfill_value='', strip=None, hstrip=True, header=False, full_header=False, transpose=False, strarr=False, return_list=False): """ Read numbers and strings from Excel file into 2D float and string arrays This routine is analog to fsread but for Excel files. Columns can be picked specifically by index or name. The header can be read separately with the (almost) same call as reading the numbers or string. Parameters ---------- infile : str Excel source file name sheet : str or int, optional Name or number of Excel sheet (default: first sheet) nc : int or iterable, optional Number of columns to be read as floats [default: none (*nc=0*)]. *nc* can be an int or a vector of column indexes, starting with 0. If *snc!=0*, then *nc* must be iterable, or -1 to read all other columns as floats. If both *nc* and *snc* are int, then first *snc* string columns will be read and then *nc* float columns will be read. cname : iterable of str, optional Columns for floats can be chosen by the values in the first header line; must be an iterable with strings. snc : int or iterable, optional Number of columns to be read as strings [default: none (*snc=0*)]. *snc* can be an int or a vector of column indexes, starting with 0. If *nc!=0*, then *snc* must be iterable, or -1 to read all other columns as strings. If both *nc* and *snc* are int, then first *snc* string columns will be read and then *nc* float columns will be read. sname : iterable of str, optional Columns for strings can be chosen by the values in the first header line; must be an iterable with strings. skip : int, optional Number of lines to skip at the beginning of file (default: 0) cskip : int, optional Number of columns to skip at the beginning of each line (default: 0) hskip : int, optional Number of lines in skip that do not belong to header (default: 0) squeeze : bool, optional If set to *True*, the 2-dim array will be cleaned of degenerated dimension, possibly resulting in a vector, otherwise output is always 2-dimensional. fill : bool, optional Fills in `fill_value` if True and not enough columns in input line, else raises ValueError (default). fill_value : float, optional Value to fill in float array in empty cells or if not enough columns in line and *fill==True* (default: numpy.nan). sfill_value : str, optional Value to fill in string array in empty cells or if not enough columns in line and *fill==True* (default: ''). strip : str, optional Strip float columns with *str.strip(strip)*. If *strip* is *None*, quotes " and ' are stripped from input fields (default), otherwise the character in *strip* is stripped from the input fields. *strip* has to be set explicitly to also strip string columns. If *strip* is set to *False* then nothing is stripped and reading is about 30% faster for text files. hstrip : bool, optional Strip header cells to match *cname* if True (default), else take header cells literally. header : bool, optional Return header strings instead of numbers/strings in rest of file. This allows to use (almost) the same call to get values and header: .. code-block:: python head, shead = xread(ifile, nc=1, snc=1, header=True) data, sdata = xread(ifile, nc=1, snc=1) temp = data[:, head[0].index('temp')] full_header : bool, optional Header will be a list of the header lines if set. transpose : bool, optional `fsread` reads in row-major format, i.e. the first dimension are the rows and second dimension are the columns *out(:nrow, :ncol)*. This will be transposed to column-major format *out(:ncol, :nrow)* if *transpose* is set. strarr : bool, optional Return header as numpy array rather than list. return_list : bool, optional Return lists rather than arrays. Returns ------- array of floats, array of strings First array is also string if *header==True*. The array of floats or of strings is replaced by an empty list if the output is not demanded, e.g. the array of float is set to [] if *nc=0*. Notes ----- If *header==True* then skip is counterintuitive because it is actually the number of header rows to be read. This is to be able to have the exact same call of the function, once with *header=False* and once with *header=True*. ``xread`` needs module :mod:`xlrd` for reading xls-files, and module :mod:`openpyxl` for reading xlsx-files. Raises IOError during read if relevant module is not installed. Examples -------- Using xlrd for xls files >>> filename = 'test_readexcel.xls' >>> dat, sdat = xread(filename, skip=1, nc=-1) >>> print(dat) [[1.1 1.2 1.3 1.4] [2.1 2.2 2.3 2.4] [3.1 3.2 3.3 3.4] [4.1 4.2 4.3 4.4]] >>> print(sdat) [] >>> dat, sdat = xread(filename, skip=1, nc=[2], squeeze=True) >>> print(dat) [1.3 2.3 3.3 4.3] >>> dat, sdat = xread(filename, skip=1, cname=['head1', 'head2']) >>> print(dat) [[1.1 1.2] [2.1 2.2] [3.1 3.2] [4.1 4.2]] >>> dat, sdat = xread(filename, sheet='Sheet3', nc=[1], snc=[0, 2], skip=1, ... squeeze=True) >>> print(dat) [1.2 2.2 3.2 4.2] >>> print(sdat) [['name1' 'name5'] ['name2' 'name6'] ['name3' 'name7'] ['name4' 'name8']] >>> dat, sdat = xread(filename, sheet=2, cname='head2', snc=[0, 2], skip=1, ... squeeze=True) >>> print(dat) [1.2 2.2 3.2 4.2] >>> print(sdat) [['name1' 'name5'] ['name2' 'name6'] ['name3' 'name7'] ['name4' 'name8']] >>> dat, sdat = xread(filename, sheet='Sheet2', cname=['head2', 'head4'], ... snc=[0, 2], skip=1, fill=True, fill_value=-9, ... sfill_value='-8') >>> print(dat) [[-9. 1.4] [ 2.2 2.4] [ 3.2 3.4] [ 4.2 4.4]] >>> print(sdat) [['1.1' '1.3'] ['2.1' '2.3'] ['3.1' '-8'] ['4.1' '4.3']] >>> dat, sdat = xread(filename, sheet='Sheet2', cname=['head2', 'head4'], ... snc=[0, 2], skip=1, header=True) >>> print(dat) [['head2', 'head4']] >>> print(sdat) [['head1', 'head3']] >>> dat, sdat = xread(filename, sheet='Sheet2', cname=['head2', 'head4'], ... snc=[0, 2], skip=1, header=True, squeeze=True) >>> print(dat) ['head2', 'head4'] >>> print(sdat) ['head1', 'head3'] >>> dat, sdat = xread(filename, sheet='Sheet2', nc=-1, skip=1, header=True) >>> print(dat) [['head1', 'head2', 'head3', 'head4']] >>> dat, sdat = xread(filename, sheet='Sheet2', cname=[' head2', 'head4'], ... snc=[0, 2], skip=1, fill=True, fill_value=-9, ... sfill_value='-8', hstrip=False) >>> print(dat) [[1.4] [2.4] [3.4] [4.4]] Using openpyxl for xlsx files >>> filename = 'test_readexcel.xlsx' >>> dat, sdat = xread(filename, skip=1, nc=-1) >>> print(dat) [[1.1 1.2 1.3 1.4] [2.1 2.2 2.3 2.4] [3.1 3.2 3.3 3.4] [4.1 4.2 4.3 4.4]] >>> print(sdat) [] >>> dat, sdat = xread(filename, skip=1, nc=[2], squeeze=True) >>> print(dat) [1.3 2.3 3.3 4.3] >>> dat, sdat = xread(filename, skip=1, cname=['head1', 'head2']) >>> print(dat) [[1.1 1.2] [2.1 2.2] [3.1 3.2] [4.1 4.2]] >>> dat, sdat = xread(filename, sheet='Sheet3', nc=[1], snc=[0, 2], skip=1, ... squeeze=True) >>> print(dat) [1.2 2.2 3.2 4.2] >>> print(sdat) [['name1' 'name5'] ['name2' 'name6'] ['name3' 'name7'] ['name4' 'name8']] >>> dat, sdat = xread(filename, sheet=2, cname='head2', snc=[0, 2], skip=1, ... squeeze=True) >>> print(dat) [1.2 2.2 3.2 4.2] >>> print(sdat) [['name1' 'name5'] ['name2' 'name6'] ['name3' 'name7'] ['name4' 'name8']] >>> dat, sdat = xread(filename, sheet='Sheet2', cname=['head2', 'head4'], ... snc=[0, 2], skip=1, fill=True, fill_value=-9, ... sfill_value='-8') >>> print(dat) [[-9. 1.4] [ 2.2 2.4] [ 3.2 3.4] [ 4.2 4.4]] >>> print(sdat) [['1.1' '1.3'] ['2.1' '2.3'] ['3.1' '-8'] ['4.1' '4.3']] >>> dat, sdat = xread(filename, sheet='Sheet2', cname=['head2', 'head4'], ... snc=[0, 2], skip=1, header=True) >>> print(dat) [['head2', 'head4']] >>> print(sdat) [['head1', 'head3']] >>> dat, sdat = xread(filename, sheet='Sheet2', cname=['head2', 'head4'], ... snc=[0, 2], skip=1, header=True, squeeze=True) >>> print(dat) ['head2', 'head4'] >>> print(sdat) ['head1', 'head3'] >>> dat, sdat = xread(filename, sheet='Sheet2', nc=-1, skip=1, header=True) >>> print(dat) [['head1', 'head2', 'head3', 'head4']] >>> print(sdat) [] >>> dat, sdat = xread(filename, sheet='Sheet2', cname=[' head2', 'head4'], ... snc=[0, 2], skip=1, fill=True, fill_value=-9, ... sfill_value='-8', hstrip=False) >>> print(dat) [[1.4] [2.4] [3.4] [4.4]] """ # Input error if isinstance(nc, int) and isinstance(snc, int): if (nc <= -1) and (snc <= -1): raise ValueError('nc and snc must be integer or list of indices;' ' < 0 means to read the rest of the columns.' ' nc and snc cannot both be < 0.') elif (nc == 0) and (snc == 0) and (cname is None) and (sname is None): raise ValueError('Either: nc != 0, snc != 0, cname is not None, or' ' sname is not None must be given.') # Open file try: import xlrd wb = xlrd.open_workbook(infile) ixls = True except (ModuleNotFoundError, xlrd.biffh.XLRDError): try: import openpyxl # wb = openpyxl.open(infile, read_only=True, data_only=True) wb = openpyxl.open(infile, data_only=True) ixls = False except ModuleNotFoundError: # pragma: no cover # too much hassle to test in special environment only for cover raise IOError('Cannot open file (1) ' + infile) except IOError: raise IOError('Cannot open file (2) ' + infile) # Get Sheet if sheet is None: if ixls: sh = wb.sheet_by_index(0) else: sh = wb[wb.sheetnames[0]] else: if type(sheet) is str: if ixls: sheetnames = wb.sheet_names() else: sheetnames = wb.sheetnames if sheet not in sheetnames: _close_file(wb, ixls=ixls) raise ValueError('Sheet ' + sheet + ' not in Excel file ' + infile) if ixls: sh = wb.sheet_by_name(sheet) else: sh = wb[sheet] else: if ixls: nsheets = wb.nsheets else: nsheets = len(wb.sheetnames) if sheet > nsheets: _close_file(wb, ixls=ixls) raise ValueError(f'Error extracting sheet {str(sheet)}.' f'Only {nsheets} sheets in Excel' f' file {infile}.') if ixls: sh = wb.sheet_by_index(sheet) else: sh = wb[wb.sheetnames[sheet]] rows = _xread_get_iter_rows(sh, ixls=ixls) # Read header and skip lines head = _xread_head(rows, skip, hskip) if ixls: ncol = sh.ncols nrow = sh.nrows - skip else: ncol = sh.max_column nrow = sh.max_row - skip # Read first row res = _xread_next_row(rows) nres = len(res) if not nres: # pragma: no cover # should not happen _close_file(wb, ixls=ixls) raise ValueError('No line to determine separator.') # Determine indices iinc, iisnc = _determine_indices(wb, head, nres, nc=nc, cname=cname, snc=snc, sname=sname, skip=skip, cskip=cskip, hskip=hskip, hstrip=hstrip, sep=None, ixls=ixls) aiinc = list(iinc) aiinc.extend(iisnc) miianc = max(aiinc) # Header if np.isfinite(fill_value): fval = str(fill_value) else: fval = 'NaN' if header: var, svar = _get_header( wb, head, None, iinc, iisnc, squeeze=squeeze, fill=fill, fill_value=fval, sfill_value=sfill_value, strip=strip, full_header=full_header, transpose=transpose, strarr=strarr, ixls=ixls) _close_file(wb, ixls=ixls) return var, svar # Values - first line if (miianc >= nres) and (not fill): # pragma: no cover # should not happen _close_file(wb, ixls=ixls) sres = ';'.join(res) raise ValueError('Line has not enough columns to index: ' + sres) var = list() svar = list() if iinc: null = _line2var(res, var, iinc, False) var[-1] = [ 'NaN' if iv == 'NA' else iv for iv in var[-1] ] if iisnc: null = _line2var(res, svar, iisnc, False if strip is None else strip) # Values - rest of file for iline in range(2, nrow+1): res = _xread_next_row(rows) nres = len(res) if (miianc >= nres) and (not fill): # pragma: no cover # should not happen _close_file(wb, ixls=ixls) sres = ';'.join(res) raise ValueError('Line has not enough columns to index: ' + sres) if iinc: null = _line2var(res, var, iinc, False) var[-1] = [ 'NaN' if iv == 'NA' else iv for iv in var[-1] ] if iisnc: null = _line2var(res, svar, iisnc, False if strip is None else strip) _close_file(wb, ixls=ixls) # Return correct shape and type if var: var = np.array(var, dtype=str) if fill: var = np.where((var == '') | (var == 'None'), fval, var) var = np.array(var, dtype=float) if squeeze: var = var.squeeze() if transpose: var = var.T if return_list: if var.ndim == 1: var = [ i for i in var ] else: var = [ [ var[i, j] for j in range(var.shape[1]) ] for i in range(var.shape[0]) ] if svar: svar = np.array(svar, dtype=str) if fill: svar = np.where((svar == '') | (svar == 'None'), sfill_value, svar) if squeeze: svar = svar.squeeze() if transpose: svar = svar.T if return_list: if svar.ndim == 1: svar = [ i for i in svar ] else: svar = [ [ svar[i, j] for j in range(svar.shape[1]) ] for i in range(svar.shape[0]) ] return var, svar
[docs]def xlsread(*args, **kwargs): """ Wrapper for :func:`xread` """ return xread(*args, **kwargs)
[docs]def xlsxread(*args, **kwargs): """ Wrapper for :func:`xread` """ return xread(*args, **kwargs)
# -------------------------------------------------------------------- if __name__ == '__main__': import doctest doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)