Fast read Excel files with pandas

my way to fast read big Excel files with Pandas using cvs cache
pandas
Published

September 14, 2020

Problem description

initial settings

measure_time decorator

from functools import wraps
from time import time
def measure_time(func):
    @wraps(func)
    def _time_it(*args, **kwargs):
        start = int(round(time() * 1000))
        try:
            return func(*args, **kwargs)
        finally:
            end_ = int(round(time() * 1000)) - start
            print(f"Total execution time: {end_ if end_ > 0 else 0} ms")
    return _time_it

read big excel file with pandas

big_excel_file = root_data+'/pandas-caching/big_excel_file.xlsx'
@measure_time
def load_excel(file):
    dataframe = pd.read_excel(file)
    return dataframe
dataframe = load_excel(big_excel_file)
Total execution time: 36196 ms

{% include alert.html text=“Waouh, 36 sec to read this file!” %}

read converted csv file (turned to csv from excel using excel)

csv_file = root_data+'/pandas-caching/big_csv_file_turned_from_excel.csv'
@measure_time
def load_csv(file):
    dataframe = pd.read_csv(file, sep=';', decimal=',')
    return dataframe
df_csv = load_csv(csv_file)
Total execution time: 836 ms

{% include alert.html text=“Much better, 0.8 sec!” %}

Caching library

import os

def read_CachedXLS(filename, forceReload = False, **options):
    """
    Part d'un fichier excel natif (filename).
    Si le dataframe caché correspondant n'existe pas encore, alors sauve le dataframe caché au format csv dans le rep source.
    (s'il existe et si forceReload==True, alors écrase le dataframe caché existant par une nouvelle version)
    Lit le dataframe caché correspondant avec les **options et retourne le dataframe.
    
    Examples
    --------
    >>> filename = '/mnt/z/data/Stam-CC/ExportData 25625.xlsx'
        forceReload = False
        option={'dayfirst':True, 'parse_dates':['Fecha de Medida', 'Fecha de Fabricacion'], 'sheetname':0}
        getCachedXLSRaw(filename, forceRelead, **option).info()

    Parameters
    ----------
    filename : string
        Emplacement du fichier XLS. Avec l'extension. Format complet
        Ex: '/mnt/z/data/Stam-CC/ExportData 25625.xlsx'
    forceReload : boolean, optional, default value = False
        Si forceReload == True, le fichier sera relu et sauvé même s'il existe déjà en cache
    options : **keyword args, optional
        Arguments de lecture du fichier XLS :  https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
        Ex: sheetname=1

    Returns
    -------
    dataframe
        Dataframe correspondant 
    """
    
    #split pour ne garder que le nom sans le chemin de filename : Stam-CC/ExportData 25625 --> ExportData 25625
    dataframe_filename = os.path.dirname(filename)+'/'+os.path.basename(filename)+'.csv'
    #bug de pandas.to_csv quand il y a des espaces ?
    dataframe_filename = dataframe_filename.replace(" ", "_")

    dataframe=[]
    xls_toget = False
    
    #print(dataframe_filename)
    if (forceReload and os.path.exists(dataframe_filename)):
        print("Cached file "+dataframe_filename+" déjà existant mais forceReload=True - FORCE RELOAD")
        xls_toget = True
        
    if (not os.path.exists(dataframe_filename)):
        print("Cached file  "+dataframe_filename+" inexistant - read_CachedXLS")
        xls_toget = True
        
    if (xls_toget):
        dataframe = pd.read_excel(filename, **options)
        dataframe.to_csv(dataframe_filename)
    else:
        print("Cached file "+dataframe_filename+" existe en cache, relecture")
    
    #index_col pour ignorer les n° de lignes excel 
    options['sep']=','
    options['decimal']='.'
    options['skiprows']=0
    options.pop('sheet_name')
    dataframe = pd.read_csv(dataframe_filename,**options)
    return dataframe
option={'sheet_name':0}
read_CachedXLS(big_excel_file, **option)
print("et voila")
Cached file /mnt/z/data//pandas-caching/big_excel_file.xlsx.csv existe en cache, relecture
et voila