from functools import wraps
from time import time
def measure_time(func):
@wraps(func)
def _time_it(*args, **kwargs):
= int(round(time() * 1000))
start try:
return func(*args, **kwargs)
finally:
= int(round(time() * 1000)) - start
end_ print(f"Total execution time: {end_ if end_ > 0 else 0} ms")
return _time_it
Problem description
initial settings
measure_time decorator
read big excel file with pandas
= root_data+'/pandas-caching/big_excel_file.xlsx' big_excel_file
@measure_time
def load_excel(file):
= pd.read_excel(file)
dataframe return dataframe
= load_excel(big_excel_file) dataframe
Total execution time: 36196 ms
{% include alert.html text=“Waouh, 36 sec to read this file!” %}
read converted csv file (turned to csv from excel using excel)
= root_data+'/pandas-caching/big_csv_file_turned_from_excel.csv' csv_file
@measure_time
def load_csv(file):
= pd.read_csv(file, sep=';', decimal=',')
dataframe return dataframe
= load_csv(csv_file) df_csv
Total execution time: 836 ms
{% include alert.html text=“Much better, 0.8 sec!” %}
Caching library
import os
def read_CachedXLS(filename, forceReload = False, **options):
"""
Part d'un fichier excel natif (filename).
Si le dataframe caché correspondant n'existe pas encore, alors sauve le dataframe caché au format csv dans le rep source.
(s'il existe et si forceReload==True, alors écrase le dataframe caché existant par une nouvelle version)
Lit le dataframe caché correspondant avec les **options et retourne le dataframe.
Examples
--------
>>> filename = '/mnt/z/data/Stam-CC/ExportData 25625.xlsx'
forceReload = False
option={'dayfirst':True, 'parse_dates':['Fecha de Medida', 'Fecha de Fabricacion'], 'sheetname':0}
getCachedXLSRaw(filename, forceRelead, **option).info()
Parameters
----------
filename : string
Emplacement du fichier XLS. Avec l'extension. Format complet
Ex: '/mnt/z/data/Stam-CC/ExportData 25625.xlsx'
forceReload : boolean, optional, default value = False
Si forceReload == True, le fichier sera relu et sauvé même s'il existe déjà en cache
options : **keyword args, optional
Arguments de lecture du fichier XLS : https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
Ex: sheetname=1
Returns
-------
dataframe
Dataframe correspondant
"""
#split pour ne garder que le nom sans le chemin de filename : Stam-CC/ExportData 25625 --> ExportData 25625
= os.path.dirname(filename)+'/'+os.path.basename(filename)+'.csv'
dataframe_filename #bug de pandas.to_csv quand il y a des espaces ?
= dataframe_filename.replace(" ", "_")
dataframe_filename
=[]
dataframe= False
xls_toget
#print(dataframe_filename)
if (forceReload and os.path.exists(dataframe_filename)):
print("Cached file "+dataframe_filename+" déjà existant mais forceReload=True - FORCE RELOAD")
= True
xls_toget
if (not os.path.exists(dataframe_filename)):
print("Cached file "+dataframe_filename+" inexistant - read_CachedXLS")
= True
xls_toget
if (xls_toget):
= pd.read_excel(filename, **options)
dataframe
dataframe.to_csv(dataframe_filename)else:
print("Cached file "+dataframe_filename+" existe en cache, relecture")
#index_col pour ignorer les n° de lignes excel
'sep']=','
options['decimal']='.'
options['skiprows']=0
options['sheet_name')
options.pop(= pd.read_csv(dataframe_filename,**options)
dataframe return dataframe
={'sheet_name':0}
option**option)
read_CachedXLS(big_excel_file, print("et voila")
Cached file /mnt/z/data//pandas-caching/big_excel_file.xlsx.csv existe en cache, relecture
et voila