I have a very large DataFrame that needs to be saved in the hdf5 file. Generate_data() is the function of generating data. In the read_unique_date() function, store.df.date.drop_duplicates() needs to be executed for 5 seconds. Also after using mysql, execute select distinct date from table only takes 80ms.
How can I create and use an index in the HDF5 file?
My code is as follows:
import pandas as pd
import numpy as np
from itertools import product
from time import time
def generate_data():
np.random.seed(202108)
date = pd.date_range(start="19900101", end="20210723", freq="D")
date = pd.DataFrame(date, columns=["date"])
code = pd.DataFrame(range(5000), columns=["code"])
# generate product of the two columns:
df = pd.DataFrame(product(date["date"], code["code"]), columns=["date", "code"])
df['data'] = np.random.random(len(df))
return df
def save_data(filename, df):
store = pd.HDFStore(filename)
store['df'] = df
store.close()
return
def read_unique_date(file_name):
store = pd.HDFStore(file_name)
start = time()
df = store.df.date.drop_duplicates()
store.close()
stop = time()
print(stop - start)
return df
def main():
path = 'd:\\'
file = 'large data.h5'
file_name = path + file
df = generate_data()
save_data(file_name, df)
df1 = read_unique_date(file_name)
return
if __name__ == '__main__':
main()
from How to create and use indexes for HDF5 files?
No comments:
Post a Comment