Benchmarking full, time-windowed and chunked file reading

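In this notebook, we benchmark how fast expelliarmus reads three event-file encodings (DAT, EVT2 and EVT3) and compare file size and read time against HDF5 (uncompressed, LZF and GZIP) and NumPy, for full-file, time-windowed and chunked reading.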

from expelliarmus import Wizard
import pathlib
import h5py
import numpy as np
import timeit
import requests
import pickle

# Gates the one-off setup work: downloading the sample recording and creating
# (and timing) the HDF5/NumPy reference files.
FIRST_RUN = True
# When True, the collected results are pickled to ./benchmark.pk at the end.
SAVE_RESULTS = True
# When True, results are loaded from ./benchmark.pk and the expelliarmus
# measurements are skipped.
LOAD_RESULTS = False
# Number of timed repetitions that are averaged for every measurement.
REPEAT = 10
def get_diff_perc_str(ref, val):
    """Format the relative difference of ``val`` with respect to ``ref``.

    Args:
        ref: Reference value the comparison is made against.
        val: Value being compared to ``ref``.

    Returns:
        A signed percentage string rounded to the nearest integer, e.g.
        ``"+50%"`` when ``val`` is 1.5 times ``ref`` and ``"-50%"`` when it is
        half of it. Equal values yield ``"-0%"``. When ``ref`` is zero the
        ratio is undefined and ``"n/a"`` is returned.
    """
    if ref == 0:
        # A zero reference (e.g. a result slot that was never measured) would
        # otherwise raise ZeroDivisionError while the table is being printed.
        return "n/a"
    if val > ref:
        return f"+{round((val/ref-1)*100)}%"
    return f"-{round((1-val/ref)*100)}%"
    
def get_fsize_MB(fpath):
    """Return the size of the file at ``fpath`` in MiB, rounded to an integer.

    ``fpath`` must provide ``stat()`` (e.g. a ``pathlib.Path``).
    """
    n_bytes = fpath.stat().st_size
    return round(n_bytes / (1024 * 1024))
fname = "driving_sample"


def get_fpath(encoding):
    """Build the file name of the sample recording for ``encoding``.

    DAT recordings get the ``.dat`` extension; every other encoding gets
    ``.raw``.
    """
    extension = "dat" if encoding == "dat" else "raw"
    return f"{fname}_{encoding}.{extension}"

if FIRST_RUN:
    # Download the EVT3 sample recording unless it is already on disk.
    print("Downloading EVT3 file... ", end="")
    evt3_path = pathlib.Path(get_fpath('evt3'))
    if not evt3_path.is_file():
        # Stream the response to disk in chunks instead of buffering the whole
        # recording in memory, and fail loudly on an HTTP error rather than
        # saving an error page as the dataset.
        partial_path = evt3_path.with_name(evt3_path.name + ".part")
        with requests.get(
            "https://dataset.prophesee.ai/index.php/s/nVcLLdWAnNzrmII/download",
            allow_redirects=True,
            stream=True,
            timeout=60,
        ) as r:
            r.raise_for_status()
            with open(partial_path, 'wb') as fp_out:
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    fp_out.write(chunk)
        # Rename only after a complete download, so that an interrupted run
        # does not leave a truncated file that the is_file() check accepts.
        partial_path.replace(evt3_path)
    print("done!")
Downloading EVT3 file... done!
# Reference formats compared against expelliarmus, and the encodings that
# expelliarmus itself is benchmarked on.
softwares = ("hdf5", "hdf5_lzf", "hdf5_gzip", "numpy")
encodings = ("dat", "evt2", "evt3")

print("Converting the EVT3 file to EVT2 and DAT formats... ", end="")
# Read the EVT3 recording once, then save the same array under the two other
# encodings.
wizard = Wizard(encoding="evt3")
evt3_arr = wizard.read(get_fpath('evt3'))
for target_encoding in ("dat", "evt2"):
    wizard.set_encoding(target_encoding)
    wizard.save(fpath=get_fpath(target_encoding), arr=evt3_arr)
print("done!")
Converting the EVT3 file to EVT2 and DAT formats... done!
# Result table, initialised to zero: file size plus one average read time per
# read mode. NumPy only has a full-read entry.
_METRICS = ("fsize", "full", "windowed", "chunked")
data = {
    "expelliarmus": {enc: dict.fromkeys(_METRICS, 0) for enc in ("dat", "evt2", "evt3")},
    "hdf5": dict.fromkeys(_METRICS, 0),
    "hdf5_lzf": dict.fromkeys(_METRICS, 0),
    "hdf5_gzip": dict.fromkeys(_METRICS, 0),
    "numpy": {"fsize": 0, "full": 0},
}
if LOAD_RESULTS:
    # Replace the zero-initialised table with previously saved measurements.
    # NOTE: pickle.load can execute arbitrary code; only load a benchmark.pk
    # that comes from a trusted source.
    with open("./benchmark.pk", "rb") as results_file:
        data = pickle.load(results_file)

print("Full file read")
for encoding in encodings:
    if LOAD_RESULTS:
        # Nothing is measured in this loop when results are loaded.
        continue

    fpath = pathlib.Path(get_fpath(encoding))
    wizard.set_encoding(encoding)
    wizard.set_file(fpath)
    if FIRST_RUN:
        # Untimed read; `arr` is later written to the HDF5 and NumPy
        # reference files.
        arr = wizard.read()

    exp_results = data["expelliarmus"][encoding]
    exp_results["fsize"] = get_fsize_MB(fpath)
    # Average duration of REPEAT single full-file reads.
    read_times = timeit.repeat(wizard.read, number=1, repeat=REPEAT)
    exp_results["full"] = sum(read_times)/REPEAT

# HDF5 formats.
if FIRST_RUN:
    # h5py compression filter for each HDF5 variant (None = uncompressed).
    hdf5_compression = dict(hdf5=None, hdf5_lzf="lzf", hdf5_gzip="gzip")
    for sw in softwares[:-1]:
        fpath = pathlib.Path(f"ref_{sw}.hdf5")
        # Write the reference array. The context manager closes the file even
        # when dataset creation or the copy raises.
        with h5py.File(fpath, "w") as fp:
            arr_hdf5 = fp.create_dataset("arr", arr.shape, arr.dtype, compression=hdf5_compression[sw])
            arr_hdf5[:] = arr[:]
        data[sw]["fsize"] = get_fsize_MB(fpath)
        # Time reading the whole dataset back, averaged over REPEAT runs.
        with h5py.File(fpath, "r") as fp:
            data[sw]["full"] = sum(timeit.repeat(lambda: fp["arr"][:], number=1, repeat=REPEAT))/REPEAT

    # NumPy
    fpath = pathlib.Path("ref_np.npy")
    np.save(fpath, arr, allow_pickle=False)
    data["numpy"]["fsize"] = get_fsize_MB(fpath)
    data["numpy"]["full"] = sum(timeit.repeat(lambda: np.load(fpath), number=1, repeat=REPEAT))/REPEAT
    
# Printing results.

def get_spacing(header, printed):
    """Return the padding that aligns a table cell with its column header.

    Args:
        header: Column title the cell is aligned to.
        printed: Cell value. Floats are measured as rendered with two
            decimals, which is how ``gen_row`` prints them; any other value
            is measured through ``str()``.

    Returns:
        A string of spaces such that value plus padding is one character
        wider than ``header``; empty when the value is already that wide.
    """
    if isinstance(printed, float):
        # Must match the ".2f" used when the value is printed, otherwise the
        # column ends up one character too narrow.
        rendered = f"{printed:.2f}"
    else:
        rendered = str(printed)
    return " " * (len(header) + 1 - len(rendered))
    
def gen_row(sw_name, size_value, time_value, mode):
    """Build one row of the results table.

    Args:
        sw_name: Label shown in the "Software" column.
        size_value: File size shown in the "Size [MB]" column.
        time_value: Read time shown, with two decimals, in the "Time [s]" column.
        mode: Key of the expelliarmus timing used as the reference
            (the notebook passes "full", "windowed" or "chunked").

    Returns:
        The row as a single string: every cell is padded against its column
        header and cells are separated by "| ". Size and time are each
        followed by their relative difference to the expelliarmus DAT, EVT2
        and EVT3 entries of the module-level ``data`` dictionary.
    """
    exp_results = data["expelliarmus"]
    ref_encodings = ("dat", "evt2", "evt3")
    size_refs = [exp_results[enc]["fsize"] for enc in ref_encodings]
    time_refs = [exp_results[enc][mode] for enc in ref_encodings]
    diff_headers = ("Diff. DAT", "Diff. EVT2", "Diff. EVT3")

    cells = [
        f'{sw_name}{get_spacing("Software ", sw_name)}',
        f'{size_value}{get_spacing("Size [MB]", size_value)}',
    ]
    for title, ref in zip(diff_headers, size_refs):
        diff = get_diff_perc_str(ref, size_value)
        cells.append(diff + get_spacing(title, diff))

    # The padding is derived from the numeric value, not from the rendered text.
    cells.append(f'{time_value:.2f}{get_spacing("Time [s]", time_value)}')
    for title, ref in zip(diff_headers, time_refs):
        diff = get_diff_perc_str(ref, time_value)
        cells.append(diff + get_spacing(title, diff))

    return "| ".join(cells)
                                                                                  
# Results table for the full-file read: header framed by rules, then one row
# (followed by a rule) per expelliarmus encoding and per reference format.
header = "Software  | Size [MB] | Diff. DAT | Diff. EVT2 | Diff. EVT3 | Time [s] | Diff. DAT | Diff. EVT2 | Diff. EVT3"
rule = "-"*len(header)
print(rule)
print(header)
print(rule)

for encoding in encodings:
    exp_results = data["expelliarmus"][encoding]
    print(gen_row(f"exp. {encoding.upper()}", exp_results["fsize"], exp_results["full"], "full"))
    print(rule)

for sw in softwares:
    sw_results = data[sw]
    print(gen_row(sw, sw_results["fsize"], sw_results["full"], "full"))
    print(rule)
Full file read
------------------------------------------------------------------------------------------------------------
Software  | Size [MB] | Diff. DAT | Diff. EVT2 | Diff. EVT3 | Time [s] | Diff. DAT | Diff. EVT2 | Diff. EVT3
------------------------------------------------------------------------------------------------------------
exp. DAT  | 851       | -0%       | +100%      | +143%      | 1.15    | -0%       | +43%       | -41%       
------------------------------------------------------------------------------------------------------------
exp. EVT2 | 426       | -50%      | -0%        | +22%       | 0.80    | -30%      | -0%        | -59%       
------------------------------------------------------------------------------------------------------------
exp. EVT3 | 350       | -59%      | -18%       | -0%        | 1.95    | +70%      | +144%      | -0%        
------------------------------------------------------------------------------------------------------------
hdf5      | 1701      | +100%     | +299%      | +386%      | 0.73    | -36%      | -8%        | -62%       
------------------------------------------------------------------------------------------------------------
hdf5_lzf  | 746       | -12%      | +75%       | +113%      | 3.09    | +170%     | +287%      | +58%       
------------------------------------------------------------------------------------------------------------
hdf5_gzip | 419       | -51%      | -2%        | +20%       | 5.60    | +389%     | +600%      | +187%      
------------------------------------------------------------------------------------------------------------
numpy     | 1701      | +100%     | +299%      | +386%      | 0.32    | -72%      | -60%       | -84%       
------------------------------------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import matplotlib as mpl
plt.style.use('classic')
%matplotlib inline
# Create the figure directly. Calling plt.clf() first would create an extra,
# empty default-sized figure that gets displayed next to the real plot.
fig, ax = plt.subplots(figsize=(12, 8), dpi=120)

ax.set_xlabel("File size [MB]")
ax.set_ylabel("Read time [s]")
ax.set_title("DAT, EVT2 and EVT3 encodings (whole file)")

# Colours and markers shared by all the scatter plots of the notebook.
exp_colors = dict(dat="gold", evt2="orange", evt3="tomato")
sw_colors = dict(hdf5="royalblue", hdf5_lzf="limegreen", hdf5_gzip="mediumorchid", numpy="hotpink")
sw_markers = dict(hdf5=">", hdf5_lzf="^", hdf5_gzip="v", numpy="*")

# Expelliarmus
for encoding in encodings:
    ax.scatter(data["expelliarmus"][encoding]["fsize"], data["expelliarmus"][encoding]["full"], marker="o", s=240, color=exp_colors[encoding], label=f"expelliarmus - {encoding.upper()}")

# Other softwares.
for sw in softwares:
    ax.scatter(data[sw]["fsize"], data[sw]["full"], marker=sw_markers[sw], s=240, color=sw_colors[sw], label=sw)

ax.legend()
plt.show()
<Figure size 432x288 with 0 Axes>
../_images/86418cfe7dbaa8d2df459ecdad98119ef8e32699c437576e43895f85dd72bcb0.png
TIME_WINDOW = 20

print("Time windowing read")
# Everything below is a measurement, so it is skipped as a whole when the
# results are loaded from disk (the HDF5 part used to run unconditionally and
# failed on the undefined time_window_length in that case).
if not LOAD_RESULTS:
    # Length of the array read from the EVT3 file, used to slice the HDF5 datasets.
    arr_len = len(wizard.read(get_fpath("evt3")))
    for encoding in encodings:
        fpath = pathlib.Path(get_fpath(encoding))
        wizard.set_encoding(encoding)
        wizard.set_file(fpath)
        wizard.set_time_window(TIME_WINDOW)
        # Length of the first window; the value left by the last encoding is
        # reused below as the fixed slice length for the HDF5 reads.
        time_window_length = len(next(wizard.read_time_window()))
        def fn():
            wizard.reset()
            return [window for window in wizard.read_time_window()]

        data["expelliarmus"][encoding]["windowed"] = sum(timeit.repeat(fn, number=1, repeat=REPEAT))/REPEAT

    # HDF5 formats.
    for sw in softwares[:-1]:
        fpath = pathlib.Path(f"ref_{sw}.hdf5")
        with h5py.File(fpath, "r") as fp:
            def read_windows():
                # Step through the whole dataset, including the trailing
                # partial window that arr_len//time_window_length would drop.
                return [fp["arr"][start:min(arr_len, start+time_window_length)] for start in range(0, arr_len, time_window_length)]

            data[sw]["windowed"] = sum(timeit.repeat(read_windows, number=1, repeat=REPEAT))/REPEAT

# Results table for the time-windowed read. NumPy is left out: it has no
# "windowed" measurement.
header = "Software  | Size [MB] | Diff. DAT | Diff. EVT2 | Diff. EVT3 | Time [s] | Diff. DAT | Diff. EVT2 | Diff. EVT3"
rule = "-"*len(header)
print(rule)
print(header)
print(rule)

for encoding in encodings:
    exp_results = data["expelliarmus"][encoding]
    print(gen_row(f"exp. {encoding.upper()}", exp_results["fsize"], exp_results["windowed"], "windowed"))
    print(rule)

for sw in softwares[:-1]:
    sw_results = data[sw]
    print(gen_row(sw, sw_results["fsize"], sw_results["windowed"], "windowed"))
    print(rule)
Time windowing read
------------------------------------------------------------------------------------------------------------
Software  | Size [MB] | Diff. DAT | Diff. EVT2 | Diff. EVT3 | Time [s] | Diff. DAT | Diff. EVT2 | Diff. EVT3
------------------------------------------------------------------------------------------------------------
exp. DAT  | 851       | -0%       | +100%      | +143%      | 1.58    | -0%       | +4%        | -39%       
------------------------------------------------------------------------------------------------------------
exp. EVT2 | 426       | -50%      | -0%        | +22%       | 1.51    | -4%       | -0%        | -42%       
------------------------------------------------------------------------------------------------------------
exp. EVT3 | 350       | -59%      | -18%       | -0%        | 2.58    | +64%      | +71%       | -0%        
------------------------------------------------------------------------------------------------------------
hdf5      | 1701      | +100%     | +299%      | +386%      | 1.02    | -35%      | -32%       | -60%       
------------------------------------------------------------------------------------------------------------
hdf5_lzf  | 746       | -12%      | +75%       | +113%      | 3.82    | +143%     | +153%      | +48%       
------------------------------------------------------------------------------------------------------------
hdf5_gzip | 419       | -51%      | -2%        | +20%       | 6.88    | +337%     | +355%      | +166%      
------------------------------------------------------------------------------------------------------------
# Create the figure directly. Calling plt.clf() first would create an extra,
# empty default-sized figure that gets displayed next to the real plot.
fig, ax = plt.subplots(figsize=(12, 8), dpi=120)

ax.set_xlabel("File size [MB]")
ax.set_ylabel("Read time [s]")
ax.set_title("DAT, EVT2 and EVT3 encodings (time-windowed file)")

# Expelliarmus
for encoding in encodings:
    ax.scatter(data["expelliarmus"][encoding]["fsize"], data["expelliarmus"][encoding]["windowed"], marker="o", s=240, color=exp_colors[encoding], label=f"expelliarmus - {encoding.upper()}")

# Other softwares (NumPy has no windowed measurement).
for sw in softwares[:-1]:
    ax.scatter(data[sw]["fsize"], data[sw]["windowed"], marker=sw_markers[sw], s=240, color=sw_colors[sw], label=sw)

ax.legend()
plt.show()
<Figure size 432x288 with 0 Axes>
../_images/780640e647b627f7cd9dc2135d676dd09ea9600b329adc78c75ab9ea23b71150.png
CHUNK_LEN = 8192

print("Chunk read")
# Everything below is a measurement, so it is skipped as a whole when the
# results are loaded from disk; otherwise the HDF5 timings would overwrite
# the loaded values.
if not LOAD_RESULTS:
    # Length of the array read from the EVT3 file, used to slice the HDF5 datasets.
    arr_len = len(wizard.read(get_fpath("evt3")))
    for encoding in encodings:
        fpath = pathlib.Path(get_fpath(encoding))
        wizard.set_encoding(encoding)
        wizard.set_file(fpath)
        wizard.set_chunk_size(CHUNK_LEN)
        def fn():
            wizard.reset()
            return [chunk for chunk in wizard.read_chunk()]

        data["expelliarmus"][encoding]["chunked"] = sum(timeit.repeat(fn, number=1, repeat=REPEAT))/REPEAT

    # HDF5 formats.
    for sw in softwares[:-1]:
        fpath = pathlib.Path(f"ref_{sw}.hdf5")
        with h5py.File(fpath, "r") as fp:
            def read_chunks():
                # Step through the whole dataset, including the trailing
                # partial chunk that arr_len//CHUNK_LEN would drop.
                return [fp["arr"][start:min(arr_len, start+CHUNK_LEN)] for start in range(0, arr_len, CHUNK_LEN)]

            data[sw]["chunked"] = sum(timeit.repeat(read_chunks, number=1, repeat=REPEAT))/REPEAT

# Results table for the chunked read. NumPy is left out: it has no "chunked"
# measurement.
header = "Software  | Size [MB] | Diff. DAT | Diff. EVT2 | Diff. EVT3 | Time [s] | Diff. DAT | Diff. EVT2 | Diff. EVT3"
rule = "-"*len(header)
print(rule)
print(header)
print(rule)

for encoding in encodings:
    exp_results = data["expelliarmus"][encoding]
    print(gen_row(f"exp. {encoding.upper()}", exp_results["fsize"], exp_results["chunked"], "chunked"))
    print(rule)

for sw in softwares[:-1]:
    sw_results = data[sw]
    print(gen_row(sw, sw_results["fsize"], sw_results["chunked"], "chunked"))
    print(rule)
Chunk read
------------------------------------------------------------------------------------------------------------
Software  | Size [MB] | Diff. DAT | Diff. EVT2 | Diff. EVT3 | Time [s] | Diff. DAT | Diff. EVT2 | Diff. EVT3
------------------------------------------------------------------------------------------------------------
exp. DAT  | 851       | -0%       | +100%      | +143%      | 1.64    | -0%       | +3%        | -22%       
------------------------------------------------------------------------------------------------------------
exp. EVT2 | 426       | -50%      | -0%        | +22%       | 1.58    | -3%       | -0%        | -24%       
------------------------------------------------------------------------------------------------------------
exp. EVT3 | 350       | -59%      | -18%       | -0%        | 2.09    | +28%      | +32%       | -0%        
------------------------------------------------------------------------------------------------------------
hdf5      | 1701      | +100%     | +299%      | +386%      | 4.20    | +157%     | +166%      | +101%      
------------------------------------------------------------------------------------------------------------
hdf5_lzf  | 746       | -12%      | +75%       | +113%      | 10.36   | +534%     | +555%      | +395%      
------------------------------------------------------------------------------------------------------------
hdf5_gzip | 419       | -51%      | -2%        | +20%       | 17.23   | +954%     | +989%      | +724%      
------------------------------------------------------------------------------------------------------------
# Create the figure directly. Calling plt.clf() first would create an extra,
# empty default-sized figure that gets displayed next to the real plot.
fig, ax = plt.subplots(figsize=(12, 8), dpi=120)

ax.set_xlabel("File size [MB]")
ax.set_ylabel("Read time [s]")
ax.set_title("DAT, EVT2 and EVT3 encodings (chunked file)")

# Expelliarmus
for encoding in encodings:
    ax.scatter(data["expelliarmus"][encoding]["fsize"], data["expelliarmus"][encoding]["chunked"], marker="o", s=240, color=exp_colors[encoding], label=f"expelliarmus - {encoding.upper()}")

# Other softwares (NumPy has no chunked measurement).
for sw in softwares[:-1]:
    ax.scatter(data[sw]["fsize"], data[sw]["chunked"], marker=sw_markers[sw], s=240, color=sw_colors[sw], label=sw)

ax.legend()
plt.show()
<Figure size 432x288 with 0 Axes>
../_images/274b2d887445d51db31d147c5922f67d72335141e2a2b8c4d9100b8f479c92d4.png
if SAVE_RESULTS:
    # Persist the result table. The context manager closes (and thereby
    # flushes) the file as soon as the dump is complete.
    with open("./benchmark.pk", "wb") as results_file:
        pickle.dump(data, results_file)