diff --git a/benchmarks/pandas/bench_at_iat.py b/benchmarks/pandas/bench_at_iat.py new file mode 100644 index 00000000..662c5e43 --- /dev/null +++ b/benchmarks/pandas/bench_at_iat.py @@ -0,0 +1,37 @@ +"""Benchmark: Series.at, Series.iat, DataFrame.at, DataFrame.iat — fast scalar access""" +import json +import time +import pandas as pd + +N = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +labels = [f"r{i}" for i in range(N)] +values = [i * 1.5 for i in range(N)] + +s = pd.Series(values, index=labels) +df = pd.DataFrame({"a": values, "b": [v * 2 for v in values]}, index=labels) + +mid_label = f"r{N // 2}" + +for _ in range(WARMUP): + _ = s.at[mid_label] + _ = s.iat[N // 2] + _ = df.at[mid_label, "a"] + _ = df.iat[N // 2, 0] + +start = time.perf_counter() +for _ in range(ITERATIONS): + _ = s.at[mid_label] + _ = s.iat[N // 2] + _ = df.at[mid_label, "a"] + _ = df.iat[N // 2, 0] +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "at_iat", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_autocorr.py b/benchmarks/pandas/bench_autocorr.py new file mode 100644 index 00000000..ee5c00e0 --- /dev/null +++ b/benchmarks/pandas/bench_autocorr.py @@ -0,0 +1,37 @@ +""" +Benchmark: Series.autocorr(lag) — lag-N autocorrelation for a 100k-element numeric Series. + +Mirrors tsb autoCorr. +Benchmarks lag=1, lag=5, and lag=20. 
+Outputs JSON: {"function": "autocorr", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import math +import time +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +data = [math.sin(i * 0.05) + (i % 7) * 0.01 for i in range(SIZE)] +s = pd.Series(data) + +for _ in range(WARMUP): + s.autocorr(lag=1) + s.autocorr(lag=5) + s.autocorr(lag=20) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.autocorr(lag=1) + s.autocorr(lag=5) + s.autocorr(lag=20) +total_ms = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "autocorr", + "mean_ms": total_ms / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total_ms, +})) diff --git a/benchmarks/pandas/bench_convert_dtypes.py b/benchmarks/pandas/bench_convert_dtypes.py new file mode 100644 index 00000000..543fa870 --- /dev/null +++ b/benchmarks/pandas/bench_convert_dtypes.py @@ -0,0 +1,50 @@ +""" +Benchmark: pandas Series.convert_dtypes() and DataFrame.convert_dtypes() + +Creates a 50k-row dataset with object-dtype numeric, boolean, and string +columns, then measures how fast pandas can infer and convert to best dtypes. 
+""" +import json +import time +import numpy as np +import pandas as pd + +N = 50_000 +WARMUP = 3 +ITERATIONS = 20 + +# Object-dtype arrays (same structure as the TypeScript version) +int_data = [None if i % 17 == 0 else i for i in range(N)] +float_data = [None if i % 13 == 0 else i * 1.5 for i in range(N)] +str_data = [None if i % 11 == 0 else f"str_{i}" for i in range(N)] +bool_data = [None if i % 7 == 0 else (i % 2 == 0) for i in range(N)] + +int_series = pd.Series(int_data, dtype=object) +float_series = pd.Series(float_data, dtype=object) + +df = pd.DataFrame({ + "int_col": int_data, + "float_col": float_data, + "str_col": str_data, + "bool_col": bool_data, +}) + +# Warm-up +for _ in range(WARMUP): + int_series.convert_dtypes() + float_series.convert_dtypes() + df.convert_dtypes() + +start = time.perf_counter() +for _ in range(ITERATIONS): + int_series.convert_dtypes() + float_series.convert_dtypes() + df.convert_dtypes() +total_ms = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "convert_dtypes", + "mean_ms": total_ms / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total_ms, +})) diff --git a/benchmarks/pandas/bench_cross_join.py b/benchmarks/pandas/bench_cross_join.py new file mode 100644 index 00000000..ad1de45b --- /dev/null +++ b/benchmarks/pandas/bench_cross_join.py @@ -0,0 +1,32 @@ +"""Benchmark: cross_join — Cartesian product of two 300-row DataFrames (90k result rows)""" +import json +import time +import pandas as pd + +N = 300 +WARMUP = 3 +ITERATIONS = 10 + +left = pd.DataFrame({ + "id_a": list(range(N)), + "val_a": [i * 1.5 for i in range(N)], +}) +right = pd.DataFrame({ + "id_b": list(range(N)), + "val_b": [i * 2.5 for i in range(N)], +}) + +for _ in range(WARMUP): + pd.merge(left, right, how="cross") + +start = time.perf_counter() +for _ in range(ITERATIONS): + pd.merge(left, right, how="cross") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "cross_join", + "mean_ms": total / 
ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_cut_bins_to_frame.py b/benchmarks/pandas/bench_cut_bins_to_frame.py new file mode 100644 index 00000000..5ae5908c --- /dev/null +++ b/benchmarks/pandas/bench_cut_bins_to_frame.py @@ -0,0 +1,56 @@ +"""Benchmark: cut_bins_to_frame — pd.cut with value_counts and bin summary on 100k rows.""" +import json, time +import numpy as np +import pandas as pd + +SIZE = 100_000 +NUM_BINS = 20 +WARMUP = 5 +ITERATIONS = 50 + +data = np.array([(i % 1000) * 0.1 for i in range(SIZE)]) + +for _ in range(WARMUP): + # pandas equivalent of cutBinsToFrame: cut + value_counts on the categorical result + cut_result = pd.cut(data, NUM_BINS) + # Summary DataFrame equivalent to cutBinsToFrame + counts = cut_result.value_counts(sort=False) + summary = pd.DataFrame({ + "bin": counts.index.astype(str), + "left": [iv.left for iv in counts.index], + "right": [iv.right for iv in counts.index], + "count": counts.values, + "frequency": counts.values / len(data), + }) + # cutBinCounts equivalent: counts dict + count_dict = dict(zip(counts.index.astype(str), counts.values)) + # binEdges equivalent: DataFrame of interval edges + edges = pd.DataFrame({ + "left": [iv.left for iv in counts.index], + "right": [iv.right for iv in counts.index], + }) + +start = time.perf_counter() +for _ in range(ITERATIONS): + cut_result = pd.cut(data, NUM_BINS) + counts = cut_result.value_counts(sort=False) + summary = pd.DataFrame({ + "bin": counts.index.astype(str), + "left": [iv.left for iv in counts.index], + "right": [iv.right for iv in counts.index], + "count": counts.values, + "frequency": counts.values / len(data), + }) + count_dict = dict(zip(counts.index.astype(str), counts.values)) + edges = pd.DataFrame({ + "left": [iv.left for iv in counts.index], + "right": [iv.right for iv in counts.index], + }) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "cut_bins_to_frame", + "mean_ms": 
total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_dataframe_compare_pair.py b/benchmarks/pandas/bench_dataframe_compare_pair.py new file mode 100644 index 00000000..4dd28ff4 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_compare_pair.py @@ -0,0 +1,50 @@ +""" +Benchmark: DataFrame-to-DataFrame element-wise comparisons. + +The existing dataframe_compare benchmark tests scalar comparisons only. +This tests df1.eq(df2), df1.ne(df2), df1.gt(df2), df1.le(df2) (DataFrame vs DataFrame). +Mirrors tsb dataFrameEq(df1, df2), dataFrameNe, dataFrameGt, dataFrameLe. + +Outputs JSON: {"function": "dataframe_compare_pair", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 50_000 +WARMUP = 5 +ITERATIONS = 50 + +df1 = pd.DataFrame({ + "a": np.array([(i * 1.7) % 1000 for i in range(SIZE)]), + "b": np.array([(i * 2.3) % 1000 for i in range(SIZE)]), + "c": np.array([i % 100 for i in range(SIZE)]), +}) + +df2 = pd.DataFrame({ + "a": np.array([(i * 2.1) % 1000 for i in range(SIZE)]), + "b": np.array([(i * 1.9) % 1000 for i in range(SIZE)]), + "c": np.array([(i + 7) % 100 for i in range(SIZE)]), +}) + +for _ in range(WARMUP): + df1.eq(df2) + df1.ne(df2) + df1.gt(df2) + df1.le(df2) + +start = time.perf_counter() +for _ in range(ITERATIONS): + df1.eq(df2) + df1.ne(df2) + df1.gt(df2) + df1.le(df2) +total_ms = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dataframe_compare_pair", + "mean_ms": total_ms / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total_ms, +})) diff --git a/benchmarks/pandas/bench_dataframe_itertuples.py b/benchmarks/pandas/bench_dataframe_itertuples.py new file mode 100644 index 00000000..18ac5108 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_itertuples.py @@ -0,0 +1,29 @@ +"""Benchmark: DataFrame.itertuples() — iterate over rows as namedtuples.""" +import time +import pandas as 
pd + +ROWS = 1_000 +WARMUP = 5 +ITERATIONS = 50 + +df = pd.DataFrame({ + "x": [i * 1.5 for i in range(ROWS)], + "y": [i * 2.5 for i in range(ROWS)], + "z": [i * 3.5 for i in range(ROWS)], +}) + +for _ in range(WARMUP): + for _row in df.itertuples(): + pass + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + for _row in df.itertuples(): + pass + times.append(time.perf_counter() - t0) + +total = sum(times) +mean_ms = (total / ITERATIONS) * 1000 +total_ms = total * 1000 +print(f'{{"function": "dataframe_itertuples", "mean_ms": {mean_ms:.6f}, "iterations": {ITERATIONS}, "total_ms": {total_ms:.6f}}}') diff --git a/benchmarks/pandas/bench_dataframe_transform_named.py b/benchmarks/pandas/bench_dataframe_transform_named.py new file mode 100644 index 00000000..045650e9 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_transform_named.py @@ -0,0 +1,40 @@ +""" +Benchmark: pandas DataFrame.transform() with named aggregation strings. + +Mirrors tsb dataFrameTransform with string names like "mean", "cumsum", +and ["sum", "mean"] applied column-wise. + +Uses 10k-row DataFrame to match the TypeScript benchmark. 
+""" +import json +import time +import pandas as pd + +ROWS = 10_000 +WARMUP = 3 +ITERATIONS = 20 + +a = [(i % 100) * 1.5 + 1 for i in range(ROWS)] +b = [((i * 3) % 200) * 0.5 + 2 for i in range(ROWS)] +c = [((i * 7) % 50) * 2.0 + 0.5 for i in range(ROWS)] +df = pd.DataFrame({"a": a, "b": b, "c": c}) + +# Warm-up +for _ in range(WARMUP): + df.transform("mean") + df.transform("cumsum") + df.transform(["sum", "mean"]) + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.transform("mean") + df.transform("cumsum") + df.transform(["sum", "mean"]) +total_ms = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dataframe_transform_named", + "mean_ms": total_ms / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total_ms, +})) diff --git a/benchmarks/pandas/bench_dataframe_update.py b/benchmarks/pandas/bench_dataframe_update.py new file mode 100644 index 00000000..cea97283 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_update.py @@ -0,0 +1,48 @@ +""" +Benchmark: DataFrame.update() — in-place-style DataFrame value update. + +Mirrors tsb dataFrameUpdate. +Overwrites non-null values from `other` into `self`. +Outputs JSON: {"function": "dataframe_update", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" + +import json +import time + +import numpy as np +import pandas as pd + +N = 10_000 +WARMUP = 20 +ITERATIONS = 200 + +# Build two DataFrames; `other` has NaN in ~2/3 of rows (so 1/3 rows are updated). 
+a_data = [i * 1.0 for i in range(N)] +b_data = [i * 2.0 for i in range(N)] +a_other = [i * 10.0 if i % 3 == 0 else np.nan for i in range(N)] +b_other = [i * 20.0 if i % 3 == 0 else np.nan for i in range(N)] + +df = pd.DataFrame({"a": a_data, "b": b_data}) +other = pd.DataFrame({"a": a_other, "b": b_other}) + +# Warm-up +for _ in range(WARMUP): + dc = df.copy() + dc.update(other) + +start = time.perf_counter() +for _ in range(ITERATIONS): + dc = df.copy() + dc.update(other) +total_ms = (time.perf_counter() - start) * 1000 + +print( + json.dumps( + { + "function": "dataframe_update", + "mean_ms": total_ms / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total_ms, + } + ) +) diff --git a/benchmarks/pandas/bench_errors.py b/benchmarks/pandas/bench_errors.py new file mode 100644 index 00000000..b5a28b35 --- /dev/null +++ b/benchmarks/pandas/bench_errors.py @@ -0,0 +1,55 @@ +"""Benchmark: pd.errors namespace — instantiate and inspect pandas-compatible error classes. + +Mirrors tsb's errors namespace: create error instances, check isinstance, .name and .message. 
+""" +import json +import time +import pandas.errors as pd_errors + +WARMUP = 5 +ITERATIONS = 200 + + +def _run(): + e1 = ValueError("bad value") + e2 = KeyError("missing key") + e3 = pd_errors.MergeError("incompatible merge") + e4 = pd_errors.EmptyDataError("no data") + e5 = pd_errors.OptionError("unknown option") + e6 = pd_errors.IntCastingNaNError() + e7 = pd_errors.UnsortedIndexError("MultiIndex slicing requires the index to be lexsorted") + e8 = pd_errors.ParserError("unexpected token") + e9 = pd_errors.PerformanceWarning("slow path") + e10 = pd_errors.InvalidIndexError("bad index") + + _a = isinstance(e1, ValueError) + _b = isinstance(e2, KeyError) + _c = isinstance(e3, Exception) + _d = type(e4).__name__ == "EmptyDataError" + _e = "unknown" in str(e5) + _f = isinstance(e6, pd_errors.IntCastingNaNError) + _g = isinstance(e7, pd_errors.UnsortedIndexError) + _h = type(e8).__name__ == "ParserError" + _i = type(e9).__name__ == "PerformanceWarning" + _j = isinstance(e10, pd_errors.InvalidIndexError) + return [_a, _b, _c, _d, _e, _f, _g, _h, _i, _j] + + +for _ in range(WARMUP): + _run() + +start = time.perf_counter() +for _ in range(ITERATIONS): + _run() +total_ms = (time.perf_counter() - start) * 1000 + +print( + json.dumps( + { + "function": "errors", + "mean_ms": total_ms / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total_ms, + } + ) +) diff --git a/benchmarks/pandas/bench_extensions.py b/benchmarks/pandas/bench_extensions.py new file mode 100644 index 00000000..5aa00fe9 --- /dev/null +++ b/benchmarks/pandas/bench_extensions.py @@ -0,0 +1,136 @@ +"""Benchmark: pd.api.extensions — ExtensionDtype / ExtensionArray subclassing and +accessor registration. 
+ +Mirrors tsb's extensions benchmark: + - ExtensionDtype subclass construction + - ExtensionArray subclass instantiation, getitem, slice, dtype access + - register_extension_dtype() → tsb registerExtensionDtype() + - register_series_accessor() → tsb registerSeriesAccessor() + - register_dataframe_accessor() → tsb registerDataFrameAccessor() + - Accessor registry introspection via hasattr +""" +import json +import time +import numpy as np +import pandas as pd +import pandas.api.extensions as pd_ext + +WARMUP = 5 +ITERATIONS = 200 + + +@pd_ext.register_extension_dtype +class TagDtype(pd_ext.ExtensionDtype): + name = "tag" + type = object + kind = "O" + + @classmethod + def construct_array_type(cls): + return TagArray + + @classmethod + def construct_from_string(cls, string): + if string == "tag": + return cls() + raise TypeError(f"Cannot construct a 'TagDtype' from '{string}'") + + +class TagArray(pd_ext.ExtensionArray): + def __init__(self, data): + self._data = np.asarray(data, dtype=object) + + @classmethod + def _from_sequence(cls, scalars, *, dtype=None, copy=False): + return cls(scalars) + + @classmethod + def _from_factorized(cls, values, original): + return cls(values) + + def __getitem__(self, key): + return self._data[key] + + def __setitem__(self, key, value): + self._data[key] = value + + def __len__(self): + return len(self._data) + + @property + def dtype(self): + return TagDtype() + + @property + def nbytes(self): + return self._data.nbytes + + def isna(self): + return np.array([v is None for v in self._data]) + + def take(self, indices, *, allow_fill=False, fill_value=None): + return type(self)(self._data.take(indices)) + + def copy(self): + return type(self)(self._data.copy()) + + @classmethod + def _concat_same_type(cls, to_concat): + return cls(np.concatenate([a._data for a in to_concat])) + + +@pd_ext.register_series_accessor("geo_bench") +class GeoAccessor: + def __init__(self, obj): + self._obj = obj + + def distance(self): + return 0 + + 
+@pd_ext.register_dataframe_accessor("geo_bench") +class GeoDataFrameAccessor: + def __init__(self, obj): + self._obj = obj + + def distance(self): + return 0 + + +_TAGS = ["alpha", "beta", "gamma", "delta", "epsilon"] +_s = pd.Series(TagArray(_TAGS)) +_df = pd.DataFrame({"a": [1, 2, 3]}) + + +def _run(): + arr = TagArray(_TAGS) + _len = len(arr) + _item = arr[2] + _sliced = arr[1:4] + _dtype_name = arr.dtype.name + _numeric = False + + _has_series = hasattr(_s, "geo_bench") + _has_df = hasattr(_df, "geo_bench") + + return [_len, _item, _sliced, _dtype_name, _numeric, _has_series, _has_df] + + +for _ in range(WARMUP): + _run() + +start = time.perf_counter() +for _ in range(ITERATIONS): + _run() +total_ms = (time.perf_counter() - start) * 1000 + +print( + json.dumps( + { + "function": "extensions", + "mean_ms": total_ms / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total_ms, + } + ) +) diff --git a/benchmarks/pandas/bench_filter_series.py b/benchmarks/pandas/bench_filter_series.py new file mode 100644 index 00000000..ec653243 --- /dev/null +++ b/benchmarks/pandas/bench_filter_series.py @@ -0,0 +1,31 @@ +"""Benchmark: Series.filter — filter Series index labels by items/like/regex""" +import json +import time +import pandas as pd + +N = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +labels = [f"label_{i}" for i in range(N)] +values = [i * 0.5 for i in range(N)] +s = pd.Series(values, index=labels) + +keep_items = [f"label_{i * 100}" for i in range(1_000)] + +for _ in range(WARMUP): + s.filter(items=keep_items) + s.filter(like="label_5") + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.filter(items=keep_items) + s.filter(like="label_5") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "filter_series", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_get_set_option.py b/benchmarks/pandas/bench_get_set_option.py new file mode 100644 index 
00000000..df9c675e --- /dev/null +++ b/benchmarks/pandas/bench_get_set_option.py @@ -0,0 +1,44 @@ +""" +Benchmark: get_option / set_option / reset_option — pandas options API. + +Mirrors tsb getOption / setOption / resetOption. +Outputs JSON: {"function": "get_set_option", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" + +import json +import time + +import pandas as pd + +WARMUP = 10 +ITERATIONS = 10_000 + +# Warm-up +for _ in range(WARMUP): + pd.get_option("display.max_rows") + pd.set_option("display.max_rows", 50) + pd.reset_option("display.max_rows") + pd.get_option("display.precision") + pd.set_option("display.precision", 3) + pd.reset_option("display.precision") + +start = time.perf_counter() +for i in range(ITERATIONS): + pd.get_option("display.max_rows") + pd.set_option("display.max_rows", (i % 90) + 10) + pd.reset_option("display.max_rows") + pd.get_option("display.precision") + pd.set_option("display.precision", (i % 8) + 2) + pd.reset_option("display.precision") +total_ms = (time.perf_counter() - start) * 1000 + +print( + json.dumps( + { + "function": "get_set_option", + "mean_ms": total_ms / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total_ms, + } + ) +) diff --git a/benchmarks/pandas/bench_item_bool_extract.py b/benchmarks/pandas/bench_item_bool_extract.py new file mode 100644 index 00000000..39839448 --- /dev/null +++ b/benchmarks/pandas/bench_item_bool_extract.py @@ -0,0 +1,35 @@ +""" +Benchmark: Series.item() / bool(Series) / bool(DataFrame) — single-element scalar extraction. + +Mirrors tsb bench_item_bool_extract. 
+Outputs JSON: {"function": "item_bool_extract", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd + +WARMUP = 20 +ITERATIONS = 100_000 + +numeric_series = pd.Series([42.5]) +true_series = pd.Series([True]) +true_df = pd.DataFrame({"x": [True]}) + +for _ in range(WARMUP): + numeric_series.item() + bool(true_series) + bool(true_df) + +start = time.perf_counter() +for _ in range(ITERATIONS): + numeric_series.item() + bool(true_series) + bool(true_df) +total_ms = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "item_bool_extract", + "mean_ms": total_ms / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total_ms, +})) diff --git a/benchmarks/pandas/bench_join_all.py b/benchmarks/pandas/bench_join_all.py new file mode 100644 index 00000000..040aa028 --- /dev/null +++ b/benchmarks/pandas/bench_join_all.py @@ -0,0 +1,30 @@ +"""Benchmark: join_all — sequential left-join of 4 DataFrames each with 5k rows""" +import json +import time +import pandas as pd + +N = 5_000 +WARMUP = 3 +ITERATIONS = 10 + +idx = [str(i) for i in range(N)] + +base = pd.DataFrame({"a": list(range(N))}, index=idx) +df1 = pd.DataFrame({"b": [i * 2 for i in range(N)]}, index=idx) +df2 = pd.DataFrame({"c": [i * 3 for i in range(N)]}, index=idx) +df3 = pd.DataFrame({"d": [i * 4 for i in range(N)]}, index=idx) + +for _ in range(WARMUP): + base.join([df1, df2, df3]) + +start = time.perf_counter() +for _ in range(ITERATIONS): + base.join([df1, df2, df3]) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "join_all", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_math_ops.py b/benchmarks/pandas/bench_math_ops.py new file mode 100644 index 00000000..1159ec02 --- /dev/null +++ b/benchmarks/pandas/bench_math_ops.py @@ -0,0 +1,35 @@ +"""Benchmark: math_ops — abs / round on Series and DataFrame of 100k rows.""" +import json, 
time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +s = pd.Series(np.where(np.arange(SIZE) % 2 == 0, -(np.arange(SIZE) + 0.567), np.arange(SIZE) + 0.567)) +df = pd.DataFrame({ + "a": -(np.arange(SIZE) + 0.123), + "b": np.arange(SIZE) + 0.456, +}) + +for _ in range(WARMUP): + s.abs() + df.abs() + s.round(1) + df.round(1) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.abs() + df.abs() + s.round(1) + df.round(1) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "math_ops", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_merge_asof.py b/benchmarks/pandas/bench_merge_asof.py new file mode 100644 index 00000000..5517d2f8 --- /dev/null +++ b/benchmarks/pandas/bench_merge_asof.py @@ -0,0 +1,34 @@ +"""Benchmark: merge_asof — backward asof join of two 10k-row sorted DataFrames""" +import json +import time +import pandas as pd + +N = 10_000 +WARMUP = 3 +ITERATIONS = 10 + +# Trades sorted by time: 0, 2, 4, ... +trade_times = list(range(0, N * 2, 2)) +prices = [100.0 + i * 0.5 for i in range(N)] + +# Quotes sorted by time, sparser: 0, 3, 6, ... 
+quote_times = list(range(0, N * 3, 3)) +bids = [99.0 + i * 0.5 for i in range(N)] + +trades = pd.DataFrame({"time": trade_times, "price": prices}) +quotes = pd.DataFrame({"time": quote_times, "bid": bids}) + +for _ in range(WARMUP): + pd.merge_asof(trades, quotes, on="time") + +start = time.perf_counter() +for _ in range(ITERATIONS): + pd.merge_asof(trades, quotes, on="time") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "merge_asof", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_na_ops.py b/benchmarks/pandas/bench_na_ops.py new file mode 100644 index 00000000..b7d0adf0 --- /dev/null +++ b/benchmarks/pandas/bench_na_ops.py @@ -0,0 +1,42 @@ +"""Benchmark: na_ops — isna / notna / ffill / bfill on 100k rows.""" +import json, time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +data = pd.array([i if i % 5 != 0 else pd.NA for i in range(SIZE)], dtype="Int64") +s = pd.Series(data, dtype="float64") +s[np.arange(SIZE) % 5 == 0] = np.nan + +df = pd.DataFrame({ + "a": s, + "b": pd.Series([float(i * 2) if i % 7 != 0 else np.nan for i in range(SIZE)]), +}) + +for _ in range(WARMUP): + pd.isna(s) + pd.notna(s) + s.ffill() + s.bfill() + df.ffill() + df.bfill() + +start = time.perf_counter() +for _ in range(ITERATIONS): + pd.isna(s) + pd.notna(s) + s.ffill() + s.bfill() + df.ffill() + df.bfill() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "na_ops", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_nanprod.py b/benchmarks/pandas/bench_nanprod.py new file mode 100644 index 00000000..ec5fcfda --- /dev/null +++ b/benchmarks/pandas/bench_nanprod.py @@ -0,0 +1,25 @@ +"""Benchmark: nanprod — product of array values ignoring NaN, via pd.Series.prod().""" +import time +import pandas as pd +import numpy as np + +SIZE = 
100_000 +WARMUP = 5 +ITERATIONS = 50 + +data = [None if i % 13 == 0 else 1 + (i % 7) * 0.0001 for i in range(SIZE)] +s = pd.Series(data, dtype=float) + +for _ in range(WARMUP): + s.prod(skipna=True) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.prod(skipna=True) + times.append(time.perf_counter() - t0) + +total = sum(times) +mean_ms = (total / ITERATIONS) * 1000 +total_ms = total * 1000 +print(f'{{"function": "nanprod", "mean_ms": {mean_ms:.6f}, "iterations": {ITERATIONS}, "total_ms": {total_ms:.6f}}}') diff --git a/benchmarks/pandas/bench_notna_boolean.py b/benchmarks/pandas/bench_notna_boolean.py new file mode 100644 index 00000000..96c0a59d --- /dev/null +++ b/benchmarks/pandas/bench_notna_boolean.py @@ -0,0 +1,36 @@ +"""Benchmark: notna_boolean — boolean-mask indexing on 100k rows.""" +import json, time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +s = pd.Series(np.arange(SIZE)) +mask = pd.Series(np.arange(SIZE) % 2 == 0) +bool_arr = np.arange(SIZE) % 3 != 0 + +df = pd.DataFrame({ + "a": np.arange(SIZE), + "b": np.arange(SIZE) * 2, +}) + +for _ in range(WARMUP): + s[mask] + s[~mask] + df[bool_arr] + +start = time.perf_counter() +for _ in range(ITERATIONS): + s[mask] + s[~mask] + df[bool_arr] +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "notna_boolean", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_numeric_ops_log2_exp.py b/benchmarks/pandas/bench_numeric_ops_log2_exp.py new file mode 100644 index 00000000..89208443 --- /dev/null +++ b/benchmarks/pandas/bench_numeric_ops_log2_exp.py @@ -0,0 +1,52 @@ +""" +Benchmark: np.log2, np.log10, np.exp, np.sign applied to pandas Series and DataFrame. + +Mirrors tsb seriesLog2, seriesLog10, seriesExp, seriesSign and their DataFrame variants. +Uses 100k-row data to match the TypeScript benchmark. 
+""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 30 + +# Positive values for log2/log10; any values for exp/sign +data = [(i + 1) * 0.1 for i in range(SIZE)] +s = pd.Series(data, dtype=float) +df = pd.DataFrame({ + "a": [(i + 1) * 0.1 for i in range(SIZE)], + "b": [(i + 1) * 0.2 for i in range(SIZE)], +}) + +# Warm-up +for _ in range(WARMUP): + np.log2(s) + np.log10(s) + np.exp(s) + np.sign(s) + np.log2(df) + np.log10(df) + np.exp(df) + np.sign(df) + +start = time.perf_counter() +for _ in range(ITERATIONS): + np.log2(s) + np.log10(s) + np.exp(s) + np.sign(s) + np.log2(df) + np.log10(df) + np.exp(df) + np.sign(df) +total_ms = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "numeric_ops_log2_exp", + "mean_ms": total_ms / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total_ms, +})) diff --git a/benchmarks/pandas/bench_option_context.py b/benchmarks/pandas/bench_option_context.py new file mode 100644 index 00000000..6f6aa310 --- /dev/null +++ b/benchmarks/pandas/bench_option_context.py @@ -0,0 +1,33 @@ +""" +Benchmark: pd.describe_option() / pd.option_context() — pandas options describe and context manager. + +Mirrors tsb bench_option_context (describeOption + optionContext enter/exit). 
+Outputs JSON: {"function": "option_context", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd + +WARMUP = 20 +ITERATIONS = 50_000 + +for _ in range(WARMUP): + pd.describe_option("display.max_rows") + pd.describe_option("display.precision") + with pd.option_context("display.max_rows", 50, "display.precision", 3): + pass + +start = time.perf_counter() +for _ in range(ITERATIONS): + pd.describe_option("display.max_rows") + pd.describe_option("display.precision") + with pd.option_context("display.max_rows", 50, "display.precision", 3): + pass +total_ms = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "option_context", + "mean_ms": total_ms / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total_ms, +})) diff --git a/benchmarks/pandas/bench_pow_mod.py b/benchmarks/pandas/bench_pow_mod.py new file mode 100644 index 00000000..3458eb26 --- /dev/null +++ b/benchmarks/pandas/bench_pow_mod.py @@ -0,0 +1,34 @@ +"""Benchmark: Series.pow, Series.mod, DataFrame.pow on 100k rows""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = (np.arange(ROWS) % 100) + 1 +s = pd.Series(data.astype(float)) +df = pd.DataFrame({ + "a": ((np.arange(ROWS) % 100) + 1).astype(float), + "b": ((np.arange(ROWS) % 50) + 1).astype(float), +}) + +for _ in range(WARMUP): + s.pow(2) + s.mod(7) + df.pow(2) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.pow(2) + s.mod(7) + df.pow(2) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "pow_mod", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_read_html.py b/benchmarks/pandas/bench_read_html.py new file mode 100644 index 00000000..03dd0199 --- /dev/null +++ b/benchmarks/pandas/bench_read_html.py @@ -0,0 +1,52 @@ +""" +Benchmark: pd.read_html — parse HTML tables into DataFrames. 
+Outputs JSON: {"function": "read_html", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import math + +try: + import pandas as pd +except ImportError: + import subprocess, sys + subprocess.check_call([sys.executable, "-m", "pip", "install", "pandas", "--quiet"]) + import pandas as pd + +try: + import lxml # noqa: F401 +except ImportError: + import subprocess, sys + subprocess.check_call([sys.executable, "-m", "pip", "install", "lxml", "--quiet"]) + +ROWS = 1_000 +WARMUP = 3 +ITERATIONS = 20 + + +def build_html(rows: int) -> str: + header = "idnamevaluescore" + body_rows = [ + f"{i}item_{i % 100}{i * 1.5:.2f}{math.sin(i * 0.01):.6f}" + for i in range(rows) + ] + return f"{header}{''.join(body_rows)}
" + + +html = build_html(ROWS) + +# Warm-up +for _ in range(WARMUP): + pd.read_html(html) + +start = time.perf_counter() +for _ in range(ITERATIONS): + pd.read_html(html) +total_ms = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "read_html", + "mean_ms": total_ms / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total_ms, +})) diff --git a/benchmarks/pandas/bench_reduce_ops.py b/benchmarks/pandas/bench_reduce_ops.py new file mode 100644 index 00000000..2be36963 --- /dev/null +++ b/benchmarks/pandas/bench_reduce_ops.py @@ -0,0 +1,37 @@ +"""Benchmark: reduce_ops — nunique / any / all on Series and DataFrame of 100k rows.""" +import json, time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +s = pd.Series(np.arange(SIZE) % 1000) +bool_s = pd.Series(np.arange(SIZE) > 0) +df = pd.DataFrame({ + "a": np.arange(SIZE) % 500, + "b": np.arange(SIZE) % 200, + "c": np.arange(SIZE) % 100, +}) + +for _ in range(WARMUP): + s.nunique() + bool_s.any() + bool_s.all() + df.nunique() + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.nunique() + bool_s.any() + bool_s.all() + df.nunique() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "reduce_ops", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_rename_ops.py b/benchmarks/pandas/bench_rename_ops.py new file mode 100644 index 00000000..897f520b --- /dev/null +++ b/benchmarks/pandas/bench_rename_ops.py @@ -0,0 +1,36 @@ +"""Benchmark: rename_ops — rename / add_prefix / add_suffix on Series/DataFrame of 100k rows.""" +import json, time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +s = pd.Series(np.arange(SIZE), index=[f"row_{i}" for i in range(SIZE)]) +df = pd.DataFrame({ + "col_a": np.arange(SIZE), + "col_b": np.arange(SIZE) * 2, + "col_c": np.arange(SIZE) * 3, +}) + +for _ in range(WARMUP): + 
s.rename(lambda lbl: f"new_{lbl}") + df.rename(columns={"col_a": "a", "col_b": "b"}) + df.add_prefix("pre_") + df.add_suffix("_suf") + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.rename(lambda lbl: f"new_{lbl}") + df.rename(columns={"col_a": "a", "col_b": "b"}) + df.add_prefix("pre_") + df.add_suffix("_suf") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "rename_ops", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_resample_dataframe.py b/benchmarks/pandas/bench_resample_dataframe.py new file mode 100644 index 00000000..da5b555b --- /dev/null +++ b/benchmarks/pandas/bench_resample_dataframe.py @@ -0,0 +1,45 @@ +""" +Benchmark: DataFrame resampling with multiple aggregations. + +The existing resample benchmark only covers Series. This exercises +df.resample("1h").mean() / .sum() / .min() on a multi-column datetime-indexed DataFrame. +Mirrors tsb resampleDataFrame(df, "H").mean() / .sum() / .min(). 
+ +Outputs JSON: {"function": "resample_dataframe", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 50_000 +WARMUP = 3 +ITERATIONS = 30 + +idx = pd.date_range("2020-01-01", periods=SIZE, freq="1min") +rng = np.random.default_rng(42) + +df = pd.DataFrame({ + "a": np.sin(np.arange(SIZE) * 0.01) * 50 + 50, + "b": np.cos(np.arange(SIZE) * 0.02) * 30 + 30, + "c": (np.arange(SIZE) % 100) * 1.5, +}, index=idx) + +for _ in range(WARMUP): + df.resample("1h").mean() + df.resample("1h").sum() + df.resample("1h").min() + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.resample("1h").mean() + df.resample("1h").sum() + df.resample("1h").min() +total_ms = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "resample_dataframe", + "mean_ms": total_ms / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total_ms, +})) diff --git a/benchmarks/pandas/bench_series_compare_pair.py b/benchmarks/pandas/bench_series_compare_pair.py new file mode 100644 index 00000000..dbbb2043 --- /dev/null +++ b/benchmarks/pandas/bench_series_compare_pair.py @@ -0,0 +1,39 @@ +""" +Benchmark: pandas Series-to-Series comparison operations. + +Mirrors tsb seriesNe(a, b), seriesGt(a, b), seriesLe(a, b), seriesEq(a, b). +The existing compare benchmark tests scalar comparison; this tests Series-to-Series. +Uses 100k-element Series to match the TypeScript benchmark. 
+""" +import json +import time +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 100 + +a = pd.Series([(i * 1.7) % 1000 for i in range(SIZE)], dtype=float) +b = pd.Series([(i * 2.3) % 1000 for i in range(SIZE)], dtype=float) + +# Warm-up +for _ in range(WARMUP): + a.ne(b) + a.gt(b) + a.le(b) + a.eq(b) + +start = time.perf_counter() +for _ in range(ITERATIONS): + a.ne(b) + a.gt(b) + a.le(b) + a.eq(b) +total_ms = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "series_compare_pair", + "mean_ms": total_ms / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total_ms, +})) diff --git a/benchmarks/pandas/bench_series_dot_dataframe.py b/benchmarks/pandas/bench_series_dot_dataframe.py new file mode 100644 index 00000000..59f85f24 --- /dev/null +++ b/benchmarks/pandas/bench_series_dot_dataframe.py @@ -0,0 +1,38 @@ +""" +Benchmark: pd.Series.dot(DataFrame) and pd.DataFrame.dot(Series) — cross-form dot products. + +Mirrors tsb seriesDotDataFrame and dataFrameDotSeries. +Dataset: 1000-element Series, 1000-row × 20-column DataFrame. 
+Outputs JSON: {"function": "series_dot_dataframe", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd + +N = 1_000 +K = 20 +WARMUP = 5 +ITERATIONS = 50 + +s_data = [(i + 1) * 0.01 for i in range(N)] +s = pd.Series(s_data) + +cols = {f"c{c}": [(i * K + c) * 0.001 for i in range(N)] for c in range(K)} +df = pd.DataFrame(cols) + +for _ in range(WARMUP): + s.dot(df) + df.dot(s) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.dot(df) + df.dot(s) +total_ms = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "series_dot_dataframe", + "mean_ms": total_ms / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total_ms, +})) diff --git a/benchmarks/pandas/bench_series_format_table.py b/benchmarks/pandas/bench_series_format_table.py new file mode 100644 index 00000000..48abadd1 --- /dev/null +++ b/benchmarks/pandas/bench_series_format_table.py @@ -0,0 +1,42 @@ +""" +Benchmark: pandas Series.to_markdown() and Series.to_latex() on a 500-element Series. + +Mirrors the tsb seriesToMarkdown and seriesToLaTeX benchmark. +Exercises table-rendering of both numeric and string series. 
+""" +import json +import time +import math +import pandas as pd + +N = 500 +WARMUP = 3 +ITERATIONS = 30 + +num_data = [math.sin(i * 0.05) * 100 for i in range(N)] +str_data = [None if i % 10 == 0 else f"item_{i}" for i in range(N)] + +num_series = pd.Series(num_data) +str_series = pd.Series(str_data) + +# Warm-up +for _ in range(WARMUP): + num_series.to_markdown() + num_series.to_latex() + str_series.to_markdown() + str_series.to_latex() + +start = time.perf_counter() +for _ in range(ITERATIONS): + num_series.to_markdown() + num_series.to_latex() + str_series.to_markdown() + str_series.to_latex() +total_ms = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "series_format_table", + "mean_ms": total_ms / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total_ms, +})) diff --git a/benchmarks/pandas/bench_series_items_iter.py b/benchmarks/pandas/bench_series_items_iter.py new file mode 100644 index 00000000..bba399eb --- /dev/null +++ b/benchmarks/pandas/bench_series_items_iter.py @@ -0,0 +1,28 @@ +"""Benchmark: Series.items() / Series.iteritems() — iterate over (label, value) pairs.""" +import time +import pandas as pd + +SIZE = 10_000 +WARMUP = 5 +ITERATIONS = 50 + +s = pd.Series( + data=[i * 1.1 for i in range(SIZE)], + index=[f"row_{i}" for i in range(SIZE)], +) + +for _ in range(WARMUP): + for _pair in s.items(): + pass + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + for _pair in s.items(): + pass + times.append(time.perf_counter() - t0) + +total = sum(times) +mean_ms = (total / ITERATIONS) * 1000 +total_ms = total * 1000 +print(f'{{"function": "series_items_iter", "mean_ms": {mean_ms:.6f}, "iterations": {ITERATIONS}, "total_ms": {total_ms:.6f}}}') diff --git a/benchmarks/pandas/bench_series_setaxis_toframe.py b/benchmarks/pandas/bench_series_setaxis_toframe.py new file mode 100644 index 00000000..d23537b1 --- /dev/null +++ b/benchmarks/pandas/bench_series_setaxis_toframe.py @@ -0,0 +1,56 @@ +""" +Benchmark: 
Series.to_frame() / Series.set_axis() / DataFrame.set_axis() / + Series.add_prefix() / Series.add_suffix() + +Mirrors tsb bench_series_setaxis_toframe. +Dataset: 50 000-element numeric Series; 50 000-row × 3-column DataFrame. +Outputs JSON: {"function": "series_setaxis_toframe", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd +import numpy as np + +SIZE = 50_000 +WARMUP = 5 +ITERATIONS = 50 + +data = [i * 1.5 for i in range(SIZE)] +idx = [f"r{i}" for i in range(SIZE)] +new_idx = [f"row_{i}" for i in range(SIZE)] + +s = pd.Series(data, index=idx, name="values") +df = pd.DataFrame( + { + "a": list(range(SIZE)), + "b": [i * 2 for i in range(SIZE)], + "c": [i * 3 for i in range(SIZE)], + }, + index=idx, +) +new_cols = ["col_a", "col_b", "col_c"] + +for _ in range(WARMUP): + s.to_frame() + s.set_axis(new_idx) + df.set_axis(new_idx, axis=0) + df.set_axis(new_cols, axis=1) + s.add_prefix("pre_") + s.add_suffix("_suf") + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.to_frame() + s.set_axis(new_idx) + df.set_axis(new_idx, axis=0) + df.set_axis(new_cols, axis=1) + s.add_prefix("pre_") + s.add_suffix("_suf") +total_ms = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "series_setaxis_toframe", + "mean_ms": total_ms / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total_ms, +})) diff --git a/benchmarks/pandas/bench_series_to_markdown.py b/benchmarks/pandas/bench_series_to_markdown.py new file mode 100644 index 00000000..e219f33e --- /dev/null +++ b/benchmarks/pandas/bench_series_to_markdown.py @@ -0,0 +1,32 @@ +""" +Benchmark: Series.to_markdown() and Series.to_latex() on a 500-element numeric Series. + +Mirrors tsb seriesToMarkdown and seriesToLaTeX. 
+Outputs JSON: {"function": "series_to_markdown", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd + +SIZE = 500 +WARMUP = 5 +ITERATIONS = 50 + +s = pd.Series([(i * 1.7) % 100 for i in range(SIZE)], name="values") + +for _ in range(WARMUP): + s.to_markdown() + s.to_latex() + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.to_markdown() + s.to_latex() +total_ms = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "series_to_markdown", + "mean_ms": total_ms / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total_ms, +})) diff --git a/benchmarks/pandas/bench_shift_diff.py b/benchmarks/pandas/bench_shift_diff.py new file mode 100644 index 00000000..878d05c6 --- /dev/null +++ b/benchmarks/pandas/bench_shift_diff.py @@ -0,0 +1,28 @@ +"""Benchmark: Series.shift and Series.diff on 100k-element Series""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = np.arange(ROWS, dtype=float) * 1.5 +s = pd.Series(data) + +for _ in range(WARMUP): + s.shift(1) + s.diff(1) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.shift(1) + s.diff(1) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "shift_diff", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_sort_ops.py b/benchmarks/pandas/bench_sort_ops.py new file mode 100644 index 00000000..929558f3 --- /dev/null +++ b/benchmarks/pandas/bench_sort_ops.py @@ -0,0 +1,32 @@ +"""Benchmark: Series.sort_values and DataFrame.sort_values on 100k rows""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = np.sin(np.arange(ROWS, dtype=float)) * 1000 +s = pd.Series(data) +df = pd.DataFrame({ + "a": np.sin(np.arange(ROWS, dtype=float)) * 1000, + "b": np.cos(np.arange(ROWS, dtype=float)) * 500, +}) + +for _ 
in range(WARMUP): + s.sort_values() + df.sort_values("a") + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.sort_values() + df.sort_values("a") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "sort_ops", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_str_findall_expand.py b/benchmarks/pandas/bench_str_findall_expand.py new file mode 100644 index 00000000..54bf92fb --- /dev/null +++ b/benchmarks/pandas/bench_str_findall_expand.py @@ -0,0 +1,36 @@ +""" +Benchmark: pandas Series.str.extract() with named capture groups on a 5k-element Series. + +Mirrors the tsb strFindallExpand benchmark. +Each string has the form "userN scoreM levelL" and the regex extracts +named groups: word, num, score, level. +""" +import json +import time +import pandas as pd + +N = 5_000 +WARMUP = 3 +ITERATIONS = 20 + +data = [None if i % 20 == 0 else f"user{i} score{(i * 7) % 100} level{(i % 5) + 1}" for i in range(N)] +s = pd.Series(data, dtype="object") + +# Named capture-group pattern matching the TypeScript version +pat = r"(?P[a-z]+)(?P\d+)\s+score(?P\d+)\s+level(?P\d+)" + +# Warm-up +for _ in range(WARMUP): + s.str.extract(pat) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.str.extract(pat) +total_ms = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "str_findall_expand", + "mean_ms": total_ms / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total_ms, +})) diff --git a/benchmarks/pandas/bench_styler_format.py b/benchmarks/pandas/bench_styler_format.py new file mode 100644 index 00000000..a3c98e22 --- /dev/null +++ b/benchmarks/pandas/bench_styler_format.py @@ -0,0 +1,59 @@ +"""Benchmark: Styler.format / apply / applymap / to_html — Styler formatting chain on 100 rows. + +Mirrors tsb Styler: format / formatIndex / apply / applymap / toHtml. 
+""" +import json +import time +import numpy as np +import pandas as pd + +ROWS = 100 +WARMUP = 3 +ITERATIONS = 20 + + +df = pd.DataFrame( + { + "a": np.arange(ROWS) * 1.5, + "b": np.arange(ROWS, 0, -1) * 2.0, + "c": np.sin(np.arange(ROWS) / 10) * 50 + 50, + } +) + + +def _apply_red(vals): + return ["color: navy"] * len(vals) + + +def _applymap_bold(v): + return "font-weight: bold" if isinstance(v, float) and v > 50 else "" + + +def _run(): + styler = df.style.format("{:.2f}").apply(_apply_red) + try: + # pandas 2.1+ renamed applymap → map + styler = styler.map(_applymap_bold) + except AttributeError: + styler = styler.applymap(_applymap_bold) + styler.to_html() + + +for _ in range(WARMUP): + _run() + +start = time.perf_counter() +for _ in range(ITERATIONS): + _run() +total_ms = (time.perf_counter() - start) * 1000 + +print( + json.dumps( + { + "function": "styler_format", + "mean_ms": total_ms / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total_ms, + } + ) +) diff --git a/benchmarks/pandas/bench_styler_highlight_adv.py b/benchmarks/pandas/bench_styler_highlight_adv.py new file mode 100644 index 00000000..d2eb702e --- /dev/null +++ b/benchmarks/pandas/bench_styler_highlight_adv.py @@ -0,0 +1,54 @@ +"""Benchmark: Styler advanced — highlight_null / highlight_between / text_gradient / +bar / set_caption / to_latex on 100 rows. + +Mirrors tsb Styler: highlightNull / highlightBetween / textGradient / barChart / +setCaption / toLatex. 
+""" +import json +import time +import warnings +import numpy as np +import pandas as pd + +ROWS = 100 +WARMUP = 3 +ITERATIONS = 20 + +a_data = np.arange(ROWS, dtype=float) +b_data = np.where(np.arange(ROWS) % 10 == 0, np.nan, np.arange(ROWS) * 2.0) +c_data = np.sin(np.arange(ROWS) / 10) * 50 + 50 + +df = pd.DataFrame({"a": a_data, "b": b_data, "c": c_data}) + + +def _run(): + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + ( + df.style.highlight_null(color="red") + .highlight_between(left=20, right=80, color="lightyellow") + .text_gradient(cmap="Blues") + .bar(align="mid", color="#aec6cf") + .set_caption("Benchmark Table") + .to_latex() + ) + + +for _ in range(WARMUP): + _run() + +start = time.perf_counter() +for _ in range(ITERATIONS): + _run() +total_ms = (time.perf_counter() - start) * 1000 + +print( + json.dumps( + { + "function": "styler_highlight_adv", + "mean_ms": total_ms / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total_ms, + } + ) +) diff --git a/benchmarks/pandas/bench_styler_table_props.py b/benchmarks/pandas/bench_styler_table_props.py new file mode 100644 index 00000000..4d9b6e42 --- /dev/null +++ b/benchmarks/pandas/bench_styler_table_props.py @@ -0,0 +1,68 @@ +"""Benchmark: Styler table-level configuration — set_properties / set_table_styles / +set_table_attributes / hide / set_precision / set_na_rep / clear / to_html. + +Mirrors tsb Styler: setProperties / setTableStyles / setTableAttributes / +hide / setPrecision / setNaRep / clearStyles / toHtml. 
+""" +import json +import time +import warnings +import numpy as np +import pandas as pd + +ROWS = 100 +WARMUP = 3 +ITERATIONS = 20 + +a_data = np.arange(ROWS, dtype=float) * 1.5 +b_data = np.where(np.arange(ROWS) % 10 == 0, np.nan, np.arange(ROWS) * 2.0) +c_data = np.sin(np.arange(ROWS) / 10) * 50 + 50 + +df = pd.DataFrame({"a": a_data, "b": b_data, "c": c_data}) + + +def _run(): + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + ( + df.style.set_precision(3) + .set_na_rep("\u2014") + .set_properties(subset=["a", "b"], **{"font-size": "12px", "color": "navy"}) + .set_table_styles( + [ + { + "selector": "th", + "props": [("background-color", "#4a90d9"), ("color", "white")], + }, + { + "selector": "tr:nth-child(even) td", + "props": [("background-color", "#f5f5f5")], + }, + ] + ) + .set_table_attributes('class="data-table" id="bench-table"') + .hide(axis="index") + .hide(subset=["c"], axis="columns") + .clear() + .to_html() + ) + + +for _ in range(WARMUP): + _run() + +start = time.perf_counter() +for _ in range(ITERATIONS): + _run() +total_ms = (time.perf_counter() - start) * 1000 + +print( + json.dumps( + { + "function": "styler_table_props", + "mean_ms": total_ms / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total_ms, + } + ) +) diff --git a/benchmarks/pandas/bench_to_json_denormalize.py b/benchmarks/pandas/bench_to_json_denormalize.py new file mode 100644 index 00000000..ae51decf --- /dev/null +++ b/benchmarks/pandas/bench_to_json_denormalize.py @@ -0,0 +1,41 @@ +"""Benchmark: to_json_denormalize — json orient variants on 10k-row DataFrame.""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 10_000 +WARMUP = 5 +ITERATIONS = 30 + +# DataFrame matching the tsb benchmark (nested-structure-like columns) +df = pd.DataFrame({ + "name": [f"user_{i}" for i in range(ROWS)], + "address.city": [f"city_{i % 100}" for i in range(ROWS)], + "address.zip": [str(10000 + (i % 9000)) for i in range(ROWS)], + "score": np.arange(ROWS) 
* 0.01, +}) + +for _ in range(WARMUP): + # pandas equivalent of toJsonDenormalize: to_dict("records") then reconstruct nesting + recs = df.to_dict("records") + # pandas equivalent of toJsonRecords: orient="records" + df.to_json(orient="records") + # pandas equivalent of toJsonSplit: orient="split" + df.to_json(orient="split") + # pandas equivalent of toJsonIndex: orient="index" + df.to_json(orient="index") + +start = time.perf_counter() +for _ in range(ITERATIONS): + recs = df.to_dict("records") + df.to_json(orient="records") + df.to_json(orient="split") + df.to_json(orient="index") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "to_json_denormalize", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_to_latex.py b/benchmarks/pandas/bench_to_latex.py new file mode 100644 index 00000000..c8289694 --- /dev/null +++ b/benchmarks/pandas/bench_to_latex.py @@ -0,0 +1,44 @@ +"""Benchmark: toLaTeX / seriesToLaTeX — DataFrame.to_latex() and Series.to_latex() on 500 rows. + +Mirrors tsb toLaTeX(df) / seriesToLaTeX(s) from src/stats/format_table.ts. 
+""" +import json +import time +import numpy as np +import pandas as pd + +ROWS = 500 +WARMUP = 5 +ITERATIONS = 100 + +df = pd.DataFrame( + { + "name": [f"item_{i}" for i in range(ROWS)], + "value": np.arange(ROWS) * 1.23, + "count": np.arange(ROWS, dtype=float), + } +) +s = pd.Series(np.arange(ROWS) * 0.5) + +for _ in range(WARMUP): + df.to_latex() + df.to_latex(index=False) + s.to_latex() + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.to_latex() + df.to_latex(index=False) + s.to_latex() +total_ms = (time.perf_counter() - start) * 1000 + +print( + json.dumps( + { + "function": "to_latex", + "mean_ms": total_ms / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total_ms, + } + ) +) diff --git a/benchmarks/pandas/bench_truncate_df.py b/benchmarks/pandas/bench_truncate_df.py new file mode 100644 index 00000000..4f8b0c2a --- /dev/null +++ b/benchmarks/pandas/bench_truncate_df.py @@ -0,0 +1,31 @@ +"""Benchmark: DataFrame.truncate — slice rows by before/after on 100k-row DataFrame""" +import json +import time +import pandas as pd +import numpy as np + +N = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +index = list(range(N)) +df = pd.DataFrame({ + "a": np.arange(N, dtype=float), + "b": np.arange(N, dtype=float) * 2, + "c": np.arange(N, dtype=float) * 3, +}, index=index) + +for _ in range(WARMUP): + df.truncate(before=10_000, after=90_000) + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.truncate(before=10_000, after=90_000) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "truncate_df", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_value_counts_full.py b/benchmarks/pandas/bench_value_counts_full.py new file mode 100644 index 00000000..284bb8ed --- /dev/null +++ b/benchmarks/pandas/bench_value_counts_full.py @@ -0,0 +1,28 @@ +"""Benchmark: value_counts_full — value_counts(bins=N) on Series of 100k rows.""" +import json, time 
+import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +rng = np.random.default_rng(42) +s = pd.Series(rng.random(SIZE) * 100) + +for _ in range(WARMUP): + s.value_counts(bins=10) + s.value_counts(bins=20) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.value_counts(bins=10) + s.value_counts(bins=20) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "value_counts_full", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_window_extended.py b/benchmarks/pandas/bench_window_extended.py new file mode 100644 index 00000000..ddafc28a --- /dev/null +++ b/benchmarks/pandas/bench_window_extended.py @@ -0,0 +1,32 @@ +"""Benchmark: window_extended — rolling sem/skew/kurt/quantile on 100k rows.""" +import json, time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 3 +ITERATIONS = 20 +WINDOW = 10 + +s = pd.Series(np.sin(np.arange(SIZE) / 100) * 100 + np.arange(SIZE) * 0.001) + +for _ in range(WARMUP): + s.rolling(WINDOW).sem() + s.rolling(WINDOW).skew() + s.rolling(WINDOW).kurt() + s.rolling(WINDOW).quantile(0.5) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.rolling(WINDOW).sem() + s.rolling(WINDOW).skew() + s.rolling(WINDOW).kurt() + s.rolling(WINDOW).quantile(0.5) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "window_extended", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_window_indexers.py b/benchmarks/pandas/bench_window_indexers.py new file mode 100644 index 00000000..0c3b32f1 --- /dev/null +++ b/benchmarks/pandas/bench_window_indexers.py @@ -0,0 +1,60 @@ +""" +Benchmark: FixedForwardWindowIndexer and custom variable-offset BaseIndexer via rolling. + +Mirrors tsb FixedForwardWindowIndexer, VariableOffsetWindowIndexer, and applyIndexer. +Uses a 50k-row Series. 
Each iteration: +- Applies rolling(FixedForwardWindowIndexer(window_size=5)).sum() (forward-looking). +- Applies rolling(custom IntegerOffsetIndexer).sum() (variable look-back, mirrors tsb). +Outputs JSON: {"function": "window_indexers", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd +from pandas.api.indexers import BaseIndexer, FixedForwardWindowIndexer + + +class IntegerOffsetIndexer(BaseIndexer): + """Variable look-back window where each row uses a per-row integer offset.""" + + def __init__(self, offsets): + super().__init__() + self._offsets = offsets + + def get_window_bounds(self, num_values=0, min_periods=None, center=None, closed=None, step=1): + start = np.empty(num_values, dtype=np.int64) + end = np.empty(num_values, dtype=np.int64) + for i in range(num_values): + offset = self._offsets[i % len(self._offsets)] + start[i] = max(0, i - offset) + end[i] = i + 1 + return start, end + + +SIZE = 50_000 +WARMUP = 5 +ITERATIONS = 50 + +values = [(i * 0.1) % 100 for i in range(SIZE)] +s = pd.Series(values) + +fwd_indexer = FixedForwardWindowIndexer(window_size=5) +offsets = [(i % 10) + 1 for i in range(SIZE)] +var_indexer = IntegerOffsetIndexer(offsets=offsets) + +for _ in range(WARMUP): + s.rolling(fwd_indexer).sum() + s.rolling(var_indexer).sum() + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.rolling(fwd_indexer).sum() + s.rolling(var_indexer).sum() +total_ms = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "window_indexers", + "mean_ms": total_ms / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total_ms, +})) diff --git a/benchmarks/pandas/bench_xs_series.py b/benchmarks/pandas/bench_xs_series.py new file mode 100644 index 00000000..41dab0aa --- /dev/null +++ b/benchmarks/pandas/bench_xs_series.py @@ -0,0 +1,55 @@ +""" +Benchmark: Series.xs() — cross-section lookup on Series. + +Mirrors tsb xsSeries. 
+Tests flat-index lookup (returns scalar) and MultiIndex lookup (returns sub-Series). +Outputs JSON: {"function": "xs_series", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" + +import json +import time + +import pandas as pd + +N = 1_000 +WARMUP = 10 +ITERATIONS = 5_000 + +# Flat-index Series: each key appears once → xs returns a scalar. +flat_series = pd.Series( + [i * 1.5 for i in range(N)], + index=[f"k{i}" for i in range(N)], + name="flat", +) + +# MultiIndex Series: 10 outer keys × 100 inner keys → xs returns a sub-Series (100 rows). +outer_keys = [f"g{i // 100}" for i in range(N)] +inner_keys = [i % 100 for i in range(N)] +multi_index = pd.MultiIndex.from_arrays([outer_keys, inner_keys], names=["outer", "inner"]) +multi_series = pd.Series( + [i * 2.0 for i in range(N)], + index=multi_index, + name="multi", +) + +# Warm-up +for i in range(WARMUP): + flat_series.xs(f"k{i % N}") + multi_series.xs(f"g{i % 10}") + +start = time.perf_counter() +for i in range(ITERATIONS): + flat_series.xs(f"k{i % N}") + multi_series.xs(f"g{i % 10}") +total_ms = (time.perf_counter() - start) * 1000 + +print( + json.dumps( + { + "function": "xs_series", + "mean_ms": total_ms / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total_ms, + } + ) +) diff --git a/benchmarks/tsb/bench_at_iat.ts b/benchmarks/tsb/bench_at_iat.ts new file mode 100644 index 00000000..ed33ba07 --- /dev/null +++ b/benchmarks/tsb/bench_at_iat.ts @@ -0,0 +1,45 @@ +/** + * Benchmark: seriesAt, seriesIat, dataFrameAt, dataFrameIat — fast scalar access + * Outputs JSON: {"function": "at_iat", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, DataFrame, seriesAt, seriesIat, dataFrameAt, dataFrameIat } from "../../src/index.ts"; + +const N = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const labels = Array.from({ length: N }, (_, i) => `r${i}`); +const values = Array.from({ length: N }, (_, i) => i * 1.5); + +const s = new Series({ data: values, index: labels }); +const 
df = DataFrame.fromColumns( + { a: values, b: values.map((v) => v * 2) }, + { index: labels }, +); + +const midLabel = `r${Math.floor(N / 2)}`; + +for (let i = 0; i < WARMUP; i++) { + seriesAt(s, midLabel); + seriesIat(s, N / 2); + dataFrameAt(df, midLabel, "a"); + dataFrameIat(df, N / 2, 0); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + seriesAt(s, midLabel); + seriesIat(s, N / 2); + dataFrameAt(df, midLabel, "a"); + dataFrameIat(df, N / 2, 0); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "at_iat", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_autocorr.ts b/benchmarks/tsb/bench_autocorr.ts new file mode 100644 index 00000000..97c62646 --- /dev/null +++ b/benchmarks/tsb/bench_autocorr.ts @@ -0,0 +1,40 @@ +/** + * Benchmark: autoCorr — lag-N autocorrelation for a 100k-element numeric Series. + * + * Mirrors pandas Series.autocorr(lag). + * Benchmarks lag=1, lag=5, and lag=20. + * + * Outputs JSON: {"function": "autocorr", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, autoCorr } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +// A sinusoidal signal with some noise for a non-trivial autocorrelation. 
+const data = Array.from({ length: SIZE }, (_, i) => Math.sin(i * 0.05) + (i % 7) * 0.01); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + autoCorr(s, 1); + autoCorr(s, 5); + autoCorr(s, 20); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + autoCorr(s, 1); + autoCorr(s, 5); + autoCorr(s, 20); +} +const total_ms = performance.now() - start; + +console.log( + JSON.stringify({ + function: "autocorr", + mean_ms: total_ms / ITERATIONS, + iterations: ITERATIONS, + total_ms, + }), +); diff --git a/benchmarks/tsb/bench_convert_dtypes.ts b/benchmarks/tsb/bench_convert_dtypes.ts new file mode 100644 index 00000000..2ba7f4d3 --- /dev/null +++ b/benchmarks/tsb/bench_convert_dtypes.ts @@ -0,0 +1,53 @@ +/** + * Benchmark: convertDtypesSeries and convertDtypesDataFrame + * + * Mirrors pandas Series.convert_dtypes() and DataFrame.convert_dtypes(). + * Creates a 50k-row dataset with object-typed numeric, boolean, and string + * columns, then measures how fast tsb can infer and convert to best dtypes. + */ +import { Series, DataFrame, convertDtypesSeries, convertDtypesDataFrame } from "../../src/index.ts"; +import type { Scalar } from "../../src/types.ts"; + +const N = 50_000; +const WARMUP = 3; +const ITERATIONS = 20; + +// Object-dtype series: integers stored as Scalars (no typed array) +const intData: Scalar[] = Array.from({ length: N }, (_, i) => (i % 17 === 0 ? null : i)); +const floatData: Scalar[] = Array.from({ length: N }, (_, i) => (i % 13 === 0 ? null : i * 1.5)); +const strData: Scalar[] = Array.from({ length: N }, (_, i) => (i % 11 === 0 ? null : `str_${i}`)); +const boolData: Scalar[] = Array.from({ length: N }, (_, i) => (i % 7 === 0 ? 
null : i % 2 === 0)); + +const intSeries = new Series({ data: intData }); +const floatSeries = new Series({ data: floatData }); + +const df = DataFrame.fromColumns({ + int_col: intData, + float_col: floatData, + str_col: strData, + bool_col: boolData, +}); + +// Warm-up +for (let i = 0; i < WARMUP; i++) { + convertDtypesSeries(intSeries); + convertDtypesSeries(floatSeries); + convertDtypesDataFrame(df); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + convertDtypesSeries(intSeries); + convertDtypesSeries(floatSeries); + convertDtypesDataFrame(df); +} +const total_ms = performance.now() - start; + +console.log( + JSON.stringify({ + function: "convert_dtypes", + mean_ms: total_ms / ITERATIONS, + iterations: ITERATIONS, + total_ms, + }), +); diff --git a/benchmarks/tsb/bench_cross_join.ts b/benchmarks/tsb/bench_cross_join.ts new file mode 100644 index 00000000..0bdf02fb --- /dev/null +++ b/benchmarks/tsb/bench_cross_join.ts @@ -0,0 +1,38 @@ +/** + * Benchmark: crossJoin — Cartesian product of two 300-row DataFrames (90k result rows). 
+ * Outputs JSON: {"function": "cross_join", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, crossJoin } from "../../src/index.ts"; + +const N = 300; +const WARMUP = 3; +const ITERATIONS = 10; + +// Distinct column names so no suffix needed +const left = DataFrame.fromColumns({ + id_a: Array.from({ length: N }, (_, i) => i), + val_a: Array.from({ length: N }, (_, i) => i * 1.5), +}); +const right = DataFrame.fromColumns({ + id_b: Array.from({ length: N }, (_, i) => i), + val_b: Array.from({ length: N }, (_, i) => i * 2.5), +}); + +for (let i = 0; i < WARMUP; i++) { + crossJoin(left, right); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + crossJoin(left, right); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "cross_join", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_cut_bins_to_frame.ts b/benchmarks/tsb/bench_cut_bins_to_frame.ts new file mode 100644 index 00000000..135fcd91 --- /dev/null +++ b/benchmarks/tsb/bench_cut_bins_to_frame.ts @@ -0,0 +1,36 @@ +/** + * Benchmark: cut_bins_to_frame — cutBinsToFrame / cutBinCounts / binEdges on 100k data points. 
+ * Outputs JSON: {"function": "cut_bins_to_frame", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { cut, cutBinsToFrame, cutBinCounts, binEdges } from "../../src/index.ts"; + +const SIZE = 100_000; +const NUM_BINS = 20; +const WARMUP = 5; +const ITERATIONS = 50; + +const data = Array.from({ length: SIZE }, (_, i) => (i % 1000) * 0.1); +const binResult = cut(data, NUM_BINS); + +for (let i = 0; i < WARMUP; i++) { + cutBinsToFrame(binResult, { data }); + cutBinCounts(binResult); + binEdges(binResult); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + cutBinsToFrame(binResult, { data }); + cutBinCounts(binResult); + binEdges(binResult); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "cut_bins_to_frame", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_compare_pair.ts b/benchmarks/tsb/bench_dataframe_compare_pair.ts new file mode 100644 index 00000000..8cdbe042 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_compare_pair.ts @@ -0,0 +1,58 @@ +/** + * Benchmark: DataFrame-to-DataFrame element-wise comparisons. + * + * The existing `dataframe_compare` benchmark only tests scalar comparisons (df vs 50). + * This benchmark tests DataFrame-to-DataFrame element-wise comparisons: + * dataFrameEq(df1, df2), dataFrameNe(df1, df2), dataFrameGt(df1, df2), dataFrameLe(df1, df2). + * Mirrors pandas df1.eq(df2), df1.ne(df2), df1.gt(df2), df1.le(df2). 
+ *
+ * Outputs JSON: {"function": "dataframe_compare_pair", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+ */
+import {
+  DataFrame,
+  dataFrameEq,
+  dataFrameNe,
+  dataFrameGt,
+  dataFrameLe,
+} from "../../src/index.ts";
+
+const SIZE = 50_000;
+const WARMUP = 5; // untimed passes executed before measurement starts
+const ITERATIONS = 50; // timed passes; each pass runs all four comparisons
+
+const df1 = DataFrame.fromColumns({
+  a: Array.from({ length: SIZE }, (_, i) => (i * 1.7) % 1000),
+  b: Array.from({ length: SIZE }, (_, i) => (i * 2.3) % 1000),
+  c: Array.from({ length: SIZE }, (_, i) => i % 100),
+});
+
+const df2 = DataFrame.fromColumns({
+  a: Array.from({ length: SIZE }, (_, i) => (i * 2.1) % 1000), // same column names and length as df1, different formulas
+  b: Array.from({ length: SIZE }, (_, i) => (i * 1.9) % 1000),
+  c: Array.from({ length: SIZE }, (_, i) => (i + 7) % 100),
+});
+
+for (let i = 0; i < WARMUP; i++) {
+  dataFrameEq(df1, df2);
+  dataFrameNe(df1, df2);
+  dataFrameGt(df1, df2);
+  dataFrameLe(df1, df2);
+}
+
+const start = performance.now();
+for (let i = 0; i < ITERATIONS; i++) {
+  dataFrameEq(df1, df2);
+  dataFrameNe(df1, df2);
+  dataFrameGt(df1, df2);
+  dataFrameLe(df1, df2);
+}
+const total_ms = performance.now() - start; // elapsed ms across all timed passes
+
+console.log(
+  JSON.stringify({
+    function: "dataframe_compare_pair",
+    mean_ms: total_ms / ITERATIONS, // per pass, i.e. all four comparisons together
+    iterations: ITERATIONS,
+    total_ms,
+  }),
+);
diff --git a/benchmarks/tsb/bench_dataframe_itertuples.ts b/benchmarks/tsb/bench_dataframe_itertuples.ts
new file mode 100644
index 00000000..b1500b18
--- /dev/null
+++ b/benchmarks/tsb/bench_dataframe_itertuples.ts
@@ -0,0 +1,39 @@
+/**
+ * Benchmark: DataFrame.itertuples() — iterate over rows as record objects.
+ * Outputs JSON: {"function": "dataframe_itertuples", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+ */
+import { DataFrame } from "../../src/index.ts";
+
+const ROWS = 1_000;
+const WARMUP = 5; // untimed passes executed before measurement starts
+const ITERATIONS = 50; // timed passes; each pass drains the iterator once
+
+const df = DataFrame.fromColumns({
+  x: Array.from({ length: ROWS }, (_, i) => i * 1.5),
+  y: Array.from({ length: ROWS }, (_, i) => i * 2.5),
+  z: Array.from({ length: ROWS }, (_, i) => i * 3.5),
+});
+
+for (let i = 0; i < WARMUP; i++) {
+  for (const _row of df.itertuples()) {
+    /* warm up: rows are consumed and discarded */
+  }
+}
+
+const times: number[] = []; // one elapsed-ms sample per timed pass
+for (let i = 0; i < ITERATIONS; i++) {
+  const t0 = performance.now();
+  for (const _row of df.itertuples()) {
+    /* iterate: only the cost of producing each row is measured */
+  }
+  times.push(performance.now() - t0);
+}
+const total = times.reduce((a, b) => a + b, 0); // sum of per-pass samples, excludes loop overhead between passes
+console.log(
+  JSON.stringify({
+    function: "dataframe_itertuples",
+    mean_ms: total / ITERATIONS,
+    iterations: ITERATIONS,
+    total_ms: total,
+  }),
+);
diff --git a/benchmarks/tsb/bench_dataframe_transform_named.ts b/benchmarks/tsb/bench_dataframe_transform_named.ts
new file mode 100644
index 00000000..d45ab0f1
--- /dev/null
+++ b/benchmarks/tsb/bench_dataframe_transform_named.ts
@@ -0,0 +1,43 @@
+/**
+ * Benchmark: dataFrameTransform with named aggregation strings.
+ *
+ * Mirrors pandas DataFrame.transform(["sum", "mean", "cumsum"]) which applies
+ * multiple aggregation functions per column. Tests the string-name form of
+ * dataFrameTransform from stats/transform_agg.ts.
+ *
+ * Outputs JSON: {"function": "dataframe_transform_named", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+ */
+import { DataFrame, dataFrameTransform } from "../../src/index.ts";
+
+const ROWS = 10_000;
+const WARMUP = 3; // untimed passes executed before measurement starts
+const ITERATIONS = 20; // timed passes; each pass makes three transform calls
+
+const a = Array.from({ length: ROWS }, (_, i) => (i % 100) * 1.5 + 1);
+const b = Array.from({ length: ROWS }, (_, i) => ((i * 3) % 200) * 0.5 + 2);
+const c = Array.from({ length: ROWS }, (_, i) => ((i * 7) % 50) * 2.0 + 0.5);
+const df = DataFrame.fromColumns({ a, b, c });
+
+// Warm-up: two single-string transforms ("mean", "cumsum") and one array-of-strings transform (["sum", "mean"])
+for (let i = 0; i < WARMUP; i++) {
+  dataFrameTransform(df, "mean");
+  dataFrameTransform(df, "cumsum");
+  dataFrameTransform(df, ["sum", "mean"] as const);
+}
+
+const start = performance.now();
+for (let i = 0; i < ITERATIONS; i++) {
+  dataFrameTransform(df, "mean");
+  dataFrameTransform(df, "cumsum");
+  dataFrameTransform(df, ["sum", "mean"] as const);
+}
+const total_ms = performance.now() - start; // elapsed ms across all timed passes
+
+console.log(
+  JSON.stringify({
+    function: "dataframe_transform_named",
+    mean_ms: total_ms / ITERATIONS, // per pass, i.e. all three transform calls together
+    iterations: ITERATIONS,
+    total_ms,
+  }),
+);
diff --git a/benchmarks/tsb/bench_dataframe_update.ts b/benchmarks/tsb/bench_dataframe_update.ts
new file mode 100644
index 00000000..eaacbe9d
--- /dev/null
+++ b/benchmarks/tsb/bench_dataframe_update.ts
@@ -0,0 +1,46 @@
+/**
+ * Benchmark: dataFrameUpdate — in-place-style DataFrame value update.
+ *
+ * Mirrors pandas `DataFrame.update()`.
+ * Overwrites non-null values from `other` into `self`.
+ * Outputs JSON: {"function": "dataframe_update", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+ */
+import { Series, DataFrame, dataFrameUpdate } from "../../src/index.ts";
+
+const N = 10_000;
+const WARMUP = 20;
+const ITERATIONS = 200;
+
+// Build two DataFrames; `other` has null in ~2/3 of rows (so 1/3 rows are updated).
+const aData = Array.from({ length: N }, (_, i) => i * 1.0);
+const bData = Array.from({ length: N }, (_, i) => i * 2.0);
+
+const aOther = Array.from({ length: N }, (_, i) =>
+  i % 3 === 0 ? i * 10.0 : (null as unknown as number), // null on 2 of every 3 rows; the cast keeps the array typed number[]
+);
+const bOther = Array.from({ length: N }, (_, i) =>
+  i % 3 === 0 ? i * 20.0 : (null as unknown as number), // same null pattern as aOther
+);
+
+const df = DataFrame.fromColumns({ a: aData, b: bData }); // plain-array columns go through the fromColumns factory, as in the sibling benchmarks
+const other = DataFrame.fromColumns({ a: aOther, b: bOther });
+
+// Warm-up (untimed)
+for (let i = 0; i < WARMUP; i++) {
+  dataFrameUpdate(df, other);
+}
+
+const start = performance.now();
+for (let i = 0; i < ITERATIONS; i++) {
+  dataFrameUpdate(df, other);
+}
+const total_ms = performance.now() - start; // elapsed ms across all timed iterations
+
+console.log(
+  JSON.stringify({
+    function: "dataframe_update",
+    mean_ms: total_ms / ITERATIONS,
+    iterations: ITERATIONS,
+    total_ms: total_ms,
+  }),
+);
diff --git a/benchmarks/tsb/bench_errors.ts b/benchmarks/tsb/bench_errors.ts
new file mode 100644
index 00000000..fffbef27
--- /dev/null
+++ b/benchmarks/tsb/bench_errors.ts
@@ -0,0 +1,56 @@
+/**
+ * Benchmark: pd.errors namespace — instantiate and inspect pandas-compatible error classes.
+ *
+ * Covers the `errors` namespace from tsb:
+ * - errors.ValueError, errors.KeyError (base classes; errors.IndexError is not exercised by run())
+ * - errors.EmptyDataError, errors.MergeError, errors.OptionError
+ * - errors.IntCastingNaNError, errors.UnsortedIndexError
+ * - errors.ParserError, errors.PerformanceWarning, errors.InvalidIndexError
+ * - instanceof checks and .name/.message property access
+ *
+ * Outputs JSON: {"function": "errors", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+ */
+import { errors } from "../../src/index.ts";
+
+const WARMUP = 5;
+const ITERATIONS = 200;
+
+function run(): void { // one pass: construct ten error objects, then inspect each once
+  const e1 = new errors.ValueError("bad value");
+  const e2 = new errors.KeyError("missing key");
+  const e3 = new errors.MergeError("incompatible merge");
+  const e4 = new errors.EmptyDataError("no data");
+  const e5 = new errors.OptionError("unknown option");
+  const e6 = new errors.IntCastingNaNError(); // constructed without a message
+  const e7 = new errors.UnsortedIndexError(); // constructed without a message
+  const e8 = new errors.ParserError("unexpected token");
+  const e9 = new errors.PerformanceWarning("slow path");
+  const e10 = new errors.InvalidIndexError("bad index");
+
+  const _a = e1 instanceof errors.ValueError;
+  const _b = e2 instanceof errors.KeyError;
+  const _c = e3 instanceof Error;
+  const _d = e4.name === "EmptyDataError";
+  const _e = e5.message.includes("unknown");
+  const _f = e6 instanceof errors.IntCastingNaNError;
+  const _g = e7 instanceof errors.UnsortedIndexError;
+  const _h = e8.name === "ParserError";
+  const _i = e9.name === "PerformanceWarning";
+  const _j = e10 instanceof errors.InvalidIndexError;
+  void [_a, _b, _c, _d, _e, _f, _g, _h, _i, _j]; // reference every result so none is an unused local
+}
+
+for (let i = 0; i < WARMUP; i++) run(); // untimed warm-up
+
+const start = performance.now();
+for (let i = 0; i < ITERATIONS; i++) run();
+const total_ms = performance.now() - start; // elapsed ms across all timed passes
+
+console.log(
+  JSON.stringify({
+    function: "errors",
+    mean_ms: total_ms / ITERATIONS,
+    iterations: ITERATIONS,
+    total_ms,
+  }),
+);
diff --git
a/benchmarks/tsb/bench_extensions.ts b/benchmarks/tsb/bench_extensions.ts
new file mode 100644
index 00000000..fb21dbbd
--- /dev/null
+++ b/benchmarks/tsb/bench_extensions.ts
@@ -0,0 +1,114 @@
+/**
+ * Benchmark: pd.api.extensions — ExtensionDtype / ExtensionArray / accessor registration.
+ *
+ * Covers:
+ * - ExtensionDtype subclassing → pandas `pandas.api.extensions.ExtensionDtype`
+ * - ExtensionArray subclassing → pandas `pandas.api.extensions.ExtensionArray`
+ * - registerExtensionDtype() → pandas `register_extension_dtype()`
+ * - constructExtensionDtypeFromString() → pandas dtype string resolution
+ * - registerSeriesAccessor() → pandas `register_series_accessor()`
+ * - registerDataFrameAccessor() → pandas `register_dataframe_accessor()`
+ * - getRegisteredAccessors() → accessor registry lookup
+ *
+ * Outputs JSON: {"function": "extensions", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+ */
+import {
+  ExtensionDtype,
+  ExtensionArray,
+  registerExtensionDtype,
+  constructExtensionDtypeFromString,
+  registerSeriesAccessor,
+  registerDataFrameAccessor,
+  getRegisteredAccessors,
+} from "../../src/index.ts";
+
+const WARMUP = 5;
+const ITERATIONS = 200;
+
+class TagDtype extends ExtensionDtype { // minimal string-backed dtype used only by this benchmark
+  override get name(): string {
+    return "tag";
+  }
+  override get type(): abstract new (...args: readonly unknown[]) => unknown {
+    return String as unknown as abstract new (...args: readonly unknown[]) => unknown;
+  }
+  override get kind(): string {
+    return "O";
+  }
+  override get isNumeric(): boolean {
+    return false;
+  }
+  static override construct_from_string(dtype: string): TagDtype | null {
+    return dtype === "tag" ? new TagDtype() : null; // only the exact string "tag" resolves
+  }
+}
+
+class TagArray extends ExtensionArray { // read-only wrapper around a string array
+  private readonly _data: readonly string[];
+  constructor(data: readonly string[]) {
+    super();
+    this._data = data;
+  }
+  override get dtype(): TagDtype {
+    return new TagDtype(); // allocates a fresh TagDtype on every access
+  }
+  override get length(): number {
+    return this._data.length;
+  }
+  override getItem(i: number): string | null {
+    const idx = i < 0 ? this._data.length + i : i; // negative i counts back from the end
+    return this._data[idx] ?? null; // out-of-range index yields null
+  }
+  override slice(start: number, stop: number): TagArray {
+    return new TagArray(this._data.slice(start, stop));
+  }
+}
+
+class GeoAccessor { // stub accessor; ignores the wrapped object
+  constructor(_obj: unknown) {}
+  distance(): number {
+    return 0;
+  }
+}
+
+// Register once — idempotent for repeated benchmark runs
+registerExtensionDtype(TagDtype as unknown as { new (): ExtensionDtype } & typeof ExtensionDtype);
+registerSeriesAccessor("geo_bench", GeoAccessor);
+registerDataFrameAccessor("geo_bench", GeoAccessor);
+
+function run(): void { // one pass: dtype lookup, array operations, three registry lookups
+  const dt = constructExtensionDtypeFromString("tag");
+  const _name = dt?.name;
+
+  const arr = new TagArray(["alpha", "beta", "gamma", "delta", "epsilon"]);
+  const _len = arr.length;
+  const _item = arr.getItem(2);
+  const _neg = arr.getItem(-1);
+  const _sliced = arr.slice(1, 4);
+  const _dtype = arr.dtype.name;
+  const _numeric = arr.dtype.isNumeric;
+
+  const seriesMap = getRegisteredAccessors("series");
+  const _hasSeries = seriesMap.has("geo_bench");
+  const dfMap = getRegisteredAccessors("dataframe");
+  const _hasDf = dfMap.has("geo_bench");
+  const idxMap = getRegisteredAccessors("index"); // no index accessor is registered by this script
+  const _idxSize = idxMap.size;
+
+  void [_name, _len, _item, _neg, _sliced, _dtype, _numeric, _hasSeries, _hasDf, _idxSize]; // reference every result
+}
+
+for (let i = 0; i < WARMUP; i++) run(); // untimed warm-up
+
+const start = performance.now();
+for (let i = 0; i < ITERATIONS; i++) run();
+const total_ms = performance.now() - start; // elapsed ms across all timed passes
+
+console.log(
+  JSON.stringify({
+    function: "extensions",
+    mean_ms: total_ms / ITERATIONS,
+    iterations: ITERATIONS,
+    total_ms,
+  }),
+);
diff --git a/benchmarks/tsb/bench_filter_series.ts b/benchmarks/tsb/bench_filter_series.ts
new file mode 100644
index 00000000..d1bdef87
--- /dev/null
+++ b/benchmarks/tsb/bench_filter_series.ts
@@ -0,0 +1,38 @@
+/**
+ * Benchmark: filterSeries — filter Series index labels by `items` and `like` (the `regex` mode is not exercised here)
+ * Outputs JSON: {"function": "filter_series", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+ */
+import { Series, filterSeries } from "../../src/index.ts";
+
+const N = 100_000;
+const WARMUP = 3;
+const ITERATIONS = 10;
+
+// Series with string labels: "label_0", "label_1", ..., "label_N-1"
+const labels = Array.from({ length: N }, (_, i) => `label_${i}`);
+const values = Array.from({ length: N }, (_, i) => i * 0.5);
+const s = new Series({ data: values, index: labels });
+
+// Pre-build the array of 1000 labels to keep (every 100th label: label_0, label_100, ..., label_99900)
+const keepItems = Array.from({ length: 1_000 }, (_, i) => `label_${i * 100}`);
+
+for (let i = 0; i < WARMUP; i++) {
+  filterSeries(s, { items: keepItems });
+  filterSeries(s, { like: "label_5" });
+}
+
+const start = performance.now();
+for (let i = 0; i < ITERATIONS; i++) {
+  filterSeries(s, { items: keepItems });
+  filterSeries(s, { like: "label_5" });
+}
+const total = performance.now() - start; // elapsed ms across all timed passes
+
+console.log(
+  JSON.stringify({
+    function: "filter_series",
+    mean_ms: total / ITERATIONS, // per pass, i.e. one `items` filter plus one `like` filter
+    iterations: ITERATIONS,
+    total_ms: total,
+  }),
+);
diff --git a/benchmarks/tsb/bench_get_set_option.ts b/benchmarks/tsb/bench_get_set_option.ts
new file mode 100644
index 00000000..c9c4d07a
--- /dev/null
+++ b/benchmarks/tsb/bench_get_set_option.ts
@@ -0,0 +1,40 @@
+/**
+ * Benchmark: getOption / setOption / resetOption — pandas options API.
+ *
+ * Mirrors pandas `pd.get_option`, `pd.set_option`, `pd.reset_option`.
+ * Outputs JSON: {"function": "get_set_option", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+ */
+import { getOption, setOption, resetOption } from "../../src/index.ts";
+
+const WARMUP = 10;
+const ITERATIONS = 10_000;
+
+// Warm-up (untimed): same get → set → reset cycle as the timed loop, with fixed values
+for (let i = 0; i < WARMUP; i++) {
+  getOption("display.max_rows");
+  setOption("display.max_rows", 50);
+  resetOption("display.max_rows");
+  getOption("display.precision");
+  setOption("display.precision", 3);
+  resetOption("display.precision");
+}
+
+const start = performance.now();
+for (let i = 0; i < ITERATIONS; i++) {
+  getOption("display.max_rows");
+  setOption("display.max_rows", (i % 90) + 10); // value cycles through 10..99
+  resetOption("display.max_rows"); // reset after every set, so each pass starts from the same state
+  getOption("display.precision");
+  setOption("display.precision", (i % 8) + 2); // value cycles through 2..9
+  resetOption("display.precision");
+}
+const total_ms = performance.now() - start; // elapsed ms across all timed passes
+
+console.log(
+  JSON.stringify({
+    function: "get_set_option",
+    mean_ms: total_ms / ITERATIONS, // per pass, i.e. six option calls together
+    iterations: ITERATIONS,
+    total_ms: total_ms,
+  }),
+);
diff --git a/benchmarks/tsb/bench_item_bool_extract.ts b/benchmarks/tsb/bench_item_bool_extract.ts
new file mode 100644
index 00000000..6b2a940e
--- /dev/null
+++ b/benchmarks/tsb/bench_item_bool_extract.ts
@@ -0,0 +1,49 @@
+/**
+ * Benchmark: itemSeries / boolSeries / boolDataFrame — single-element scalar extraction.
+ *
+ * Covers functions in scalar_extract.ts not benchmarked by bench_scalar_extract
+ * (which benchmarks squeeze, firstValidIndex, lastValidIndex but not item/bool).
+ *
+ * Mirrors pandas:
+ * - Series.item() → itemSeries
+ * - bool(pd.Series([True])) → boolSeries
+ * - bool(pd.DataFrame([[1]])) → boolDataFrame
+ *
+ * Single-element objects are created once outside the loop; the hot path is
+ * the repeated extraction call itself.
+ *
+ * Outputs JSON: {"function": "item_bool_extract", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+ */
+import { Series, DataFrame, itemSeries, boolSeries, boolDataFrame } from "../../src/index.ts";
+
+const WARMUP = 20;
+const ITERATIONS = 100_000;
+
+// Single-element Series / DataFrames (reused each iteration).
+const numericSeries = new Series({ data: [42.5] });
+const trueSeries = new Series({ data: [true] });
+const trueDF = DataFrame.fromColumns({ x: [true] }); // one column, one row
+
+// Warm-up (untimed)
+for (let i = 0; i < WARMUP; i++) {
+  itemSeries(numericSeries);
+  boolSeries(trueSeries);
+  boolDataFrame(trueDF);
+}
+
+const start = performance.now();
+for (let i = 0; i < ITERATIONS; i++) {
+  itemSeries(numericSeries);
+  boolSeries(trueSeries);
+  boolDataFrame(trueDF);
+}
+const total_ms = performance.now() - start; // elapsed ms across all timed passes
+
+console.log(
+  JSON.stringify({
+    function: "item_bool_extract",
+    mean_ms: total_ms / ITERATIONS, // per pass, i.e. all three extractions together
+    iterations: ITERATIONS,
+    total_ms,
+  }),
+);
diff --git a/benchmarks/tsb/bench_join_all.ts b/benchmarks/tsb/bench_join_all.ts
new file mode 100644
index 00000000..2dfb3358
--- /dev/null
+++ b/benchmarks/tsb/bench_join_all.ts
@@ -0,0 +1,36 @@
+/**
+ * Benchmark: joinAll — sequential left-join of 4 DataFrames each with 5k rows.
+ * Outputs JSON: {"function": "join_all", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+ */
+import { DataFrame, joinAll } from "../../src/index.ts";
+
+const N = 5_000;
+const WARMUP = 3;
+const ITERATIONS = 10;
+
+const idx = Array.from({ length: N }, (_, i) => String(i)); // string labels "0".."4999", shared by all four frames
+
+// Base DataFrame and three others — distinct column names, shared index
+const base = DataFrame.fromColumns({ a: Array.from({ length: N }, (_, i) => i) }, { index: idx });
+const df1 = DataFrame.fromColumns({ b: Array.from({ length: N }, (_, i) => i * 2) }, { index: idx });
+const df2 = DataFrame.fromColumns({ c: Array.from({ length: N }, (_, i) => i * 3) }, { index: idx });
+const df3 = DataFrame.fromColumns({ d: Array.from({ length: N }, (_, i) => i * 4) }, { index: idx });
+
+for (let i = 0; i < WARMUP; i++) {
+  joinAll(base, [df1, df2, df3]);
+}
+
+const start = performance.now();
+for (let i = 0; i < ITERATIONS; i++) {
+  joinAll(base, [df1, df2, df3]);
+}
+const total = performance.now() - start; // elapsed ms across all timed iterations
+
+console.log(
+  JSON.stringify({
+    function: "join_all",
+    mean_ms: total / ITERATIONS,
+    iterations: ITERATIONS,
+    total_ms: total,
+  }),
+);
diff --git a/benchmarks/tsb/bench_math_ops.ts b/benchmarks/tsb/bench_math_ops.ts
new file mode 100644
index 00000000..5559bde5
--- /dev/null
+++ b/benchmarks/tsb/bench_math_ops.ts
@@ -0,0 +1,40 @@
+/**
+ * Benchmark: math_ops — absSeries / absDataFrame / roundSeries / roundDataFrame on 100k rows.
+ * Outputs JSON: {"function": "math_ops", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+ */
+import { Series, DataFrame, absSeries, absDataFrame, roundSeries, roundDataFrame } from "../../src/index.ts";
+
+const SIZE = 100_000;
+const WARMUP = 5;
+const ITERATIONS = 50;
+
+const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => (i % 2 === 0 ? -(i + 0.567) : i + 0.567)) }); // sign alternates: negative at even i
+const df = DataFrame.fromColumns({
+  a: Array.from({ length: SIZE }, (_, i) => -(i + 0.123)), // all negative
+  b: Array.from({ length: SIZE }, (_, i) => i + 0.456), // all positive
+});
+
+for (let i = 0; i < WARMUP; i++) {
+  absSeries(s);
+  absDataFrame(df);
+  roundSeries(s, 1);
+  roundDataFrame(df, 1);
+}
+
+const start = performance.now();
+for (let i = 0; i < ITERATIONS; i++) {
+  absSeries(s);
+  absDataFrame(df);
+  roundSeries(s, 1);
+  roundDataFrame(df, 1);
+}
+const total = performance.now() - start; // elapsed ms across all timed passes
+
+console.log(
+  JSON.stringify({
+    function: "math_ops",
+    mean_ms: total / ITERATIONS, // per pass, i.e. all four calls together
+    iterations: ITERATIONS,
+    total_ms: total,
+  }),
+);
diff --git a/benchmarks/tsb/bench_merge_asof.ts b/benchmarks/tsb/bench_merge_asof.ts
new file mode 100644
index 00000000..9ef2a2b8
--- /dev/null
+++ b/benchmarks/tsb/bench_merge_asof.ts
@@ -0,0 +1,39 @@
+/**
+ * Benchmark: mergeAsof — backward asof join of two 10k-row sorted DataFrames.
+ * Outputs JSON: {"function": "merge_asof", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+ */
+import { DataFrame, mergeAsof } from "../../src/index.ts";
+
+const N = 10_000;
+const WARMUP = 3;
+const ITERATIONS = 10;
+
+// Trades sorted by time: 0, 2, 4, ...
+const tradeTimes = Array.from({ length: N }, (_, i) => i * 2);
+const prices = Array.from({ length: N }, (_, i) => 100.0 + i * 0.5);
+
+// Quotes sorted by time, sparser: 0, 3, 6, ...
+const quoteTimes = Array.from({ length: N }, (_, i) => i * 3);
+const bids = Array.from({ length: N }, (_, i) => 99.0 + i * 0.5);
+
+const trades = DataFrame.fromColumns({ time: tradeTimes, price: prices });
+const quotes = DataFrame.fromColumns({ time: quoteTimes, bid: bids });
+
+for (let i = 0; i < WARMUP; i++) {
+  mergeAsof(trades, quotes, { on: "time" }); // only `on` is passed; every other option is left at its default
+}
+
+const start = performance.now();
+for (let i = 0; i < ITERATIONS; i++) {
+  mergeAsof(trades, quotes, { on: "time" });
+}
+const total = performance.now() - start; // elapsed ms across all timed iterations
+
+console.log(
+  JSON.stringify({
+    function: "merge_asof",
+    mean_ms: total / ITERATIONS,
+    iterations: ITERATIONS,
+    total_ms: total,
+  }),
+);
diff --git a/benchmarks/tsb/bench_na_ops.ts b/benchmarks/tsb/bench_na_ops.ts
new file mode 100644
index 00000000..31990d0c
--- /dev/null
+++ b/benchmarks/tsb/bench_na_ops.ts
@@ -0,0 +1,47 @@
+/**
+ * Benchmark: na_ops — isna / notna / ffillSeries / bfillSeries / dataFrameFfill / dataFrameBfill on 100k rows.
+ * Outputs JSON: {"function": "na_ops", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+ */
+import { Series, DataFrame, isna, notna, ffillSeries, bfillSeries, dataFrameFfill, dataFrameBfill } from "../../src/index.ts";
+
+const SIZE = 100_000;
+const WARMUP = 5;
+const ITERATIONS = 50;
+
+const data: (number | null)[] = Array.from({ length: SIZE }, (_, i) =>
+  i % 5 === 0 ? null : i, // every 5th element (including index 0) is null
+);
+const s = new Series({ data });
+const df = DataFrame.fromColumns({
+  a: data, // same array that backs `s`
+  b: Array.from({ length: SIZE }, (_, i) => (i % 7 === 0 ? null : i * 2)), // every 7th element is null
+});
+
+for (let i = 0; i < WARMUP; i++) {
+  isna(s);
+  notna(s);
+  ffillSeries(s);
+  bfillSeries(s);
+  dataFrameFfill(df);
+  dataFrameBfill(df);
+}
+
+const start = performance.now();
+for (let i = 0; i < ITERATIONS; i++) {
+  isna(s);
+  notna(s);
+  ffillSeries(s);
+  bfillSeries(s);
+  dataFrameFfill(df);
+  dataFrameBfill(df);
+}
+const total = performance.now() - start; // elapsed ms across all timed passes
+
+console.log(
+  JSON.stringify({
+    function: "na_ops",
+    mean_ms: total / ITERATIONS, // per pass, i.e. all six calls together
+    iterations: ITERATIONS,
+    total_ms: total,
+  }),
+);
diff --git a/benchmarks/tsb/bench_nanprod.ts b/benchmarks/tsb/bench_nanprod.ts
new file mode 100644
index 00000000..52350baa
--- /dev/null
+++ b/benchmarks/tsb/bench_nanprod.ts
@@ -0,0 +1,33 @@
+/**
+ * Benchmark: nanprod() — product of array values, ignoring NaN/null.
+ * Outputs JSON: {"function": "nanprod", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+ */
+import { nanprod } from "../../src/index.ts";
+
+const SIZE = 100_000;
+const WARMUP = 5;
+const ITERATIONS = 50;
+
+const data = Array.from({ length: SIZE }, (_, i) =>
+  i % 13 === 0 ? null : 1 + (i % 7) * 0.0001, // null every 13th element; other factors lie in [1, 1.0006]
+);
+
+for (let i = 0; i < WARMUP; i++) {
+  nanprod(data);
+}
+
+const times: number[] = []; // one elapsed-ms sample per timed call
+for (let i = 0; i < ITERATIONS; i++) {
+  const t0 = performance.now();
+  nanprod(data);
+  times.push(performance.now() - t0);
+}
+const total = times.reduce((a, b) => a + b, 0); // sum of the per-call samples
+console.log(
+  JSON.stringify({
+    function: "nanprod",
+    mean_ms: total / ITERATIONS,
+    iterations: ITERATIONS,
+    total_ms: total,
+  }),
+);
diff --git a/benchmarks/tsb/bench_notna_boolean.ts b/benchmarks/tsb/bench_notna_boolean.ts
new file mode 100644
index 00000000..ecd113db
--- /dev/null
+++ b/benchmarks/tsb/bench_notna_boolean.ts
@@ -0,0 +1,41 @@
+/**
+ * Benchmark: notna_boolean — keepTrue / keepFalse / filterBy on 100k rows.
+ * Outputs JSON: {"function": "notna_boolean", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+ */
+import { Series, DataFrame, keepTrue, keepFalse, filterBy } from "../../src/index.ts";
+
+const SIZE = 100_000;
+const WARMUP = 5;
+const ITERATIONS = 50;
+
+const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => i) });
+const mask = new Series({ data: Array.from({ length: SIZE }, (_, i) => i % 2 === 0) }); // true at even positions (half the rows)
+const boolArr = Array.from({ length: SIZE }, (_, i) => i % 3 !== 0); // plain boolean array, false at every 3rd position
+
+const df = DataFrame.fromColumns({
+  a: Array.from({ length: SIZE }, (_, i) => i),
+  b: Array.from({ length: SIZE }, (_, i) => i * 2),
+});
+
+for (let i = 0; i < WARMUP; i++) {
+  keepTrue(s, mask);
+  keepFalse(s, mask);
+  filterBy(df, boolArr);
+}
+
+const start = performance.now();
+for (let i = 0; i < ITERATIONS; i++) {
+  keepTrue(s, mask);
+  keepFalse(s, mask);
+  filterBy(df, boolArr);
+}
+const total = performance.now() - start; // elapsed ms across all timed passes
+
+console.log(
+  JSON.stringify({
+    function: "notna_boolean",
+    mean_ms: total / ITERATIONS, // per pass, i.e. all three calls together
+    iterations: ITERATIONS,
+    total_ms: total,
+  }),
+);
diff --git a/benchmarks/tsb/bench_numeric_ops_log2_exp.ts b/benchmarks/tsb/bench_numeric_ops_log2_exp.ts
new file mode 100644
index 00000000..b717b219
--- /dev/null
+++ b/benchmarks/tsb/bench_numeric_ops_log2_exp.ts
@@ -0,0 +1,66 @@
+/**
+ * Benchmark: seriesLog2 / seriesLog10 / seriesExp / seriesSign and DataFrame variants.
+ *
+ * Mirrors numpy/pandas element-wise math functions on 100k-row data:
+ * - np.log2(s), np.log10(s), np.exp(s), np.sign(s)
+ * - DataFrame.apply(np.log2), etc.
+ *
+ * Outputs JSON: {"function": "numeric_ops_log2_exp", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+ */
+import {
+  Series,
+  DataFrame,
+  seriesLog2,
+  seriesLog10,
+  seriesExp,
+  seriesSign,
+  dataFrameLog2,
+  dataFrameLog10,
+  dataFrameExp,
+  dataFrameSign,
+} from "../../src/index.ts";
+
+const SIZE = 100_000;
+const WARMUP = 5;
+const ITERATIONS = 30;
+
+// Strictly positive values (0.1 … 10_000) so log2/log10 are defined. NOTE(review): inputs this large presumably overflow exp to Infinity for most rows — confirm that is intended.
+const data = Array.from({ length: SIZE }, (_, i) => (i + 1) * 0.1);
+const s = new Series({ data });
+const df = DataFrame.fromColumns({
+  a: Array.from({ length: SIZE }, (_, i) => (i + 1) * 0.1), // same values as `data`
+  b: Array.from({ length: SIZE }, (_, i) => (i + 1) * 0.2),
+});
+
+for (let i = 0; i < WARMUP; i++) {
+  seriesLog2(s);
+  seriesLog10(s);
+  seriesExp(s);
+  seriesSign(s);
+  dataFrameLog2(df);
+  dataFrameLog10(df);
+  dataFrameExp(df);
+  dataFrameSign(df);
+}
+
+const start = performance.now();
+for (let i = 0; i < ITERATIONS; i++) {
+  seriesLog2(s);
+  seriesLog10(s);
+  seriesExp(s);
+  seriesSign(s);
+  dataFrameLog2(df);
+  dataFrameLog10(df);
+  dataFrameExp(df);
+  dataFrameSign(df);
+}
+const total_ms = performance.now() - start; // elapsed ms across all timed passes
+
+console.log(
+  JSON.stringify({
+    function: "numeric_ops_log2_exp",
+    mean_ms: total_ms / ITERATIONS, // per pass, i.e. all eight calls together
+    iterations: ITERATIONS,
+    total_ms,
+  }),
+);
diff --git a/benchmarks/tsb/bench_option_context.ts b/benchmarks/tsb/bench_option_context.ts
new file mode 100644
index 00000000..f52b9729
--- /dev/null
+++ b/benchmarks/tsb/bench_option_context.ts
@@ -0,0 +1,46 @@
+/**
+ * Benchmark: describeOption / optionContext — pandas options describe and context manager.
+ *
+ * The existing bench_get_set_option covers getOption / setOption / resetOption.
+ * This benchmark covers the remaining options API:
+ * - describeOption(key?)
→ string — describe one or all option(s)
+ * - optionContext("key", value).enter() / .exit() — temporary option override
+ *
+ * Mirrors pandas:
+ * - pd.describe_option("display.max_rows") → describeOption
+ * - with pd.option_context(...) → optionContext + enter/exit
+ *
+ * Outputs JSON: {"function": "option_context", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+ */
+import { describeOption, optionContext } from "../../src/index.ts";
+
+const WARMUP = 20;
+const ITERATIONS = 50_000;
+
+// Warm-up (untimed)
+for (let i = 0; i < WARMUP; i++) {
+  describeOption("display.max_rows");
+  describeOption("display.precision");
+  const ctx = optionContext("display.max_rows", 50, "display.precision", 3);
+  ctx.enter();
+  ctx.exit();
+}
+
+const start = performance.now();
+for (let i = 0; i < ITERATIONS; i++) {
+  describeOption("display.max_rows");
+  describeOption("display.precision");
+  const ctx = optionContext("display.max_rows", 50, "display.precision", 3); // a new context object per pass, overriding two options
+  ctx.enter();
+  ctx.exit(); // exit immediately; nothing runs inside the context
+}
+const total_ms = performance.now() - start; // elapsed ms across all timed passes
+
+console.log(
+  JSON.stringify({
+    function: "option_context",
+    mean_ms: total_ms / ITERATIONS,
+    iterations: ITERATIONS,
+    total_ms,
+  }),
+);
diff --git a/benchmarks/tsb/bench_pow_mod.ts b/benchmarks/tsb/bench_pow_mod.ts
new file mode 100644
index 00000000..1873099c
--- /dev/null
+++ b/benchmarks/tsb/bench_pow_mod.ts
@@ -0,0 +1,40 @@
+/**
+ * Benchmark: seriesPow, seriesMod, dataFramePow on 100k rows
+ */
+import { Series, DataFrame, seriesPow, seriesMod, dataFramePow } from "../../src/index.ts";
+
+const ROWS = 100_000;
+const WARMUP = 3;
+const ITERATIONS = 10;
+
+const data = Array.from({ length: ROWS }, (_, i) => (i % 100) + 1); // integers cycling 1..100
+const s = new Series({ data });
+
+const dfData = {
+  a: Array.from({ length: ROWS }, (_, i) => (i % 100) + 1),
+  b: Array.from({ length: ROWS }, (_, i) => (i % 50) + 1),
+};
+const df = DataFrame.fromColumns(dfData); // plain-array columns go through the fromColumns factory, as in the sibling benchmarks
+
+for (let i = 0; i < WARMUP; i++) {
+  seriesPow(s, 2);
+  seriesMod(s, 7);
+  dataFramePow(df, 2);
+}
+
+const start = performance.now();
+for (let i = 0; i < ITERATIONS; i++) {
+  seriesPow(s, 2);
+  seriesMod(s, 7);
+  dataFramePow(df, 2);
+}
+const total = performance.now() - start; // elapsed ms across all timed passes
+
+console.log(
+  JSON.stringify({
+    function: "pow_mod",
+    mean_ms: total / ITERATIONS, // per pass, i.e. all three calls together
+    iterations: ITERATIONS,
+    total_ms: total,
+  }),
+);
diff --git a/benchmarks/tsb/bench_read_html.ts b/benchmarks/tsb/bench_read_html.ts
new file mode 100644
index 00000000..3cbc7149
--- /dev/null
+++ b/benchmarks/tsb/bench_read_html.ts
@@ -0,0 +1,43 @@
+/**
+ * Benchmark: readHtml — parse HTML tables into DataFrames.
+ * Outputs JSON: {"function": "read_html", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+ */
+import { readHtml } from "../../src/index.ts";
+
+const ROWS = 1_000;
+const WARMUP = 3;
+const ITERATIONS = 20;
+
+// Build an HTML string holding one <table>: a four-column header row plus `rows` body rows.
+function buildHtml(rows: number): string {
+  const header = "<tr><th>id</th><th>name</th><th>value</th><th>score</th></tr>";
+  const bodyRows: string[] = [];
+  for (let i = 0; i < rows; i++) {
+    bodyRows.push(
+      `<tr><td>${i}</td><td>item_${i % 100}</td><td>${(i * 1.5).toFixed(2)}</td><td>${Math.sin(i * 0.01).toFixed(6)}</td></tr>`,
+    );
+  }
+  return `<table><thead>${header}</thead><tbody>${bodyRows.join("")}</tbody></table>`;
+}
+
+const html = buildHtml(ROWS); // built once; only parsing is timed
+
+// Warm-up (untimed)
+for (let i = 0; i < WARMUP; i++) {
+  readHtml(html);
+}
+
+const start = performance.now();
+for (let i = 0; i < ITERATIONS; i++) {
+  readHtml(html);
+}
+const total = performance.now() - start; // elapsed ms across all timed iterations
+
+console.log(
+  JSON.stringify({
+    function: "read_html",
+    mean_ms: total / ITERATIONS,
+    iterations: ITERATIONS,
+    total_ms: total,
+  }),
+);
diff --git a/benchmarks/tsb/bench_reduce_ops.ts b/benchmarks/tsb/bench_reduce_ops.ts
new file mode 100644
index 00000000..f2e524f7
--- /dev/null
+++ b/benchmarks/tsb/bench_reduce_ops.ts
@@ -0,0 +1,42 @@
+/**
+ * Benchmark: reduce_ops — nuniqueSeries / anySeries / allSeries / nunique(df) on 100k rows.
+ * Outputs JSON: {"function": "reduce_ops", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+ */
+import { Series, DataFrame, nuniqueSeries, anySeries, allSeries, nunique } from "../../src/index.ts";
+
+const SIZE = 100_000;
+const WARMUP = 5;
+const ITERATIONS = 50;
+
+const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => i % 1000) }); // 1000 distinct values
+const boolSeries = new Series({ data: Array.from({ length: SIZE }, (_, i) => i > 0) }); // false only at index 0, true everywhere else
+const df = DataFrame.fromColumns({
+  a: Array.from({ length: SIZE }, (_, i) => i % 500),
+  b: Array.from({ length: SIZE }, (_, i) => i % 200),
+  c: Array.from({ length: SIZE }, (_, i) => i % 100),
+});
+
+for (let i = 0; i < WARMUP; i++) {
+  nuniqueSeries(s);
+  anySeries(boolSeries);
+  allSeries(boolSeries);
+  nunique(df);
+}
+
+const start = performance.now();
+for (let i = 0; i < ITERATIONS; i++) {
+  nuniqueSeries(s);
+  anySeries(boolSeries);
+  allSeries(boolSeries);
+  nunique(df);
+}
+const total = performance.now() - start; // elapsed ms across all timed passes
+
+console.log(
+  JSON.stringify({
+    function: "reduce_ops",
+    mean_ms: total / ITERATIONS, // per pass, i.e. all four calls together
+    iterations: ITERATIONS,
+    total_ms: total,
+  }),
+);
diff --git a/benchmarks/tsb/bench_rename_ops.ts b/benchmarks/tsb/bench_rename_ops.ts
new file mode 100644
index 00000000..9277e6e6
--- /dev/null
+++
b/benchmarks/tsb/bench_rename_ops.ts
@@ -0,0 +1,41 @@
+/**
+ * Benchmark: rename_ops — renameSeriesIndex / renameDataFrame / addPrefixDataFrame / addSuffixDataFrame on 100k rows.
+ * Outputs JSON: {"function": "rename_ops", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+ */
+import { Series, DataFrame, renameSeriesIndex, renameDataFrame, addPrefixDataFrame, addSuffixDataFrame } from "../../src/index.ts";
+
+const SIZE = 100_000;
+const WARMUP = 5;
+const ITERATIONS = 50;
+
+const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => i), index: Array.from({ length: SIZE }, (_, i) => `row_${i}`) }); // string index labels row_0 … row_99999
+const df = DataFrame.fromColumns({
+  col_a: Array.from({ length: SIZE }, (_, i) => i),
+  col_b: Array.from({ length: SIZE }, (_, i) => i * 2),
+  col_c: Array.from({ length: SIZE }, (_, i) => i * 3),
+});
+
+for (let i = 0; i < WARMUP; i++) {
+  renameSeriesIndex(s, (lbl) => `new_${String(lbl)}`);
+  renameDataFrame(df, { columns: { col_a: "a", col_b: "b" } });
+  addPrefixDataFrame(df, "pre_");
+  addSuffixDataFrame(df, "_suf");
+}
+
+const start = performance.now();
+for (let i = 0; i < ITERATIONS; i++) {
+  renameSeriesIndex(s, (lbl) => `new_${String(lbl)}`); // mapper form: called with each index label
+  renameDataFrame(df, { columns: { col_a: "a", col_b: "b" } }); // mapping form: col_c is not in the mapping
+  addPrefixDataFrame(df, "pre_");
+  addSuffixDataFrame(df, "_suf");
+}
+const total = performance.now() - start; // elapsed ms across all timed passes
+
+console.log(
+  JSON.stringify({
+    function: "rename_ops",
+    mean_ms: total / ITERATIONS, // per pass, i.e. all four calls together
+    iterations: ITERATIONS,
+    total_ms: total,
+  }),
+);
diff --git a/benchmarks/tsb/bench_resample_dataframe.ts b/benchmarks/tsb/bench_resample_dataframe.ts
new file mode 100644
index 00000000..f9e656e7
--- /dev/null
+++ b/benchmarks/tsb/bench_resample_dataframe.ts
@@ -0,0 +1,49 @@
+/**
+ * Benchmark: resampleDataFrame — DataFrame resampling with multiple aggregations.
+ *
+ * The existing `resample` benchmark only covers Series.
This benchmark exercises + * resampleDataFrame on a multi-column datetime-indexed DataFrame, mirroring pandas + * df.resample("1h").mean() / .sum() / .min(). + * + * Outputs JSON: {"function": "resample_dataframe", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, resampleDataFrame } from "../../src/index.ts"; + +const SIZE = 50_000; +const WARMUP = 3; +const ITERATIONS = 30; + +const base = new Date("2020-01-01T00:00:00Z").getTime(); +const idx = Array.from({ length: SIZE }, (_, i) => new Date(base + i * 60_000)); + +const df = DataFrame.fromColumns( + { + a: Array.from({ length: SIZE }, (_, i) => Math.sin(i * 0.01) * 50 + 50), + b: Array.from({ length: SIZE }, (_, i) => Math.cos(i * 0.02) * 30 + 30), + c: Array.from({ length: SIZE }, (_, i) => (i % 100) * 1.5), + }, + { index: idx }, +); + +for (let i = 0; i < WARMUP; i++) { + resampleDataFrame(df, "H").mean(); + resampleDataFrame(df, "H").sum(); + resampleDataFrame(df, "H").min(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + resampleDataFrame(df, "H").mean(); + resampleDataFrame(df, "H").sum(); + resampleDataFrame(df, "H").min(); +} +const total_ms = performance.now() - start; + +console.log( + JSON.stringify({ + function: "resample_dataframe", + mean_ms: total_ms / ITERATIONS, + iterations: ITERATIONS, + total_ms, + }), +); diff --git a/benchmarks/tsb/bench_series_compare_pair.ts b/benchmarks/tsb/bench_series_compare_pair.ts new file mode 100644 index 00000000..ddf56659 --- /dev/null +++ b/benchmarks/tsb/bench_series_compare_pair.ts @@ -0,0 +1,42 @@ +/** + * Benchmark: Series-to-Series comparison operations (seriesNe, seriesGt, seriesLe). + * + * The existing `compare` benchmark only tests scalar comparison (s.eq(500)). + * This benchmark tests element-wise comparison between two Series of 100k elements, + * mirroring pandas s1.ne(s2), s1.gt(s2), s1.le(s2). 
+ * + * Outputs JSON: {"function": "series_compare_pair", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, seriesNe, seriesGt, seriesLe, seriesEq } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 100; + +const a = new Series({ data: Array.from({ length: SIZE }, (_, i) => (i * 1.7) % 1000) }); +const b = new Series({ data: Array.from({ length: SIZE }, (_, i) => (i * 2.3) % 1000) }); + +for (let i = 0; i < WARMUP; i++) { + seriesNe(a, b); + seriesGt(a, b); + seriesLe(a, b); + seriesEq(a, b); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + seriesNe(a, b); + seriesGt(a, b); + seriesLe(a, b); + seriesEq(a, b); +} +const total_ms = performance.now() - start; + +console.log( + JSON.stringify({ + function: "series_compare_pair", + mean_ms: total_ms / ITERATIONS, + iterations: ITERATIONS, + total_ms, + }), +); diff --git a/benchmarks/tsb/bench_series_dot_dataframe.ts b/benchmarks/tsb/bench_series_dot_dataframe.ts new file mode 100644 index 00000000..94e4f546 --- /dev/null +++ b/benchmarks/tsb/bench_series_dot_dataframe.ts @@ -0,0 +1,54 @@ +/** + * Benchmark: seriesDotDataFrame and dataFrameDotSeries — cross-form dot products. + * + * The existing bench_dot_matmul covers seriesDotSeries and dataFrameDotDataFrame. + * This benchmark exercises the remaining cross-form variants: + * - seriesDotDataFrame(s, df) → Series (Series × DataFrame matrix multiply) + * - dataFrameDotSeries(df, s) → Series (DataFrame × Series matrix multiply) + * + * Mirrors pandas: + * - pd.Series.dot(DataFrame) → pd.Series + * - pd.DataFrame.dot(Series) → pd.Series + * + * Dataset: 1000-element Series, 1000-row × 20-column DataFrame. 
+ * + * Outputs JSON: {"function": "series_dot_dataframe", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, DataFrame, seriesDotDataFrame, dataFrameDotSeries } from "../../src/index.ts"; + +const N = 1_000; +const K = 20; +const WARMUP = 5; +const ITERATIONS = 50; + +// Series with N elements, indexed 0..N-1 +const sData = Array.from({ length: N }, (_, i) => (i + 1) * 0.01); +const s = new Series({ data: sData }); + +// DataFrame: N rows × K columns, indexed 0..N-1, columns "c0".."c19" +const cols: Record<string, number[]> = {}; +for (let c = 0; c < K; c++) { + cols[`c${c}`] = Array.from({ length: N }, (_, i) => (i * K + c) * 0.001); +} +const df = DataFrame.fromColumns(cols); + +for (let i = 0; i < WARMUP; i++) { + seriesDotDataFrame(s, df); + dataFrameDotSeries(df, s); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + seriesDotDataFrame(s, df); + dataFrameDotSeries(df, s); +} +const total_ms = performance.now() - start; + +console.log( + JSON.stringify({ + function: "series_dot_dataframe", + mean_ms: total_ms / ITERATIONS, + iterations: ITERATIONS, + total_ms, + }), +); diff --git a/benchmarks/tsb/bench_series_format_table.ts b/benchmarks/tsb/bench_series_format_table.ts new file mode 100644 index 00000000..11683ffc --- /dev/null +++ b/benchmarks/tsb/bench_series_format_table.ts @@ -0,0 +1,44 @@ +/** + * Benchmark: seriesToMarkdown and seriesToLaTeX on a 500-element Series. + * + * Mirrors pandas Series.to_markdown() and Series.to_latex(). + * Exercises table-rendering of both numeric and mixed-type series. + */ +import { Series, seriesToMarkdown, seriesToLaTeX } from "../../src/index.ts"; +import type { Scalar } from "../../src/types.ts"; + +const N = 500; +const WARMUP = 3; +const ITERATIONS = 30; + +const numData: number[] = Array.from({ length: N }, (_, i) => Math.sin(i * 0.05) * 100); +const strData: Scalar[] = Array.from({ length: N }, (_, i) => (i % 10 === 0 ? 
null : `item_${i}`)); + +const numSeries = new Series({ data: numData }); +const strSeries = new Series({ data: strData }); + +// Warm-up +for (let i = 0; i < WARMUP; i++) { + seriesToMarkdown(numSeries); + seriesToLaTeX(numSeries); + seriesToMarkdown(strSeries); + seriesToLaTeX(strSeries); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + seriesToMarkdown(numSeries); + seriesToLaTeX(numSeries); + seriesToMarkdown(strSeries); + seriesToLaTeX(strSeries); +} +const total_ms = performance.now() - start; + +console.log( + JSON.stringify({ + function: "series_format_table", + mean_ms: total_ms / ITERATIONS, + iterations: ITERATIONS, + total_ms, + }), +); diff --git a/benchmarks/tsb/bench_series_items_iter.ts b/benchmarks/tsb/bench_series_items_iter.ts new file mode 100644 index 00000000..4750413c --- /dev/null +++ b/benchmarks/tsb/bench_series_items_iter.ts @@ -0,0 +1,44 @@ +/** + * Benchmark: Series.items() / Series.iteritems() — iterate over (label, value) pairs. 
+ * Outputs JSON: {"function": "series_items_iter", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series } from "../../src/index.ts"; + +const SIZE = 10_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const s = new Series({ + data: Array.from({ length: SIZE }, (_, i) => i * 1.1), + index: Array.from({ length: SIZE }, (_, i) => `row_${i}`), +}); + +for (let i = 0; i < WARMUP; i++) { + for (const _pair of s.items()) { + /* warm up */ + } + for (const _pair of s.iteritems()) { + /* warm up */ + } +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + for (const _pair of s.items()) { + /* iterate */ + } + for (const _pair of s.iteritems()) { + /* iterate */ + } + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log( + JSON.stringify({ + function: "series_items_iter", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_setaxis_toframe.ts b/benchmarks/tsb/bench_series_setaxis_toframe.ts new file mode 100644 index 00000000..e8a24fb3 --- /dev/null +++ b/benchmarks/tsb/bench_series_setaxis_toframe.ts @@ -0,0 +1,75 @@ +/** + * Benchmark: seriesToFrame / setAxisSeries / setAxisDataFrame / addPrefixSeries / addSuffixSeries + * + * Covers rename_ops functions not benchmarked by bench_rename_ops (which only benchmarks + * renameSeriesIndex, renameDataFrame, addPrefixDataFrame, addSuffixDataFrame). + * + * Mirrors pandas: + * - Series.to_frame() → seriesToFrame + * - Series.set_axis(labels) → setAxisSeries + * - DataFrame.set_axis(labels) → setAxisDataFrame + * - Series.add_prefix(prefix) → addPrefixSeries + * - Series.add_suffix(suffix) → addSuffixSeries + * + * Dataset: 50 000-element numeric Series; 50 000-row × 3-column DataFrame. 
+ * + * Outputs JSON: {"function": "series_setaxis_toframe", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { + Series, + DataFrame, + seriesToFrame, + setAxisSeries, + setAxisDataFrame, + addPrefixSeries, + addSuffixSeries, +} from "../../src/index.ts"; + +const SIZE = 50_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const data = Array.from({ length: SIZE }, (_, i) => i * 1.5); +const idx = Array.from({ length: SIZE }, (_, i) => `r${i}`); +const newIdx = Array.from({ length: SIZE }, (_, i) => `row_${i}`); + +const s = new Series({ data, index: idx, name: "values" }); +const df = DataFrame.fromColumns( + { + a: Array.from({ length: SIZE }, (_, i) => i), + b: Array.from({ length: SIZE }, (_, i) => i * 2), + c: Array.from({ length: SIZE }, (_, i) => i * 3), + }, + { index: idx }, +); +const newCols = ["col_a", "col_b", "col_c"]; + +// Warm-up +for (let i = 0; i < WARMUP; i++) { + seriesToFrame(s); + setAxisSeries(s, newIdx); + setAxisDataFrame(df, newIdx, 0); + setAxisDataFrame(df, newCols, 1); + addPrefixSeries(s, "pre_"); + addSuffixSeries(s, "_suf"); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + seriesToFrame(s); + setAxisSeries(s, newIdx); + setAxisDataFrame(df, newIdx, 0); + setAxisDataFrame(df, newCols, 1); + addPrefixSeries(s, "pre_"); + addSuffixSeries(s, "_suf"); +} +const total_ms = performance.now() - start; + +console.log( + JSON.stringify({ + function: "series_setaxis_toframe", + mean_ms: total_ms / ITERATIONS, + iterations: ITERATIONS, + total_ms, + }), +); diff --git a/benchmarks/tsb/bench_series_to_markdown.ts b/benchmarks/tsb/bench_series_to_markdown.ts new file mode 100644 index 00000000..bcaffa46 --- /dev/null +++ b/benchmarks/tsb/bench_series_to_markdown.ts @@ -0,0 +1,40 @@ +/** + * Benchmark: seriesToMarkdown and seriesToLaTeX on a 500-element numeric Series. + * + * The existing `to_markdown` benchmark covers DataFrames only. 
+ * This benchmark exercises the Series variants: seriesToMarkdown / seriesToLaTeX. + * Mirrors pandas Series.to_markdown() and Series.to_latex(). + * + * Outputs JSON: {"function": "series_to_markdown", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, seriesToMarkdown, seriesToLaTeX } from "../../src/index.ts"; + +const SIZE = 500; +const WARMUP = 5; +const ITERATIONS = 50; + +const s = new Series({ + data: Array.from({ length: SIZE }, (_, i) => (i * 1.7) % 100), + name: "values", +}); + +for (let i = 0; i < WARMUP; i++) { + seriesToMarkdown(s); + seriesToLaTeX(s); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + seriesToMarkdown(s); + seriesToLaTeX(s); +} +const total_ms = performance.now() - start; + +console.log( + JSON.stringify({ + function: "series_to_markdown", + mean_ms: total_ms / ITERATIONS, + iterations: ITERATIONS, + total_ms, + }), +); diff --git a/benchmarks/tsb/bench_shift_diff.ts b/benchmarks/tsb/bench_shift_diff.ts new file mode 100644 index 00000000..49a8ae4a --- /dev/null +++ b/benchmarks/tsb/bench_shift_diff.ts @@ -0,0 +1,32 @@ +/** + * Benchmark: shiftSeries and diffSeries on 100k-element Series + */ +import { Series, shiftSeries, diffSeries } from "../../src/index.ts"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Array.from({ length: ROWS }, (_, i) => i * 1.5); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + shiftSeries(s, 1); + diffSeries(s, 1); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + shiftSeries(s, 1); + diffSeries(s, 1); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "shift_diff", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_sort_ops.ts b/benchmarks/tsb/bench_sort_ops.ts new file mode 100644 index 00000000..684f1b6e --- /dev/null +++ 
b/benchmarks/tsb/bench_sort_ops.ts @@ -0,0 +1,38 @@ +/** + * Benchmark: sortValuesSeries and sortValuesDataFrame on 100k rows + */ +import { Series, DataFrame, sortValuesSeries, sortValuesDataFrame } from "../../src/index.ts"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Array.from({ length: ROWS }, (_, i) => Math.sin(i) * 1000); +const s = new Series({ data }); + +const dfData = { + a: Array.from({ length: ROWS }, (_, i) => Math.sin(i) * 1000), + b: Array.from({ length: ROWS }, (_, i) => Math.cos(i) * 500), +}; +const df = new DataFrame(dfData); + +for (let i = 0; i < WARMUP; i++) { + sortValuesSeries(s); + sortValuesDataFrame(df, "a"); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + sortValuesSeries(s); + sortValuesDataFrame(df, "a"); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "sort_ops", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_str_findall_expand.ts b/benchmarks/tsb/bench_str_findall_expand.ts new file mode 100644 index 00000000..4b4e5deb --- /dev/null +++ b/benchmarks/tsb/bench_str_findall_expand.ts @@ -0,0 +1,42 @@ +/** + * Benchmark: strFindallExpand on a 5k-element string Series. + * + * Mirrors pandas Series.str.extract() with named capture groups. + * Each non-null string has the form "user42 score88 level3" so the regex + * captures four named groups: the word, its number, the score, and the level. + */ +import { Series, strFindallExpand } from "../../src/index.ts"; +import type { Scalar } from "../../src/types.ts"; + +const N = 5_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const data: Scalar[] = Array.from( + { length: N }, + (_, i) => (i % 20 === 0 ? 
null : `user${i} score${(i * 7) % 100} level${(i % 5) + 1}`), +); +const s = new Series({ data }); + +// Named capture-group pattern: extract word, number, score, and level +const pat = /(?<word>[a-z]+)(?<number>\d+)\s+score(?<score>\d+)\s+level(?<level>\d+)/; + +// Warm-up +for (let i = 0; i < WARMUP; i++) { + strFindallExpand(s, pat); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + strFindallExpand(s, pat); +} +const total_ms = performance.now() - start; + +console.log( + JSON.stringify({ + function: "str_findall_expand", + mean_ms: total_ms / ITERATIONS, + iterations: ITERATIONS, + total_ms, + }), +); diff --git a/benchmarks/tsb/bench_styler_format.ts b/benchmarks/tsb/bench_styler_format.ts new file mode 100644 index 00000000..294fd472 --- /dev/null +++ b/benchmarks/tsb/bench_styler_format.ts @@ -0,0 +1,52 @@ +/** + * Benchmark: Styler.format / apply / applymap / toHtml — Styler formatting chain. + * + * Covers Styler methods not included in bench_styler: + * - format(fn) → pandas `df.style.format(fn)` + * - formatIndex(fn) → pandas `df.style.format_index(fn)` (pandas 1.4+) + * - apply(fn) → pandas `df.style.apply(fn)` + * - applymap(fn) → pandas `df.style.applymap(fn)` / `map(fn)` (pandas 2.1+) + * - toHtml() → pandas `df.style.to_html()` + * + * Outputs JSON: {"function": "styler_format", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, dataFrameStyle } from "../../src/index.ts"; + +const ROWS = 100; +const WARMUP = 3; +const ITERATIONS = 20; + +const df = DataFrame.fromColumns({ + a: Float64Array.from({ length: ROWS }, (_, i) => i * 1.5), + b: Float64Array.from({ length: ROWS }, (_, i) => (ROWS - i) * 2.0), + c: Float64Array.from({ length: ROWS }, (_, i) => Math.sin(i / 10) * 50 + 50), +}); + +for (let i = 0; i < WARMUP; i++) { + dataFrameStyle(df) + .format((v) => (typeof v === "number" ? 
v.toFixed(2) : String(v))) + .formatIndex((v) => `r${String(v)}`) + .apply((vals) => vals.map(() => "color: navy")) + .applymap((v) => (typeof v === "number" && (v as number) > 50 ? "font-weight: bold" : "")) + .toHtml(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + dataFrameStyle(df) + .format((v) => (typeof v === "number" ? v.toFixed(2) : String(v))) + .formatIndex((v) => `r${String(v)}`) + .apply((vals) => vals.map(() => "color: navy")) + .applymap((v) => (typeof v === "number" && (v as number) > 50 ? "font-weight: bold" : "")) + .toHtml(); +} +const total_ms = performance.now() - start; + +console.log( + JSON.stringify({ + function: "styler_format", + mean_ms: total_ms / ITERATIONS, + iterations: ITERATIONS, + total_ms, + }), +); diff --git a/benchmarks/tsb/bench_styler_highlight_adv.ts b/benchmarks/tsb/bench_styler_highlight_adv.ts new file mode 100644 index 00000000..1e848c26 --- /dev/null +++ b/benchmarks/tsb/bench_styler_highlight_adv.ts @@ -0,0 +1,56 @@ +/** + * Benchmark: Styler advanced highlighting — highlightNull / highlightBetween / + * textGradient / barChart / setCaption / toLatex. + * + * Covers Styler methods not included in bench_styler: + * - highlightNull() → pandas `df.style.highlight_null()` + * - highlightBetween() → pandas `df.style.highlight_between()` + * - textGradient() → pandas `df.style.text_gradient()` + * - barChart() → pandas `df.style.bar()` + * - setCaption(caption) → pandas `df.style.set_caption(caption)` + * - toLatex() → pandas `df.style.to_latex()` + * + * Outputs JSON: {"function": "styler_highlight_adv", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, dataFrameStyle } from "../../src/index.ts"; + +const ROWS = 100; +const WARMUP = 3; +const ITERATIONS = 20; + +const df = DataFrame.fromColumns({ + a: Float64Array.from({ length: ROWS }, (_, i) => i * 1.0), + b: Array.from({ length: ROWS }, (_, i): number | null => (i % 10 === 0 ? 
null : i * 2.0)), + c: Float64Array.from({ length: ROWS }, (_, i) => Math.sin(i / 10) * 50 + 50), +}); + +for (let i = 0; i < WARMUP; i++) { + dataFrameStyle(df) + .highlightNull("red") + .highlightBetween({ left: 20, right: 80, color: "lightyellow" }) + .textGradient({ cmap: "Blues" }) + .barChart({ align: "mid", color: "#aec6cf" }) + .setCaption("Benchmark Table") + .toLatex(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + dataFrameStyle(df) + .highlightNull("red") + .highlightBetween({ left: 20, right: 80, color: "lightyellow" }) + .textGradient({ cmap: "Blues" }) + .barChart({ align: "mid", color: "#aec6cf" }) + .setCaption("Benchmark Table") + .toLatex(); +} +const total_ms = performance.now() - start; + +console.log( + JSON.stringify({ + function: "styler_highlight_adv", + mean_ms: total_ms / ITERATIONS, + iterations: ITERATIONS, + total_ms, + }), +); diff --git a/benchmarks/tsb/bench_styler_table_props.ts b/benchmarks/tsb/bench_styler_table_props.ts new file mode 100644 index 00000000..7ade8b2b --- /dev/null +++ b/benchmarks/tsb/bench_styler_table_props.ts @@ -0,0 +1,59 @@ +/** + * Benchmark: Styler table-level configuration — setProperties / setTableStyles / + * setTableAttributes / hide / setPrecision / setNaRep / clearStyles / toHtml. 
+ * + * Covers Styler configuration methods not included in other styler benchmarks: + * - setPrecision(n) → pandas `df.style.set_precision(n)` + * - setNaRep(s) → pandas `df.style.set_na_rep(s)` + * - setProperties(props,subset) → pandas `df.style.set_properties(subset=…)` + * - setTableStyles(styles) → pandas `df.style.set_table_styles()` + * - setTableAttributes(attrs) → pandas `df.style.set_table_attributes()` + * - hide(0) → pandas `df.style.hide(axis="index")` + * - hide(1, subset) → pandas `df.style.hide(subset=…, axis="columns")` + * - clearStyles() → pandas `df.style.clear()` + * - toHtml() → pandas `df.style.to_html()` + * + * Outputs JSON: {"function": "styler_table_props", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, dataFrameStyle } from "../../src/index.ts"; + +const ROWS = 100; +const WARMUP = 3; +const ITERATIONS = 20; + +const df = DataFrame.fromColumns({ + a: Float64Array.from({ length: ROWS }, (_, i) => i * 1.5), + b: Array.from({ length: ROWS }, (_, i): number | null => (i % 10 === 0 ? 
null : i * 2.0)), + c: Float64Array.from({ length: ROWS }, (_, i) => Math.sin(i / 10) * 50 + 50), +}); + +function run(): void { + dataFrameStyle(df) + .setPrecision(3) + .setNaRep("—") + .setProperties({ "font-size": "12px", color: "navy" }, ["a", "b"]) + .setTableStyles([ + { selector: "th", props: { "background-color": "#4a90d9", color: "white" } }, + { selector: "tr:nth-child(even) td", props: { "background-color": "#f5f5f5" } }, + ]) + .setTableAttributes('class="data-table" id="bench-table"') + .hide(0) + .hide(1, ["c"]) + .clearStyles() + .toHtml(); +} + +for (let i = 0; i < WARMUP; i++) run(); + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) run(); +const total_ms = performance.now() - start; + +console.log( + JSON.stringify({ + function: "styler_table_props", + mean_ms: total_ms / ITERATIONS, + iterations: ITERATIONS, + total_ms, + }), +); diff --git a/benchmarks/tsb/bench_to_json_denormalize.ts b/benchmarks/tsb/bench_to_json_denormalize.ts new file mode 100644 index 00000000..07a42f5f --- /dev/null +++ b/benchmarks/tsb/bench_to_json_denormalize.ts @@ -0,0 +1,42 @@ +/** + * Benchmark: to_json_denormalize — toJsonDenormalize / toJsonRecords / toJsonSplit / toJsonIndex + * Outputs JSON: {"function": "to_json_denormalize", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, toJsonDenormalize, toJsonRecords, toJsonSplit, toJsonIndex } from "../../src/index.ts"; + +const ROWS = 10_000; +const WARMUP = 5; +const ITERATIONS = 30; + +// Create a nested-structure-like DataFrame (address.city, address.zip pattern) +const df = DataFrame.fromColumns({ + "name": Array.from({ length: ROWS }, (_, i) => `user_${i}`), + "address.city": Array.from({ length: ROWS }, (_, i) => `city_${i % 100}`), + "address.zip": Array.from({ length: ROWS }, (_, i) => `${10000 + (i % 9000)}`), + "score": Float64Array.from({ length: ROWS }, (_, i) => i * 0.01), +}); + +for (let i = 0; i < WARMUP; i++) { + toJsonDenormalize(df); + 
toJsonRecords(df); + toJsonSplit(df); + toJsonIndex(df); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + toJsonDenormalize(df); + toJsonRecords(df); + toJsonSplit(df); + toJsonIndex(df); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "to_json_denormalize", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_to_latex.ts b/benchmarks/tsb/bench_to_latex.ts new file mode 100644 index 00000000..02c59842 --- /dev/null +++ b/benchmarks/tsb/bench_to_latex.ts @@ -0,0 +1,45 @@ +/** + * Benchmark: toLaTeX / seriesToLaTeX — render DataFrame/Series to LaTeX tabular format. + * + * Mirrors pandas: + * - `DataFrame.to_latex()` → tsb `toLaTeX(df)` + * - `Series.to_latex()` → tsb `seriesToLaTeX(s)` + * + * Outputs JSON: {"function": "to_latex", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, Series, toLaTeX, seriesToLaTeX } from "../../src/index.ts"; + +const ROWS = 500; +const WARMUP = 5; +const ITERATIONS = 100; + +const df = DataFrame.fromColumns({ + name: Array.from({ length: ROWS }, (_, i) => `item_${i}`), + value: Float64Array.from({ length: ROWS }, (_, i) => i * 1.23), + count: Float64Array.from({ length: ROWS }, (_, i) => i), +}); + +const s = new Series({ data: Float64Array.from({ length: ROWS }, (_, i) => i * 0.5) }); + +for (let i = 0; i < WARMUP; i++) { + toLaTeX(df); + toLaTeX(df, { index: false, booktabs: true }); + seriesToLaTeX(s); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + toLaTeX(df); + toLaTeX(df, { index: false, booktabs: true }); + seriesToLaTeX(s); +} +const total_ms = performance.now() - start; + +console.log( + JSON.stringify({ + function: "to_latex", + mean_ms: total_ms / ITERATIONS, + iterations: ITERATIONS, + total_ms, + }), +); diff --git a/benchmarks/tsb/bench_truncate_df.ts b/benchmarks/tsb/bench_truncate_df.ts new file mode 100644 index 
00000000..f2661ce0 --- /dev/null +++ b/benchmarks/tsb/bench_truncate_df.ts @@ -0,0 +1,35 @@ +/** + * Benchmark: truncateDataFrame — slice rows by before/after labels on 100k-row DataFrame + * Outputs JSON: {"function": "truncate_df", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, DataFrame, truncateDataFrame } from "../../src/index.ts"; + +const N = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const index = Array.from({ length: N }, (_, i) => i); +const a = Array.from({ length: N }, (_, i) => i * 1.0); +const b = Array.from({ length: N }, (_, i) => i * 2.0); +const c = Array.from({ length: N }, (_, i) => i * 3.0); + +const df = DataFrame.fromColumns({ a, b, c }, { index }); + +for (let i = 0; i < WARMUP; i++) { + truncateDataFrame(df, 10_000, 90_000); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + truncateDataFrame(df, 10_000, 90_000); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "truncate_df", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_value_counts_full.ts b/benchmarks/tsb/bench_value_counts_full.ts new file mode 100644 index 00000000..d55b5b72 --- /dev/null +++ b/benchmarks/tsb/bench_value_counts_full.ts @@ -0,0 +1,32 @@ +/** + * Benchmark: value_counts_full — valueCountsBinned on 100k rows. 
+ * Outputs JSON: {"function": "value_counts_full", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, valueCountsBinned } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const s = new Series({ data: Array.from({ length: SIZE }, () => Math.random() * 100) }); + +for (let i = 0; i < WARMUP; i++) { + valueCountsBinned(s, { bins: 10 }); + valueCountsBinned(s, { bins: 20 }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + valueCountsBinned(s, { bins: 10 }); + valueCountsBinned(s, { bins: 20 }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "value_counts_full", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_window_extended.ts b/benchmarks/tsb/bench_window_extended.ts new file mode 100644 index 00000000..a4b933cb --- /dev/null +++ b/benchmarks/tsb/bench_window_extended.ts @@ -0,0 +1,37 @@ +/** + * Benchmark: window_extended — rollingSem / rollingSkew / rollingKurt / rollingQuantile on 100k rows. 
+ * Outputs JSON: {"function": "window_extended", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, rollingSem, rollingSkew, rollingKurt, rollingQuantile } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 3; +const ITERATIONS = 20; +const WINDOW = 10; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => Math.sin(i / 100) * 100 + i * 0.001) }); + +for (let i = 0; i < WARMUP; i++) { + rollingSem(s, WINDOW); + rollingSkew(s, WINDOW); + rollingKurt(s, WINDOW); + rollingQuantile(s, WINDOW, 0.5); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + rollingSem(s, WINDOW); + rollingSkew(s, WINDOW); + rollingKurt(s, WINDOW); + rollingQuantile(s, WINDOW, 0.5); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "window_extended", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_window_indexers.ts b/benchmarks/tsb/bench_window_indexers.ts new file mode 100644 index 00000000..1eef8d23 --- /dev/null +++ b/benchmarks/tsb/bench_window_indexers.ts @@ -0,0 +1,50 @@ +/** + * Benchmark: FixedForwardWindowIndexer, VariableOffsetWindowIndexer, applyIndexer. + * + * Mirrors pandas.api.indexers.FixedForwardWindowIndexer and + * pandas.api.indexers.VariableOffsetWindowIndexer. + * + * Uses a 50k-row dataset. Each iteration: + * - Generates bounds via FixedForwardWindowIndexer (window=5) on 50k rows. + * - Generates bounds via VariableOffsetWindowIndexer with deterministic per-row offsets cycling through 1–10. + * - Applies applyIndexer with FixedForwardWindowIndexer to compute rolling sum. 
+ * + * Outputs JSON: {"function": "window_indexers", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { + FixedForwardWindowIndexer, + VariableOffsetWindowIndexer, + applyIndexer, +} from "../../src/index.ts"; + +const SIZE = 50_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const fwdIdx = new FixedForwardWindowIndexer({ windowSize: 5 }); +const offsets = Array.from({ length: SIZE }, (_, i) => (i % 10) + 1); +const varIdx = new VariableOffsetWindowIndexer({ offsets }); +const values = Array.from({ length: SIZE }, (_, i) => (i * 0.1) % 100); + +for (let i = 0; i < WARMUP; i++) { + fwdIdx.getWindowBounds(SIZE); + varIdx.getWindowBounds(SIZE); + applyIndexer(fwdIdx, values, (nums) => nums.reduce((a, b) => a + b, 0)); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + fwdIdx.getWindowBounds(SIZE); + varIdx.getWindowBounds(SIZE); + applyIndexer(fwdIdx, values, (nums) => nums.reduce((a, b) => a + b, 0)); +} +const total_ms = performance.now() - start; + +console.log( + JSON.stringify({ + function: "window_indexers", + mean_ms: total_ms / ITERATIONS, + iterations: ITERATIONS, + total_ms, + }), +); diff --git a/benchmarks/tsb/bench_xs_series.ts b/benchmarks/tsb/bench_xs_series.ts new file mode 100644 index 00000000..cb630e72 --- /dev/null +++ b/benchmarks/tsb/bench_xs_series.ts @@ -0,0 +1,46 @@ +/** + * Benchmark: xsSeries — cross-section lookup on Series. + * + * Mirrors pandas `Series.xs()`. + * Tests flat-index lookup (returns scalar) and MultiIndex lookup (returns sub-Series). + * Outputs JSON: {"function": "xs_series", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, MultiIndex, xsSeries } from "../../src/index.ts"; + +const N = 1_000; +const WARMUP = 10; +const ITERATIONS = 5_000; + +// Flat-index Series: each key appears once → xsSeries returns a scalar. 
+const flatData = Array.from({ length: N }, (_, i) => i * 1.5); +const flatIdx = Array.from({ length: N }, (_, i) => `k${i}`); +const flatSeries = new Series({ data: flatData, index: flatIdx, name: "flat" }); + +// MultiIndex Series: 10 outer keys × 100 inner keys → xsSeries returns a sub-Series (100 rows). +const outerKeys = Array.from({ length: N }, (_, i) => `g${Math.floor(i / 100)}`); +const innerKeys = Array.from({ length: N }, (_, i) => i % 100); +const multiIdx = MultiIndex.fromArrays([outerKeys, innerKeys], { names: ["outer", "inner"] }); +const multiData = Array.from({ length: N }, (_, i) => i * 2.0); +const multiSeries = new Series({ data: multiData, index: multiIdx, name: "multi" }); + +// Warm-up +for (let i = 0; i < WARMUP; i++) { + xsSeries(flatSeries, `k${i % N}`); + xsSeries(multiSeries, `g${i % 10}`); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + xsSeries(flatSeries, `k${i % N}`); + xsSeries(multiSeries, `g${i % 10}`); +} +const total_ms = performance.now() - start; + +console.log( + JSON.stringify({ + function: "xs_series", + mean_ms: total_ms / ITERATIONS, + iterations: ITERATIONS, + total_ms: total_ms, + }), +);