逐行遍历 DataFrame(一般来说)是一种反模式。只要有可能,就应尽量避免!
您可以通过减去 `target` 中的标量值(而不是把 `target` 当作另一个数组来处理),轻松地将此操作向量化:
# Sum of squared errors for every row at once, built column by column.
# `.values[0]` picks the first value of each `target` column, so the
# subtraction broadcasts that scalar over the whole `combopd` column.
combopd["SSE"] = sum(
    (target[column].values[0] - combopd[column]) ** 2
    for column in map('x{}'.format, range(1, 9))  # 'x1', 'x2', ... 'x8'
)
在 1 万行数据上的耗时对比显示出非常显著的加速 :)
import pandas as pd
import numpy as np
import timeit
# Number of rows in the benchmark frame.
ROW_COUNT = 10_000

# Benchmark data: ROW_COUNT rows of uniform random floats in columns x1..x8.
combopd = pd.DataFrame(
    data=np.random.random((ROW_COUNT, 8)),
    columns=['x' + str(n) for n in range(1, 9)],
)

# Single-row frame with the same columns; every row of `combopd` is
# compared against it.
target = pd.DataFrame(
    data=np.random.random((1, 8)),
    columns=list(combopd.columns),
)
def vectorized():
    """Return the per-row sum of squared errors between `combopd` and `target`.

    Works on whole columns: for each of x1..x8 the first value of the
    `target` column is subtracted from the full `combopd` column, the
    difference is squared, and the eight results are added together.
    Reads the module-level `combopd` and `target`; modifies neither.
    """
    total = 0  # same starting value the built-in sum() uses
    for n in range(1, 9):
        column = f'x{n}'
        deviation = target[column].values[0] - combopd[column]
        total = total + deviation ** 2
    return total
def original():
    """Row-by-row baseline: compute each row's SSE and store it in `combopd`.

    Visits `combopd` one row at a time and adds up the squared differences
    to `target` over columns x1..x8. The result is written into the 'SSE'
    column of the module-level `combopd` (created on first write), and that
    column is returned. This is the slow implementation that `vectorized`
    is compared against.
    """
    for position in range(len(combopd)):
        current = combopd.iloc[position]
        # Each term is a one-element Series: target column minus a scalar, squared.
        running = (target["x1"] - current["x1"]) ** 2
        for n in range(2, 9):
            running = running + (target[f"x{n}"] - current[f"x{n}"]) ** 2
        # `current.name` is the row's index label, which `.at` expects.
        combopd.at[current.name, 'SSE'] = running.values[0]
    return combopd['SSE']
# Sanity check before timing: both implementations must produce the same values.
assert np.array_equal(vectorized(), original())

# Time 10 calls of each implementation; globals() lets timeit resolve the
# function names used in the statement strings.
vectorized_time = timeit.Timer('vectorized()', globals=globals()).timeit(number=10)
original_time = timeit.Timer('original()', globals=globals()).timeit(number=10)

print(
    f'Vectorized time: {vectorized_time:0.4f}s'
    f'\nOriginal time: {original_time:0.4f}s'
)
# Sample output from one run:
# Vectorized time: 0.0230s
# Original time: 192.0022s
```