def target_mean_v1(data, y_name, x_name):
    """Return the leave-one-out target mean for every row of *data*.

    For each row, the result is the mean of column ``y_name`` over all OTHER
    rows that share the row's value in column ``x_name`` (the row itself is
    excluded from its own category mean).

    Args:
        data: pandas DataFrame containing both columns. Any index is
            accepted; rows are processed by position.
        y_name: name of the numeric target column.
        x_name: name of the categorical column used for grouping.

    Returns:
        A 1-D float ``numpy.ndarray`` with one entry per row, in row order.
        An entry is NaN when no other row with a usable target value shares
        the row's category.
    """
    if len(data) == 0:
        return np.zeros(0)

    # One groupby for the whole frame instead of one per row: per-category
    # totals are broadcast back to row positions by ``transform``.
    per_category = data.groupby(x_name)[y_name]
    category_sum = per_category.transform("sum").to_numpy(dtype=float)
    category_count = per_category.transform("count").to_numpy(dtype=float)

    y = data[y_name].to_numpy(dtype=float)
    # "sum" and "count" both skip missing targets, so a row with a missing
    # target contributed nothing and must not be subtracted.
    has_target = ~np.isnan(y)
    others_sum = category_sum - np.where(has_target, y, 0.0)
    others_count = category_count - has_target.astype(float)

    # 0 / 0 (no other rows in the category) is deliberately mapped to NaN.
    with np.errstate(divide="ignore", invalid="ignore"):
        return others_sum / others_count
同时给出数据集 data:
1 2 3
# Synthetic benchmark data: 5000 rows, one binary target and one categorical
# feature, stored as integer columns 'y' and 'x'.
y = np.random.randint(2, size=(5000, 1))  # target values drawn from {0, 1}
x = np.random.randint(10, size=(5000, 1))  # category ids drawn from 0..9
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])
其实只需先统计每个类别的样本数量和 y 值总和,计算时再剔除当前样本自身即可。基于这个核心思想,并参考一些代码可读性方面的建议,我们得到如下代码:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
def target_mean_v2(data, y_name, x_name):
    """Return the leave-one-out target mean for every row of *data*.

    First accumulates, per category of ``x_name``, the sum of ``y_name`` and
    the number of rows; then, for each row, removes that row's own
    contribution and divides by the remaining row count.

    Args:
        data: pandas DataFrame containing both columns. Any index is
            accepted; rows are processed by position.
        y_name: name of the numeric target column.
        x_name: name of the categorical column (values must be hashable).

    Returns:
        A 1-D float ``numpy.ndarray`` with one entry per row, in row order.
        An entry is NaN when the row is the only member of its category.
    """
    # Plain Python lists: positional access that does not depend on the
    # DataFrame's index labels, and much cheaper than per-element ``.loc``.
    x_values = data[x_name].tolist()
    y_values = data[y_name].tolist()
    result = np.zeros(len(x_values))

    # Pass 1: per-category running sum of the target and row count.
    sum_by_category = {}
    count_by_category = {}
    for x, y in zip(x_values, y_values):
        sum_by_category[x] = sum_by_category.get(x, 0) + y
        count_by_category[x] = count_by_category.get(x, 0) + 1

    # Pass 2: subtract each row's own contribution from its category totals.
    for i, (x, y) in enumerate(zip(x_values, y_values)):
        others = count_by_category[x] - 1
        if others:
            result[i] = (sum_by_category[x] - y) / others
        else:
            # No other row in this category: the mean is undefined.
            result[i] = np.nan
    return result
cdef Py_ssize_t i cdef Py_ssize_t x, y # 不得放到 for 里头用 cdef # cdef cnp.ndarray[Py_ssize_t] n_y = data[y_name].values # values faster than to_numpy # cdef cnp.ndarray[Py_ssize_t] n_x = data[x_name].values cdef Py_ssize_t[:] n_y = data[y_name].values # values faster than to_numpy cdef Py_ssize_t[:] n_x = data[x_name].values
cdef Py_ssize_t nums = n_x.shape[0] # 这个 2us 优化,一般 # cdef cnp.ndarray[double] result = np.zeros(nums) cdef float[:] result = np.zeros(nums, np.float32) for i from0 <= i < nums by 1: # for i in range(nums): x = n_x[i] y = n_y[i] it = type_2_sum_dict.find(x) if it != type_2_sum_dict.end(): type_2_sum_dict[x] += y type_2_count_dict[x] += 1 else: type_2_sum_dict[x] = y type_2_count_dict[x] = 1 # for i in range(nums): for i from0 <= i < nums by 1: x = n_x[i] y = n_y[i] result[i] = (type_2_sum_dict[x] - y) / (type_2_count_dict[x] - 1) return result