k-means clustering converges to k-1 centers

The main purpose of this post is to share a dataset and the code that goes with it. When the first 4 points of this dataset are used as the initial cluster centers, k-means converges to only 3 cluster centers; with different initial centers this does not happen. I also generated large amounts of random data hoping to estimate how often this situation occurs, but never once managed to reproduce it.
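
This kind of collapse can happen in plain Lloyd-style k-means whenever an assignment step leaves some center with no nearest points: that cluster becomes empty and, unless the implementation reseeds it, it can stay empty forever. Below is a minimal hand-built sketch of the mechanism (the numbers are invented purely for illustration and are not from the shared dataset, where the cluster only empties after the centers have moved):

import numpy as np

# Three 1-D points and k = 2 centers chosen so that, after one
# assignment step, every point is nearest to center 0.
data = np.array([0.0, 0.1, 0.2])
centers = np.array([0.1, 5.0])  # center 1 starts far from all points

dist = np.abs(data[:, None] - centers[None, :])  # (n, k) distances
belong = dist.argmin(axis=1)
print(belong)  # [0 0 0] -> cluster 1 received no points

# Recomputing centers as per-cluster means now has nothing to average
# for cluster 1; an implementation that maps empty clusters to a default
# (e.g. the zero vector) has effectively collapsed to k-1 centers.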

Data download

https://github.com/UesugiErii/uesugierii.github.io/tree/master/code/54/data

Clustering code

The script below runs a batched Lloyd-style k-means in TensorFlow: label 0 is reserved for padded positions, each valid item is assigned to its nearest center by squared Euclidean distance, and tf.math.unsorted_segment_mean recomputes the centers.

import pickle
import numpy as np
import os

os.environ["CUDA_VISIBLE_DEVICES"] = '0'
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "true"
import tensorflow as tf

with open('vector.pkl', 'rb') as f:
    data = pickle.load(f)

print(data.shape)  # (20, 64)

data = data[None, :, :]
k = 4  # number of cluster centers in the final result
n = 20  # data length
mask = np.ones((1, 20), dtype=np.float32)
mask[0, 9:] = 0  # only the first 9 items are valid; the rest are padding

data = tf.multiply(data, mask[:, :, None])

num_segments = 1 + k  # number of segments; the extra leading one, with value 0, is for padding (the positions zeroed by the mask)
centers = data[:, :k, :]  # bs, k, dim

# belong records, for each row, which cluster each item belongs to
last_belong = tf.zeros((data.shape[0], n), dtype=tf.int32)
belong = tf.ones((data.shape[0], n), dtype=tf.int32)

iter_n = 0

while tf.reduce_any(tf.not_equal(last_belong, belong)):
    # squared Euclidean distance from every item to every center: (bs, n, k)
    distance = tf.reduce_sum(
        tf.math.square(
            tf.tile(data[:, :, None, :], [1, 1, k, 1])
            -
            tf.tile(centers[:, None, :, :], [1, n, 1, 1])
        ),
        axis=-1
    )

    last_belong = belong

    # assign each item to its nearest center (labels 1..k); padding stays 0
    belong = tf.argmin(distance, axis=-1, output_type=tf.dtypes.int32) + 1
    belong = tf.multiply(belong, tf.cast(mask, belong.dtype))

    # offset labels per row so every (row, cluster) pair gets a unique segment id
    num_rows = tf.shape(belong)[0]
    rows_idx = tf.range(num_rows)
    segment_ids_per_row = belong + num_segments * tf.expand_dims(rows_idx, axis=1)

    # recompute centers as per-cluster means, then drop segment 0 (padding)
    centers = tf.math.unsorted_segment_mean(data, segment_ids_per_row, num_segments * num_rows)
    centers = tf.reshape(centers, (num_rows, num_segments, data.shape[-1]))

    centers = centers[:, 1:, :]

    iter_n += 1

print(belong)

# belong at successive iterations; label 2 disappears after the first assignment step:
# tf.Tensor([[1 2 3 4 2 2 1 2 4 0 0 0 0 0 0 0 0 0 0 0]], shape=(1, 20), dtype=int32)
# tf.Tensor([[1 3 3 1 3 4 1 4 4 0 0 0 0 0 0 0 0 0 0 0]], shape=(1, 20), dtype=int32)
# tf.Tensor([[1 3 3 1 3 4 1 4 4 0 0 0 0 0 0 0 0 0 0 0]], shape=(1, 20), dtype=int32)
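
One detail worth noting about the implementation above (my reading, not something the outputs alone prove): tf.math.unsorted_segment_mean outputs all zeros for a segment id that receives no entries, so once a cluster empties, its center is silently reset to the zero vector instead of being reseeded, which makes it easy for that cluster to stay empty. A standalone check of that behavior:

import tensorflow as tf

x = tf.constant([[1.0, 1.0],
                 [3.0, 3.0]])
seg = tf.constant([0, 0])  # both rows go to segment 0; segment 1 gets nothing

means = tf.math.unsorted_segment_mean(x, seg, num_segments=2)
print(means.numpy())
# [[2. 2.]   <- mean of the two rows
#  [0. 0.]]  <- the empty segment is filled with zeros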

Estimating the frequency

I used the code below to generate a large amount of random data, hoping to estimate the probability of the situation above, but it never occurred even once.

# Randomly generate data to estimate the probability that clustering
# ends with fewer centers than the target
# Accelerated version of 3.py
import numpy as np
import os

os.environ["CUDA_VISIBLE_DEVICES"] = '0'
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "true"
import tensorflow as tf

k = 4  # number of cluster centers in the final result
n = 20  # data length
batch_size = 20480


@tf.function
def f():
    data = tf.random.normal(shape=(batch_size, n, 64), mean=0.0, stddev=0.01, dtype=tf.dtypes.float32)

    # random valid length shared by the whole batch; items beyond it are padding
    len_ = tf.cast(tf.experimental.numpy.random.randint(7, n, dtype=tf.experimental.numpy.int32), tf.dtypes.int32)
    mask1 = tf.ones((batch_size, len_), dtype=np.float32)
    mask0 = tf.zeros((batch_size, n - len_), dtype=np.float32)
    mask = tf.concat([mask1, mask0], axis=-1)

    data = tf.multiply(data, mask[:, :, None])

    num_segments = 1 + k  # number of segments; the extra leading one, with value 0, is for padding (the positions zeroed by mask0)
    centers = data[:, :k, :]  # bs, k, dim

    # belong records, for each row, which cluster each item belongs to
    last_belong = tf.zeros((data.shape[0], n), dtype=tf.int32)
    belong = tf.ones((data.shape[0], n), dtype=tf.int32)

    iter_n = 0

    while tf.reduce_any(tf.not_equal(last_belong, belong)):
        distance = tf.reduce_sum(
            tf.math.square(
                tf.tile(data[:, :, None, :], [1, 1, k, 1])
                -
                tf.tile(centers[:, None, :, :], [1, n, 1, 1])
            ),
            axis=-1
        )

        last_belong = belong

        belong = tf.argmin(distance, axis=-1, output_type=tf.dtypes.int32) + 1
        belong = tf.multiply(belong, tf.cast(mask, belong.dtype))

        num_rows = tf.shape(belong)[0]
        rows_idx = tf.range(num_rows)
        segment_ids_per_row = belong + num_segments * tf.expand_dims(rows_idx, axis=1)

        centers = tf.math.unsorted_segment_mean(data, segment_ids_per_row, num_segments * num_rows)
        centers = tf.reshape(centers, (num_rows, num_segments, data.shape[-1]))

        centers = centers[:, 1:, :]

        iter_n += 1

    # print(belong)

    belong = tf.stop_gradient(belong)

    found = 0

    # count (row, label) pairs where label ni never appears in the row,
    # i.e. rows whose clustering collapsed below k centers
    for ni in range(1, k + 1):
        local_mask = tf.equal(belong, ni)
        found += batch_size - tf.math.reduce_sum(
            tf.cast(tf.math.reduce_any(local_mask, axis=1), tf.int32),
            axis=0
        )

    return found


total = 0
total_found = 0

while 1:
    total += 1
    total_found += f()
    if total % 100 == 0:
        print(total_found, total)
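
For a single row, the per-label loop above boils down to checking which labels appear at all. For example, applied to the final assignment from the shared dataset (label 0 is padding, so a collapsed row is one missing some label in 1..k):

import numpy as np

belong_row = np.array([1, 3, 3, 1, 3, 4, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
k = 4

present = set(np.unique(belong_row).tolist()) - {0}  # drop the padding label
missing = set(range(1, k + 1)) - present
print(missing)  # {2} -> this row converged to only k-1 = 3 centers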

Visualization

The script below exports the nine valid vectors to a TensorBoard Embedding Projector checkpoint so they can be inspected visually.

import os
import tensorflow as tf
from tensorboard.plugins import projector
import pickle

with open('vector.pkl', 'rb') as f:
    d = pickle.load(f)
    d = d[:9, :]  # keep only the 9 valid (unpadded) vectors

log_dir = '/home/zx/workspace/tfb'

if not os.path.exists(log_dir):
    os.makedirs(log_dir)

# Save labels separately in a line-by-line manner.
with open(os.path.join(log_dir, 'metadata.tsv'), "w") as f:
    for i in range(d.shape[0]):
        f.write(f"{i}\n")

weights = tf.Variable(initial_value=d)

# Create a checkpoint from the embedding; the filename and key are the
# name of the tensor.
checkpoint = tf.train.Checkpoint(embedding=weights)
checkpoint.save(os.path.join(log_dir, "embedding.ckpt"))

# Set up config.
config = projector.ProjectorConfig()
embedding = config.embeddings.add()
# The name of the tensor will be suffixed by `/.ATTRIBUTES/VARIABLE_VALUE`.
embedding.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
embedding.metadata_path = 'metadata.tsv'
projector.visualize_embeddings(log_dir, config)

# tensorboard --host 0.0.0.0 --logdir /home/zx/workspace/tfb
# http://{your_ip}:6006/#projector