1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
def build_hash_vectorized(self, vec_val, vec_tmp1, vec_tmp2, round, i, hash_vec_consts):
    """Emit the vectorized hash pipeline as a list of instruction slots.

    vec_val: base address of the input/output vector register
    vec_tmp1, vec_tmp2: base addresses of vector temporaries
    round, i: current round / batch offset, used only to tag debug checks
    hash_vec_consts: dict keyed by stage index hi -> dict with keys
        'vec_const1', 'vec_const3' (pre-broadcast vector constants) and
        'op1', 'op2', 'op3' (the stage's ALU operators)

    Returns the list of slots; does not mutate self.
    """
    slots = []
    # Iterate the pre-built per-stage table directly instead of indexing
    # by range(len(HASH_STAGES)): same order (keys were inserted 0..n-1),
    # but no coupling to the module-level stage list.
    for hi, const_info in hash_vec_consts.items():
        op1 = const_info['op1']
        op2 = const_info['op2']
        op3 = const_info['op3']
        # One hash stage: val = op2(op1(val, c1), op3(val, c3))
        slots.append(("valu", (op1, vec_tmp1, vec_val, const_info['vec_const1'])))
        slots.append(("valu", (op3, vec_tmp2, vec_val, const_info['vec_const3'])))
        slots.append(("valu", (op2, vec_val, vec_tmp1, vec_tmp2)))
        # Per-lane debug verification (8 lanes = VLEN); debug slots are
        # ignored by the submission simulator.
        for vi in range(8):
            slots.append(("debug", ("compare", vec_val + vi, (round, i + vi, "hash_stage", hi))))
    return slots
def build_kernel(
    self, forest_height: int, n_nodes: int, batch_size: int, rounds: int
):
    """
    Like reference_kernel2 but building actual instructions.

    Vectorized implementation: each inner iteration processes VLEN inputs
    at once using the vector ALU and vector load/store; the gather of
    forest node values uses one offset load per lane.
    """
    tmp1 = self.alloc_scratch("tmp1")
    # tmp2/tmp3 are allocated to reserve their scratch slots even though
    # this vectorized path does not use them directly.
    tmp2 = self.alloc_scratch("tmp2")
    tmp3 = self.alloc_scratch("tmp3")
    # Scratch space addresses
    init_vars = [
        "rounds",
        "n_nodes",
        "batch_size",
        "forest_height",
        "forest_values_p",
        "inp_indices_p",
        "inp_values_p",
    ]
    for v in init_vars:
        self.alloc_scratch(v, 1)
    # Memory words 0..6 hold these header values, in order.
    for slot, v in enumerate(init_vars):
        self.add("load", ("const", tmp1, slot))
        self.add("load", ("load", self.scratch[v], tmp1))
    zero_const = self.scratch_const(0)
    one_const = self.scratch_const(1)
    two_const = self.scratch_const(2)
    # Pause instructions are matched up with yield statements in the reference
    # kernel to let you debug at intermediate steps. The testing harness in this
    # file requires these match up to the reference kernel's yields, but the
    # submission harness ignores them.
    self.add("flow", ("pause",))
    # Any debug engine instruction is ignored by the submission simulator
    self.add("debug", ("comment", "Starting loop"))
    body = []  # array of slots
    # Scalar temporary used for address computation.
    tmp_addr = self.alloc_scratch("tmp_addr_scalar")
    # Vector registers (each occupies 8 consecutive scratch addresses),
    # allocated once before the loop.
    vec_idx = self.alloc_scratch("vec_idx", length=8)
    vec_val = self.alloc_scratch("vec_val", length=8)
    vec_node_val = self.alloc_scratch("vec_node_val", length=8)
    vec_addr = self.alloc_scratch("vec_addr", length=8)
    # Vector temporaries used by the hash stages.
    vec_tmp1 = self.alloc_scratch("vec_tmp1", length=8)
    vec_tmp2 = self.alloc_scratch("vec_tmp2", length=8)
    vec_tmp3 = self.alloc_scratch("vec_tmp3", length=8)
    # Forest base pointer, broadcast for vectorized gather addressing.
    vec_forest_p = self.alloc_scratch("vec_forest_p", length=8)
    body.append(("valu", ("vbroadcast", vec_forest_p, self.scratch["forest_values_p"])))
    # Vector constants used by the index-update step.
    vec_two = self.alloc_scratch("vec_two", length=8)
    vec_zero = self.alloc_scratch("vec_zero", length=8)
    vec_one = self.alloc_scratch("vec_one", length=8)
    body.append(("valu", ("vbroadcast", vec_two, two_const)))
    body.append(("valu", ("vbroadcast", vec_zero, zero_const)))
    body.append(("valu", ("vbroadcast", vec_one, one_const)))
    # Vector copy of n_nodes for the bounds check.
    vec_n_nodes = self.alloc_scratch("vec_n_nodes", length=8)
    body.append(("valu", ("vbroadcast", vec_n_nodes, self.scratch["n_nodes"])))
    # Pre-broadcast the per-stage hash constants once, outside the loops.
    hash_vec_consts = {}
    for hi, (op1, val1, op2, op3, val3) in enumerate(HASH_STAGES):
        # Vector space for this stage's two constants.
        vec_const1 = self.alloc_scratch(f"hash_c1_{hi}", length=8)
        vec_const3 = self.alloc_scratch(f"hash_c3_{hi}", length=8)
        # Scalar constants, then broadcast to all lanes.
        const1_scalar = self.scratch_const(val1)
        const3_scalar = self.scratch_const(val3)
        body.append(("valu", ("vbroadcast", vec_const1, const1_scalar)))
        body.append(("valu", ("vbroadcast", vec_const3, const3_scalar)))
        hash_vec_consts[hi] = {
            'vec_const1': vec_const1,
            'vec_const3': vec_const3,
            'op1': op1,
            'op2': op2,
            'op3': op3,
        }
    # `rnd` rather than `round` to avoid shadowing the builtin; debug tags
    # below carry exactly the same values as before.
    for rnd in range(rounds):
        for i in range(0, batch_size, VLEN):
            i_const = self.scratch_const(i)
            # 1. Load indices:
            #    vec_idx[0..VLEN-1] = mem[inp_indices_p+i : inp_indices_p+i+VLEN]
            body.append(("alu", ("+", tmp_addr, self.scratch["inp_indices_p"], i_const)))
            body.append(("load", ("vload", vec_idx, tmp_addr)))
            for vi in range(VLEN):
                body.append(("debug", ("compare", vec_idx + vi, (rnd, i + vi, "idx"))))
            # 2. Load values:
            #    vec_val[0..VLEN-1] = mem[inp_values_p+i : inp_values_p+i+VLEN]
            body.append(("alu", ("+", tmp_addr, self.scratch["inp_values_p"], i_const)))
            body.append(("load", ("vload", vec_val, tmp_addr)))
            for vi in range(VLEN):
                body.append(("debug", ("compare", vec_val + vi, (rnd, i + vi, "val"))))
            # 3. Gather node values (non-contiguous addresses):
            #    vec_addr[lane] = forest_values_p + vec_idx[lane]
            body.append(("valu", ("+", vec_addr, vec_forest_p, vec_idx)))
            for offset in range(VLEN):
                # One offset load per lane; the VLIW scheduler may issue
                # these in parallel.
                body.append(("load", ("load_offset", vec_node_val, vec_addr, offset)))
            for vi in range(VLEN):
                body.append(("debug", ("compare", vec_node_val + vi, (rnd, i + vi, "node_val"))))
            # 4. Vector XOR: vec_val[lane] ^= vec_node_val[lane]
            body.append(("valu", ("^", vec_val, vec_val, vec_node_val)))
            # 5. Hash
            body.extend(self.build_hash_vectorized(vec_val, vec_tmp1, vec_tmp2, rnd, i, hash_vec_consts))
            for vi in range(VLEN):
                body.append(("debug", ("compare", vec_val + vi, (rnd, i + vi, "hashed_val"))))
            # 6. Index update: idx = idx*2 + (1 if val is even else 2)
            body.append(("valu", ("%", vec_tmp1, vec_val, vec_two)))
            body.append(("valu", ("==", vec_tmp1, vec_tmp1, vec_zero)))
            body.append(("flow", ("vselect", vec_tmp3, vec_tmp1, vec_one, vec_two)))
            body.append(("valu", ("*", vec_idx, vec_idx, vec_two)))
            body.append(("valu", ("+", vec_idx, vec_idx, vec_tmp3)))
            for vi in range(VLEN):
                body.append(("debug", ("compare", vec_idx + vi, (rnd, i + vi, "next_idx"))))
            # 7. Bounds check: wrap out-of-range indices back to 0.
            body.append(("valu", ("<", vec_tmp1, vec_idx, vec_n_nodes)))
            body.append(("flow", ("vselect", vec_idx, vec_tmp1, vec_idx, vec_zero)))
            for vi in range(VLEN):
                body.append(("debug", ("compare", vec_idx + vi, (rnd, i + vi, "wrapped_idx"))))
            # 8. Store results back into the input arrays.
            body.append(("alu", ("+", tmp_addr, self.scratch["inp_indices_p"], i_const)))
            body.append(("store", ("vstore", tmp_addr, vec_idx)))
            body.append(("alu", ("+", tmp_addr, self.scratch["inp_values_p"], i_const)))
            body.append(("store", ("vstore", tmp_addr, vec_val)))
    body_instrs = self.build(body)
    self.instrs.extend(body_instrs)
    # Required to match with the yield in reference_kernel2
    self.instrs.append({"flow": [("pause",)]})