# Stub as visible here: the body is only `...`, so no behavior is shown in this view.
# Takes `x`, `weight`, `bias`, and `eps` (default 1e-5).
# NOTE(review): the name suggests layer normalization, with `weight`/`bias` as the
# affine scale/shift and `eps` as the variance stabilizer — confirm against the
# full implementation; input shapes and the normalized axis are not visible here.
def layernorm_from_scratch(x, weight, bias, eps=1e-5): ...
"""heavy-ball: v = mu*v + g; theta -= lr*v。v 初值 0(首步 v=g),对齐 PyTorch SGD。""" ...