I’m trying to write a SHA-2 implementation, but the result keeps coming back incorrect- I’ve tested on such things as the empty string. I implemented it in two steps, preprocessing and primary body.
template<typename T> struct Output {
std::array<T, 8> h;
};
template<typename T> struct Input {
std::array<T, 16> c;
};
template<typename T> Output<T> sha2(Input<T> in) {
T w[64];
for(int i = 0; i < 16; i++)
w[i] = in.c[i];
for(int i = 16; i < 64; i++) {
auto s0 = _rotr(w[i - 15], 7) ^ _rotr(w[i - 15], 18) ^ (w[i - 15] >> 3);
auto s1 = _rotr(w[i - 2], 17) ^ _rotr(w[i - 2], 19) ^ (w[i - 2] >> 10);
w[i] = w[i - 16] + s0 + w[i - 7] + s1;
}
static const T k[] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
static const T h[] = {
0x6a09e667,
0xbb67ae85,
0x3c6ef372,
0xa54ff53a,
0x510e527f,
0x9b05688c,
0x1f83d9ab,
0x5be0cd19
};
T loopvars[8];
for(int i = 0; i < 8; i++)
loopvars[i] = h[i];
for(int i = 0; i < 64; i++) {
auto&& la = loopvars[0];
auto&& lb = loopvars[1];
auto&& lc = loopvars[2];
auto&& ld = loopvars[3];
auto&& le = loopvars[4];
auto&& lf = loopvars[5];
auto&& lg = loopvars[6];
auto&& lh = loopvars[7];
auto s0 = _rotr(la, 2) ^ _rotr(la, 13) ^ _rotr(la, 22);
auto maj = (la & lb) ^ (la & lc) ^ (lb & lc);
auto t2 = s0 + maj;
auto s1 = _rotr(le, 6) ^ _rotr(le, 11) ^ _rotr(le, 25);
auto ch = (le & lf) ^ ((~le) & lg);
auto t1 = lh + s1 + ch + k[i] + w[i];
lh = lg;
lg = lf;
lf = le;
le = ld + t1;
ld = lc;
lc = lb;
lb = la;
la = t1 + t2;
}
Output<T> output;
for(int i = 0; i < 8; i++) {
output.h[i] = h[i] + loopvars[i];
}
return output;
}
Output<unsigned int> SHA2(std::vector<char> bytes) {
auto bitlen = bytes.size() * 8;
auto big_endian_bitlen = ::_byteswap_uint64(bitlen);
if (bitlen > 440)
throw std::runtime_error("Epic fail!");
Input<unsigned int> in;
for(int i = 0; i < 16; i++) {
in.c[i] = 0;
}
memcpy(&in.c[0], &bytes.front(), bytes.size());
in.c[bitlen / 32] |= (1 << (bitlen % 32));
// all zero by default, so no need to append the extra bits
in.c[14] = (big_endian_bitlen >> 32);
in.c[15] = big_endian_bitlen;
return sha2(in);
}
I suspect endianness error. For example, when I listed the input of the primary body, it came back as 1 .. (511x)0, which I’m pretty sure was correct. But when I tried swapping the values to respect endianness, I still did not get the correct output.
I’m fairly sure that the error is in the preprocessing step, as the primary body is endianness-independent, as far as I can tell.
Any suggestions as to where the implementation is incorrect?
Edit: Oh yes, _byteswap_uint64 is an MSVC intrinsic for endianness conversion of 64bit unsigned integer, and _rotr right rotates 32bit unsigned integers. For GCC, you just use macro or define them as functions to the equivalent GCC intrinsics.
Just for reference, the incorrect output is
de5c4195
c21e7e70
e6a365c2
77f6bc03
f651e23a
6fb9b88a
1decb688
d6fddf1f
whereas the correct output is
e3b0c442
98fc1c14
9afbf4c8
996fb924
27ae41e4
649b934c
a495991b
7852b855
Perhaps some extracts from a working implementation would be helpful — especially since it emphasizes following the FIPS description quite closely rather than any mundane considerations like efficiency. 🙂 Probably the biggest deviation is using temp[0]…temp[7] for what the FIPS calls
a,b, …hand temp[8] and temp[9] for T1 and T2.I’d tend to agree with your guess though: I’d suspect the padding routine. At least in my experience, the padding is harder to get correct than the hash routine itself (partly because it isn’t described as carefully).