Mill Computing, Inc. › Forums › The Mill › Architecture › Speculative execution › Reply To: Speculative execution
For the intra case.
Get the content of this block.
[XXXX]
[XXXX]
[XXXX]
[XXXX]
From a vector of pixels.
[XXXXXXXX]
// Produce the block one row at a time. angle_sum grows by `angle` per row:
// its bits above the low five give the starting offset into ref, and its low
// five bits give the blend weight in 1/32 steps.
int angle_sum = 0;
for (int row = 0; row < bs; row++)
{
    angle_sum += angle;
    int ref_pos = angle_sum>>5;
    int weight = angle_sum&31;
    int out_pos = row*bs;

    if (weight == 0)
    {
        // The row lands exactly on reference entries: copy them unchanged.
        for (int col = 0; col < bs; col++)
            dst[out_pos+col] = ref[ref_pos+col];
    }
    else
    {
        // Blend each pair of adjacent reference entries; the +16 rounds the
        // 1/32-weighted sum to nearest before the shift.
        for (int col = 0; col < bs; col++)
            dst[out_pos+col] = ((32-weight)*ref[ref_pos+col] + weight*ref[ref_pos+col+1] + 16)>>5;
    }
}
For the SAD case.
// Add up the absolute differences between the two blocks, row by row.
int sad = 0;
for (int row = 0; row < height; row++)
{
    for (int col = 0; col < width; col++)
        sad += F265_ABS(src0[col] - src1[col]);

    // Move both sources on to their next row.
    src0 += stride0;
    src1 += stride1;
}
return sad;
For the DCT case.
// One 8-point pass over an 8x8 block of coefficients.
//
// Each group of 8 consecutive values in src is transformed, and its 8 results
// are stored down one column of dst (dst[i], dst[i+8], ..., dst[i+56]), so the
// output is the transpose of the row-wise transform.
//
// dst   - receives 64 values.
// src   - 64 input values, read 8 at a time.
// shift - right shift applied to every result after adding 1 << (shift - 1)
//         for rounding; must be at least 1.
void fenc_dct_8_1d(int16_t *dst, int16_t *src, int shift)
{
    // Weights applied to the mirrored sums src[k] + src[7-k]; these produce
    // output rows 0, 2, 4 and 6.
    static const int even_weights[4][4] =
    {
        { 64,  64,  64,  64 },
        { 83,  36, -36, -83 },
        { 64, -64, -64,  64 },
        { 36, -83,  83, -36 },
    };
    // Weights applied to the mirrored differences src[k] - src[7-k]; these
    // produce output rows 1, 3, 5 and 7.
    static const int odd_weights[4][4] =
    {
        { 89,  75,  50,  18 },
        { 75, -18, -89, -50 },
        { 50, -89,  18,  75 },
        { 18, -50,  75, -89 },
    };
    const int rounding = 1 << (shift - 1);

    for (int line = 0; line < 8; line++)
    {
        const int16_t *in = src + 8*line;
        int16_t *out = dst + line;

        // Read the whole input line before anything is stored.
        int sums[4], diffs[4];
        for (int k = 0; k < 4; k++)
        {
            sums[k] = in[k] + in[7-k];
            diffs[k] = in[k] - in[7-k];
        }

        // Each step emits one even and one odd output row, in ascending order.
        for (int pair = 0; pair < 4; pair++)
        {
            int even_acc = rounding, odd_acc = rounding;
            for (int k = 0; k < 4; k++)
            {
                even_acc += even_weights[pair][k]*sums[k];
                odd_acc += odd_weights[pair][k]*diffs[k];
            }
            out[16*pair] = even_acc >> shift;
            out[16*pair + 8] = odd_acc >> shift;
        }
    }
}
// Per the original note, this is the routine that is written in assembly.
//
// It fills a 64-entry buffer through fenc_get_block_residual() (presumably the
// difference between src and pred - confirm against that helper), then runs
// fenc_dct_8_1d() twice. Each pass stores its output transposed, so the second
// pass works along the other dimension and leaves the result in dst in the
// original orientation.
void fenc_dct_8_c(int16_t *dst, f265_pix *src, int src_stride, f265_pix *pred, int pred_stride)
{
    // Fixed configuration for this size: log2 of the block side and, presumably,
    // the sample bit depth.
    enum
    {
        DCT8_LG_BS = 3,
        DCT8_BD = 8,
        DCT8_BS = 1 << DCT8_LG_BS,
        DCT8_AREA = 1 << (DCT8_LG_BS << 1),
    };

    int16_t residual[DCT8_AREA], first_pass[DCT8_AREA];

    fenc_get_block_residual(residual, src, src_stride, pred, pred_stride, DCT8_BS);

    // First pass shifts by lg_bs + bd - 9, second pass by lg_bs + 6.
    fenc_dct_8_1d(first_pass, residual, DCT8_LG_BS + DCT8_BD - 9);
    fenc_dct_8_1d(dst, first_pass, DCT8_LG_BS + 6);
}