Mill Computing, Inc. Forums The Mill Architecture Speculative execution Reply To: Speculative execution

Laurent_Birtz
Member
Post count: 10

For the intra case.

The goal is to compute the content of this block:


[XXXX]
[XXXX]
[XXXX]
[XXXX]

from a vector of pixels:
[XXXXXXXX]


// Fill the bs x bs destination block one row at a time.
int angle_sum = 0;
for (int y = 0; y < bs; y++)
{
    // The angle accumulates per row. Its upper bits (>>5) select the starting
    // offset in the reference vector, its low 5 bits give the 1/32 fraction.
    angle_sum += angle;
    const int ref_off = angle_sum >> 5;
    const int w1 = angle_sum & 31;
    const int row = y * bs;

    // No fractional part: plain copy (ref[ref_off+x+1] is never read here).
    if (!w1)
    {
        for (int x = 0; x < bs; x++)
            dst[row + x] = ref[ref_off + x];
        continue;
    }

    // Fractional part: weighted average of two neighbouring reference pixels,
    // rounded by adding 16 before dividing by 32.
    const int w0 = 32 - w1;
    for (int x = 0; x < bs; x++)
        dst[row + x] = (w0 * ref[ref_off + x] + w1 * ref[ref_off + x + 1] + 16) >> 5;
}

For the SAD case.


// Sum the absolute differences between the two pixel areas, row by row.
// Each source advances by its own stride once a row has been consumed.
int sad = 0;
for (int y = 0; y < height; y++)
{
    for (int x = 0; x < width; x++)
        sad += F265_ABS(src0[x] - src1[x]);

    src0 += stride0;
    src1 += stride1;
}
return sad;

For the DCT case.


// One-dimensional 8-point forward transform applied to 8 rows of 8 values.
// Input row i (src[8*i .. 8*i+7]) produces output column i
// (dst[i], dst[i+8], ..., dst[i+56]), so the result is stored transposed.
// Every output is rounded by adding 1 << (shift - 1) and then shifted right by
// 'shift'.
void fenc_dct_8_1d(int16_t *dst, int16_t *src, int shift)
{
    const int rnd = 1 << (shift - 1);

    for (int row = 0; row < 8; row++)
    {
        const int16_t *in = src + 8*row;
        int16_t *out = dst + row;

        // First butterfly stage: mirrored sums feed the even outputs, mirrored
        // differences feed the odd outputs. All inputs are read before any
        // output is written.
        const int e0 = in[0] + in[7], o0 = in[0] - in[7];
        const int e1 = in[1] + in[6], o1 = in[1] - in[6];
        const int e2 = in[2] + in[5], o2 = in[2] - in[5];
        const int e3 = in[3] + in[4], o3 = in[3] - in[4];

        // Second butterfly stage on the even half.
        const int ee0 = e0 + e3, eo0 = e0 - e3;
        const int ee1 = e1 + e2, eo1 = e1 - e2;

        // Even outputs.
        out[0]  = (64*(ee0 + ee1) + rnd) >> shift;
        out[16] = (83*eo0 + 36*eo1 + rnd) >> shift;
        out[32] = (64*(ee0 - ee1) + rnd) >> shift;
        out[48] = (36*eo0 - 83*eo1 + rnd) >> shift;

        // Odd outputs.
        out[8]  = (89*o0 + 75*o1 + 50*o2 + 18*o3 + rnd) >> shift;
        out[24] = (75*o0 - 18*o1 - 89*o2 - 50*o3 + rnd) >> shift;
        out[40] = (50*o0 - 89*o1 + 18*o2 + 75*o3 + rnd) >> shift;
        out[56] = (18*o0 - 50*o1 + 75*o2 - 89*o3 + rnd) >> shift;
    }
}

// This is the function that is implemented in assembly.
// It fetches the block residual from src and pred via fenc_get_block_residual(),
// then runs the 1-D transform twice, first into a scratch buffer and then into
// dst.
void fenc_dct_8_c(int16_t *dst, f265_pix *src, int src_stride, f265_pix *pred, int pred_stride)
{
    // Block geometry: log2 of the block size, bit depth, block size and
    // number of coefficients in the block.
    enum { log2_size = 3, bit_depth = 8, size = 1 << log2_size, nb_coeffs = 1 << (log2_size << 1) };

    // Rounding shifts used by the first and second pass respectively.
    const int first_shift = log2_size + bit_depth - 9;
    const int second_shift = log2_size + 6;

    int16_t residual[nb_coeffs], pass1[nb_coeffs];
    fenc_get_block_residual(residual, src, src_stride, pred, pred_stride, size);
    fenc_dct_8_1d(pass1, residual, first_shift);
    fenc_dct_8_1d(dst, pass1, second_shift);
}