Mill Computing, Inc. Forums The Mill Architecture switches Reply To: switches

Veedrac
Participant
Post count: 25

I also tried my hand at this one for Gold, and got:


// parseval(%str): entry block. Detects an optional leading '-' and a "0x"
// prefix, then enters either the hex or the decimal digit loop with a zero
// running total.
//
// ILP of 5⅓, mostly because of the silly load latency :P.
F("parseval") %str;
    // Fetch the first three bytes up front; which of them matter depends on
    // whether the first one turns out to be a sign.
    load(%str, 0, b) %l0,
    load(%str, 1, b) %l1, // I'd fill this with loads if the two loops had the
    load(%str, 2, b) %l2; // same cycle length, but alas they do not.

    // 45 is ASCII '-'. When a sign is present, everything shifts one byte:
    // %start, %L0 and %L1 are the string start and its first two bytes with
    // the sign skipped.
    eqlb(%l0, 45) %neg,
    addp(%str, 1, b) %strplus1,
    pick(%neg, %strplus1, %str) %start,
    pick(%neg, %l1, %l0) %L0,
    pick(%neg, %l2, %l1) %L1;

    // Now we can finally start doing fun stuff.
    // 48 is ASCII '0' and 120 is ASCII 'x'; the pick below combines the two
    // tests so that %hexmode is set only for a "0x" prefix.
    rd(b(0)) %zero,
    load(%start, 2, b) %char,     // Not loading early means we take a stall in hexmode,
    eqlb(%L0,   48) %L00,         // but loading early means decimalmode gets early loads. :(
    eqlb(%L1,  120) %L1x,
    // NOTE(review): the other addp ops in this listing carry a width operand
    // (", b"); this one does not -- confirm that is intended.
    addp(%start, 2) %startplus2,
    pick(%L00, %L1x, %L00) %hexmode,
    innertr(%hexmode, "parseval$hexmode", %zero, %startplus2, %neg),
    // Unconditional fallback when the hex entry above is not taken. It takes
    // no predicate, mirroring the leavefl/br and retntr/retn pairs below; the
    // previous %decimalmode operand was never defined anywhere.
    inner("parseval$decimalmode", %zero, %startplus2, %L1, %L0, %neg);

// Hex digit loop: each iteration folds one character into the running total
// (total * 16 + digit value) and leaves for parseval$finish on the first
// character that is neither '0'-'9' nor 'a'-'f'.
//
// ILP of 9! Cycle length equals loop-carried dependency.
L("parseval$hexmode") %total %str %neg;
    // a wild %char appears, right on time!
    // (The %char consumed below is not a loop argument; it is the result of a
    // load issued earlier -- in the function header for the first iteration.)
    // NOTE(review): operand order here is (base, width, 2) whereas the header
    // loads use (base, offset, width); presumably the trailing 2 is a retire
    // delay matching the 2-cycle loop -- confirm, and confirm which byte this
    // load addresses relative to the header's load(%start, 2, b).
    load  (%str, b, 2) %char,     // Ideally this load would be fetching further ahead,
    gequ  (%char,  48) %digitlo,  // but we want one byte every 2 cycles, whereas the
    lequ  (%char,  57) %digithi,  // other branch wants one every 3 cycles, so you can't
    gequ  (%char,  97) %letterlo, // hoist the setup into the header.
    lequ  (%char, 102) %letterhi,
    // 48..57 are ASCII '0'..'9'; 97..102 are ASCII 'a'..'f'. Subtracting 87
    // (= 97 - 10) maps 'a'..'f' onto 10..15. Both candidate values are
    // computed speculatively and one is selected afterwards.
    sub   (%char,  48) %digitval,
    sub   (%char,  87) %letterval,
    shiftl(%total,  4) %shifttot,
    addp  (%str, 1, b) %newstr,
    // pick(a, b, a) yields b when a is set and a otherwise, i.e. a AND b.
    pick  (%digitlo,  %digithi,  %digitlo)  %digit,
    pick  (%letterlo, %letterhi, %letterlo) %letter;

    orl   (%digit,    %letter)    %inrange,
    add   (%shifttot, %digitval)  %digittot,
    add   (%shifttot, %letterval) %lettertot,
    pick  (%digit, %digittot, %lettertot) %newtotal,
    // The first three conform operands line up with the loop arguments
    // (%total %str %neg) for the next iteration; the old %total and %char
    // are carried along for the exit below.
    conform(%newtotal, %newstr, %neg, %total, %char),
    // Presumably leaves when %inrange is false ("fl"), handing the total
    // accumulated before this character, plus the character itself, to the
    // suffix handling -- confirm the leavefl polarity.
    leavefl(%inrange, "parseval$finish", %total, %char, %neg),
    br("parseval$hexmode");

// Decimal digit loop: each iteration folds one character into the running
// total (total * 10 + digit value) and leaves for parseval$finish on the
// first character outside '0'-'9'. Characters are carried two deep in the
// loop arguments: %char is processed now, %newchar on the next iteration.
//
// ILP of only 4⅓. Cycle length equals loop-carried dependency.
// Not much one can do when the multiply is slow... :(
L("parseval$decimalmode") %total %str %newchar %char %neg;
    // A wild %newnewchar appears... six cycles early.
    // That's pretty lame, but fixing it takes a branch.
    // NOTE(review): operand order here is (base, width, 3) whereas the header
    // loads use (base, offset, width); presumably the trailing 3 is a retire
    // delay matching the 3-cycle loop -- confirm.
    load  (%str, b, 3) %newnewchar,
    gequ  (%char,  48) %digitlo,   // 48..57 are ASCII '0'..'9'
    lequ  (%char,  57) %digithi,
    addp  (%str, 1, b) %newstr,
    sub   (%char,  48) %digitval,
    // total * 10 is built as (total << 1) + (total << 3) instead of a multiply.
    shiftl(%total,  1) %totalx2,
    shiftl(%total,  3) %totalx8,
    // pick(a, b, a) yields b when a is set and a otherwise, i.e. a AND b.
    pick  (%digitlo, %digithi,  %digitlo) %digit,
    // Presumably leaves when %digit is false ("fl"), handing the total
    // accumulated before this character, plus the character itself, to the
    // suffix handling -- confirm the leavefl polarity.
    leavefl(%digit, "parseval$finish", %total, %char, %neg);

    // This is a very sad instruction.
    add   (%totalx2,  %totalx8)  %totalx10;

    add   (%totalx10, %digitval) %newtotal,
    // The five rescued values line up with this loop's arguments
    // (%total %str %newchar %char %neg) for the next iteration.
    rescue(%newtotal, %newstr, %newnewchar, %newchar, %neg),
    // Back-edge of the decimal loop. It previously targeted
    // "parseval$hexmode", which takes three arguments, not these five, and
    // would have parsed the remaining digits as hex.
    br("parseval$decimalmode");

// Common exit for both digit loops: applies the sign, then scales the result
// according to the character that terminated the digit run ('K' or 'M'), and
// returns.
//
// ILP of 4½. Meh, it works.
L("parseval$finish") %total %char %neg;
    // a wild %fakechar appears
    eqlb(%char, 75) %K,   // 75 is ASCII 'K'
    eqlb(%char, 77) %M,   // 77 is ASCII 'M'
    neg(%total) %negtotal,
    pick(%neg, %negtotal, %total) %totalsigned;

    // A shift by 10 multiplies by 1024 and a shift by 20 by 1048576; both
    // scaled values are computed speculatively and one result is selected.
    shiftl(%totalsigned, 10) %totalK,
    shiftl(%totalsigned, 20) %totalM,
    pick(%K, %totalK, %totalsigned) %totalelse,
    // Conditional return for the 'M' suffix, followed by the unconditional
    // return covering both the 'K' and the no-suffix case.
    retntr(%M, %totalM),
    retn(%totalelse);

Interesting architecture indeed. I’ve not handled the busywork belt numbers, but I’ve checked they’re in a consistent position. I expect one’s better off having a branch before entering either loop where you flood it with correctly-delayed loads ahead of time; that way you don’t risk stalls in the loop itself, but it adds a cycle (and a jump) to every call to parseval. Delayed loads don’t work because the loop count is different, and tagged loads don’t work because they don’t pipeline.

  • This reply was modified 7 years, 7 months ago by  Veedrac.