Code: Select all
/////////////////////////////////////////////////////////////
// VFPU diggins
/////////////////
//
// Authors :
//
// hlide, Raphael
//
// 2006-11-17 01:05PM
//
/////////////////////////////////////////////////////////////
op operands ticks latency*
-----------------------------------------
mtv rt, vs.s
{
vs.s = rt; // rt is general purpose register
}
mfv rt, vs.s
{
rt = vs.s; // rt is general purpose register
}
-----------------------------------------
mtvc rt, vcr
{
vcr = rt; // vcr is cop2 control register
}
mfvc rt, vcr
{
rt = vcr; // vcr is cop2 control register
}
-----------------------------------------
vmtvc vcr, vs.s
{
vcr = vs.s;
}
vmfvc sd, vcr
{
sd = vcr;
}
-----------------------------------------
// rm is general purpose register containing a memory address
lv.s vd.s, offset(rm)
{
vd.s = offset(rm);
}
sv.s vd.s, offset(rm)
{
offset(rm) = vd.s;
}
// rm needs to be aligned to 16bytes (quadword)
lv.q vd, rm 1 0 (cache)
{ 68 (memory)
vd[0] = 0(rm);
vd[1] = 4(rm);
vd[2] = 8(rm);
vd[3] = 12(rm);
}
ulv.q vd, rm 2 0 (cache)
{ 68 (memory)
vd[0] = 0(rm);
vd[1] = 4(rm);
vd[2] = 8(rm);
vd[3] = 12(rm);
}
// rm needs to be aligned to 16bytes (quadword)
sv.q vd, rm 7 2 (cache)
{ 111 (memory)
0(rm) = vd[0];
4(rm) = vd[1];
8(rm) = vd[2];
12(rm) = vd[3];
}
usv.q vd, rm 14 4 (cache)
{ 111 (memory)
0(rm) = vd[0];
4(rm) = vd[1];
8(rm) = vd[2];
12(rm) = vd[3];
}
-----------------------------------------
// vector register prefixes
vpfxs [?0,?1,?2,?3]
// special prefix for vs like vs.q[X, X, Y, Y] - their values may be :
// x : vs[0]
// y : vs[1]
// z : vs[2]
// w : vs[3]
// -x : -vs[0]
// -y : -vs[1]
// -z : -vs[2]
// -w : -vs[3]
// |x| : |vs[0]| (absolute value of vs[0])
// |y| : |vs[1]| (absolute value of vs[1])
// |z| : |vs[2]| (absolute value of vs[2])
// |w| : |vs[3]| (absolute value of vs[3])
// 0 : constant 0
// 1 : constant 1
// 2 : constant 2
// 1/2 : constant 1/2
// 3 : constant 3
// 1/3 : constant 1/3
// 1/4 : constant 1/4
// 1/6 : constant 1/6
//
// so vmov.q vd, vs[z, |x|, 0, -x] :
// vd[0] = vs[2];
// vd[1] = |vs[0]|;
// vd[2] = 0;
// vd[3] = -vs[0];
vpfxt [?0,?1,?2,?3]
// special prefix for vt like vt.q[X, X, Y, Y] - their values may be :
// x : vt[0]
// y : vt[1]
// z : vt[2]
// w : vt[3]
// -x : -vt[0]
// -y : -vt[1]
// -z : -vt[2]
// -w : -vt[3]
// |x| : |vt[0]| (absolute value of vt[0])
// |y| : |vt[1]| (absolute value of vt[1])
// |z| : |vt[2]| (absolute value of vt[2])
// |w| : |vt[3]| (absolute value of vt[3])
// 0 : constant 0
// 1 : constant 1
// 2 : constant 2
// 1/2 : constant 1/2
// 3 : constant 3
// 1/3 : constant 1/3
// 1/4 : constant 1/4
// 1/6 : constant 1/6
//
vpfxd [?4,?5,?6,?7]
// special prefix for vd like vd.q[0:1, 0:1, 0:1, 0:1] - their values may be :
// 0:1 : min(1, max(0, vd[i]))
// -1:1 : min(1, max(-1, vd[i]))
// m : ???
//
// so vmov.p vd[0:1, -1:1], vs :
// vd[0] = min(1, max(0, vs[0]));
// vd[1] = min(1, max(-1, vs[1]));
-----------------------------------------
vadd.q/t/p/s vd, vs, vt 1 0
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = vs[i] + vt[i];
}
vsub.q/t/p/s vd, vs, vt 1 0
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = vs[i] - vt[i];
}
-----------------------------------------
vdiv.q/t/p/s vd, vs, vt 56/42/28/14 30/?/?/?
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = vs[i] / vt[i];
}
vmul.q/t/p/s vd, vs, vt 1 0
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = vs[i] * vt[i];
}
-----------------------------------------
vdot.q/t/p/s sd.s, vs, vt 1 0
{
sd.s = 0;
for (i = 0; i < |q/t/p/s|; ++i)
sd.s += vs[i] * vt[i];
}
-----------------------------------------
vscl.q/t/p/s vd, vs, vt.s 1 0
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = vs[i] * vt.s;
}
-----------------------------------------
// Homogeneous dot product
vhdp.q/t/p/s vd.s, vs, vt (UNSURE) 1 0
{
vd.s = vt[|q/t/p|-1];
for (i = 0; i < |q/t/p|-1; ++i)
vd.s += vs[i] * vt[i];
}
-----------------------------------------
vcmp.q/t/p/s f2, vs, vt 1 0
{
for (i = 0; i < 5; ++i)
VFPU_CC[i] = 0;
VFPU_CC[5] = 1;
for (i = 0; i < |q/t/p|; ++i)
VFPU_CC[i] = bcmp(f2, vs[i], vt[i]); // f2 = EQ/NE/LE/LT/GE/GT
for (i = 0; i < |q/t/p|; ++i)
{
VFPU_CC[4] ||= VFPU_CC[i];
VFPU_CC[5] &&= VFPU_CC[i];
}
}
vcmp.q/t/p/s f1, vs 1 0
{
for (i = 0; i < 5; ++i)
VFPU_CC[i] = 0;
VFPU_CC[5] = 1;
for (i = 0; i < |q/t/p|; ++i)
VFPU_CC[i] = ucmp(f1, vs[i]); // f1 = EN/EI/EZ/ES/NN/NI/NZ/NS
for (i = 0; i < |q/t/p|; ++i)
{
VFPU_CC[4] ||= VFPU_CC[i];
VFPU_CC[5] &&= VFPU_CC[i];
}
}
vcmp.q/t/p/s f0
{
for (i = 0; i < 5; ++i)
VFPU_CC[i] = 0;
VFPU_CC[5] = 1;
for (i = 0; i < |q/t/p|; ++i)
VFPU_CC[i] = f0; // f0 = TR/FL
for (i = 0; i < |q/t/p|; ++i)
{
VFPU_CC[4] ||= VFPU_CC[i];
VFPU_CC[5] &&= VFPU_CC[i];
}
}
-----------------------------------------
vmin.q/t/p/s vd, vs, vt 1 0
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = min(vs[i], vt[i]);
}
vmax.q/t/p/s vd, vs, vt 1 0
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = max(vs[i], vt[i]);
}
-----------------------------------------
vsgn.q/t/p/s vd, vs 1 0
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = (vs[i] < 0.0) ? -1.0 : (vs[i] > 0.0) ? 1.0 : 0.0;
}
-----------------------------------------
vcst.q/t/p/s vd, VFPU_SPC_CST 1 0
{
// VFPU_HUGE = Inf
// VFPU_SQRT2 = SQRT(2)
// VFPU_SQRT1_2 = SQRT(1/2)
// VFPU_2_SQRTPI = 2/SQRT(PI)
// VFPU_2_PI = 2/PI
// VFPU_1_PI = 1/PI
// VFPU_PI_4 = PI/4
// VFPU_PI_2 = PI/2
// VFPU_PI = PI
// VFPU_E = e
// VFPU_LOG2E = log2(e)
// VFPU_LOG10E = log10(e)
// VFPU_LN2 = ln(2)
// VFPU_LN10 = ln(10)
// VFPU_2PI = 2*PI
// VFPU_PI_6 = PI/6
// VFPU_LOG10TWO = log10(2)
// VFPU_LOG2TEN = log2(10)
// VFPU_SQRT3_2 = sqrt(3)/2
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = vfpu_special_constant[VFPU_SPC_CST];
}
-----------------------------------------
vscmp.q/t/p/s vd, vs, vt 1 0
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = (vs[i] < vt[i]) ? -1.0 : (vs[i] > vt[i]) ? 1.0 : 0.0;
}
vsge.q/t/p/s vd, vs, vt 1 0
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = (vs[i] >= vt[i]) ? 1.0 : 0.0;
}
vslt.q/t/p/s vd, vs, vt 1 0
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = (vs[i] < vt[i]) ? 1.0 : 0.0;
}
-----------------------------------------
vi2uc.q vd.s, vs.q 1 0
{
vd.s[0]( 0.. 7) = vs.q[0] & 0xFF;
vd.s[0]( 8..15) = vs.q[1] & 0xFF;
vd.s[0](16..23) = vs.q[2] & 0xFF;
vd.s[0](24..31) = vs.q[3] & 0xFF;
}
vi2c.q vd.s, vs.q 1 0
{
vd.s[0]( 0.. 7) = (vs.q[0] & 0x7F) | ((vs.q[0] & 0x80000000) >> 24);
vd.s[0]( 8..15) = (vs.q[1] & 0x7F) | ((vs.q[1] & 0x80000000) >> 24);
vd.s[0](16..23) = (vs.q[2] & 0x7F) | ((vs.q[2] & 0x80000000) >> 24);
vd.s[0](24..31) = (vs.q[3] & 0x7F) | ((vs.q[3] & 0x80000000) >> 24);
}
-----------------------------------------
vmov.q/t/p/s vd, vs 1 0
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = vs[i];
}
-----------------------------------------
vabs.q/t/p/s vd, vs 1 0
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = |vs[i]|;
}
-----------------------------------------
vneg.q/t/p/s vd, vs 1 0
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = -vs[i];
}
-----------------------------------------
vsat0.q/t/p/s vd, vs 1 0
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = max(0.0, min(vs[i], 1.0));
}
vsat1.q/t/p/s vd, vs 1 0
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = max(-1.0, min(vs[i], 1.0));
}
-----------------------------------------
vzero.q/t/p/s vd 3/?/?/? 2
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = 0.0;
}
vone.q/t/p/s vd 3/?/?/? 2
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = 1.0;
}
vidt.q/t/p/s vd 3/?/?/? 2
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = (vd[i].column == vd[i].row) ? 1.0 : 0.0;
}
-----------------------------------------
vrcp.q/t/p/s vd, vs 4/?/?/? 3
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = 1.0 / vs[i];
}
vrsq.q/t/p/s vd, vs 4/?/?/? 3
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = 1.0 / sqrt(vs[i]);
}
-----------------------------------------
vsin.q/t/p/s vd, vs 4/?/?/? 3
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = sin(vs[i]*PI/2);
}
vcos.q/t/p/s vd, vs 4/?/?/? 3
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = cos(vs[i]*PI/2);
}
vasin.q/t/p/s vd, vs 4/?/?/? 3
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = asin(vs[i]) * 2/PI; // not sure about this conversion
}
-----------------------------------------
vexp2.q/t/p/s vd, vs 4/?/?/? 3
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = exp2(vs[i]);
}
vlog2.q/t/p/s vd, vs 4/?/?/? 3
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = log2(vs[i]);
}
-----------------------------------------
vsqrt.q/t/p/s vd, vs 4/?/?/? 3
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = sqrt(vs[i]);
}
-----------------------------------------
vrnds.s vs ? ?
{
random_seed(vs);
}
-----------------------------------------
vrndi.q/t/p/s vd 12/9/6/3 10/7/4/1
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = rand_integer(-1<<31, 1<<31); // -1<<31 <= vd[i] < 1<<31
}
-----------------------------------------
vrndf1.q/t/p/s vd 12/9/6/3 10/7/4/1
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = rand_float(0.0, 2.0); // 0.0 <= vd[i] < 2.0
}
-----------------------------------------
vrndf2.q/t/p/s vd 12/9/6/3 10/7/4/1
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = rand_float(0.0, 4.0); // 0.0 <= vd[i] < 4.0
}
-----------------------------------------
// Nvidia Half format [S:1][E:5][M:10]
vf2h.p/q vd, vs (UNSURE) 1 0
{
for (i = 0; i < |q/p|/2; ++i)
{
vd[i]( 0..15) = ((vs[i*2] >> 16) & 0x8000) | ((vs[i*2] >> 13) & 0x03FF);
e = ((vs[i*2] >> 23) & 0xFF) - 0x70;
if (e < 0)
e = 0;
if (e > 31)
{
e = 31;
vd[i] &= ~0x03FF; // -> make too huge numbers infinity
if (((vs[i*2] & 0x7FFFFF) != 0) && (((vs[i*2] >> 23) & 0xFF) == 0xFF))
vd[i] |= 0x03FF; // -> But NaNs stay NaNs even with mantissa loss
}
vd[i] |= (e << 10);
vd[i](16..31) = ((vs[i*2+1] >> 16) & 0x8000) | ((vs[i*2+1] >> 13) & 0x03FF);
e = ((vs[i*2+1] >> 23) & 0xFF) - 0x70;
if (e < 0)
e = 0;
if (e > 31)
{
e = 31;
vd[i] &= ~0x03FF0000; // -> make too huge numbers infinity
if (((vs[i*2+1] & 0x7FFFFF) != 0) && (((vs[i*2+1] >> 23) & 0xFF) == 0xFF))
vd[i] |= 0x03FF0000; // -> But NaNs stay NaNs even with mantissa loss
}
vd[i] |= (e << 26);
}
}
-----------------------------------------
vsrt1.q vd, vs 1 0
{
vd[0] = min(vs[0], vs[1]);
vd[1] = max(vs[1], vs[0]);
vd[2] = min(vs[2], vs[3]);
vd[3] = max(vs[3], vs[2]);
}
vsrt2.q vd, vs 1 0
{
vd[0] = min(vs[0], vs[3]);
vd[1] = min(vs[1], vs[2]);
vd[2] = max(vs[2], vs[1]);
vd[3] = max(vs[3], vs[0]);
}
vsrt3.q vd, vs 1 0
{
vd[0] = max(vs[0], vs[1]);
vd[1] = min(vs[1], vs[0]);
vd[2] = max(vs[2], vs[3]);
vd[3] = min(vs[3], vs[2]);
}
vsrt4.q vd, vs 1 0
{
vd[0] = max(vs[0], vs[3]);
vd[1] = max(vs[1], vs[2]);
vd[2] = min(vs[2], vs[1]);
vd[3] = min(vs[3], vs[0]);
}
-----------------------------------------
vbfy1.q/p vd, vs 1 0
{
for (i = 0; i < |q/p|; i += 2)
{
vd[i+0] = vs[i+0] + vs[i+1];
vd[i+1] = vs[i+0] - vs[i+1];
}
}
vbfy2.q vd, vs 1 0
{
vd[0] = vs[0] + vs[2];
vd[1] = vs[1] + vs[3];
vd[2] = vs[0] - vs[2];
vd[3] = vs[1] - vs[3];
}
-----------------------------------------
vocp.q/t/p/s vd, vs 1 0
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = 1.0 - vs[i];
}
-----------------------------------------
// Funnel add components
vfad.q/t/p/s vd.s, vs 1 0
{
vd.s = 0;
for (i = 0; i < |q/t/p/s|; ++i)
vd.s += vs[i];
}
-----------------------------------------
// Average of components
vavg.q/t/p/s vd.s, vs 1 0
{
vd.s = 0.0;
for (i = 0; i < |q/t/p/s|; ++i)
vd.s += vs[i];
vd.s /= |q/t/p/s|;
}
-----------------------------------------
// Round
vf2in.q/t/p/s vd, vs, imm 1 0
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = ROUND(vs[i]) << imm;
}
-----------------------------------------
// Trunc
vf2iz.q/t/p/s vd, vs, imm 1 0
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = TRUNC(vs[i]) << imm;
}
-----------------------------------------
// Ceil (round up)
vf2iu.q/t/p/s vd, vs, imm 1 0
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = CEIL(vs[i]) << imm;
}
-----------------------------------------
// Floor (round down)
vf2id.q/t/p/s vd, vs, imm 1 0
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = FLOOR(vs[i]) << imm;
}
-----------------------------------------
// (float)
vi2f.q/t/p/s vd, vs, imm 1 0
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = (float)(vs[i]) / (float)(1<<imm);
}
-----------------------------------------
// Conditional move vector on true
vcmovt.q/t/p/s vd, vs, cc (UNSURE) 5 4
{
switch (cc)
{
case 0...5 :
if (CC[cc] == TRUE)
vd = vs;
break;
case 6:
for (i = 0; i < |q/t/p/s|; ++i)
if (CC[i] == TRUE)
vd[i] = vs[i];
break;
}
}
// Conditional move vector on false
vcmovf.q/t/p/s vd, vs, cc (UNSURE) 5 4
{
switch (cc)
{
case 0...5 :
if (CC[cc] == FALSE)
vd = vs;
break;
case 6:
for (i = 0; i < |q/t/p/s|; ++i)
if (CC[i] == FALSE)
vd[i] = vs[i];
break;
}
}
-----------------------------------------
// Matrix multiplication
vmmul.q/t/p md, ms, mt 16/8/4 15/7/3
{
for (i = 0; i < |q/t/p|; ++i)
for (j = 0; j < |q/t/p|; ++j)
{
md[i][j] = 0;
for (k = 0; k < |q/t/p|; ++k)
md[i][j] += ms[i][k] * mt[k][j];
}
}
-----------------------------------------
// Matrix-vector transform
vtfm4.q/3.t/2.p vd, md, vt 4/3/2 3/2/1
{
for (i = 0; i < |q/t/p|; ++i)
{
vd[i] = 0;
for (j = 0; j < |q/t/p|; ++j)
vd[i] += md[i][j] * vt[j];
}
}
-----------------------------------------
// Homogeneous transform
vhtfm4.q/3.t/2.p vd, md, vt 4/3/2 3/2/1
{
for (i = 0; i < |q/t/p|; ++i)
vd[i] = 0;
for (j = 0; j < |q/t/p|; ++j)
vd[i] += md[i][j] * vt[j];
for (i = 0; i < |q/t/p|; ++i)
vd[i] /= vd[|q/t/p|];
}
-----------------------------------------
// Matrix scale
vmscl.q/t/p md, ms, vt.s 4/3/2 3/2/1
{
for (i = 0; i < |q/t/p|; ++i)
for (j = 0; j < |q/t/p|; ++j)
md[i][j] = ms[i][j] * vt.s;
}
-----------------------------------------
// Quaternion multiply
vqmul.q vd, vs, vt 4 3
{
vd[0] = vs[3] * vt[0] + vs[0] * vt[3] + vs[1] * vt[2] - vs[2] * vt[1];
vd[1] = vs[3] * vt[1] + vs[1] * vt[3] + vs[2] * vt[0] - vs[0] * vt[2];
vd[2] = vs[3] * vt[2] + vs[2] * vt[3] + vs[0] * vt[1] - vs[1] * vt[0];
vd[3] = vs[3] * vt[3] - vs[0] * vt[0] - vs[1] * vt[1] - vs[2] * vt[2];
}
-----------------------------------------
// Matrix move
vmmov.q/t/p md, ms 4/3/2 3/2/1
{
for (i = 0; i < |q/t/p|; ++i)
for (j = 0; j < |q/t/p|; ++j)
md[i][j] = ms[i][j];
}
-----------------------------------------
// Matrix Identity
vmidt.q/t/p md 6/5/4 5/4/3
{
for (i = 0; i < |q/t/p|; ++i)
for (j = 0; j < |q/t/p|; ++j)
md[i][j] = (i == j) ? 1.0 : 0.0;
}
-----------------------------------------
// Matrix-zero
vmzero.q/t/p md 6/5/4 5/4/3
{
for (i = 0; i < |q/t/p|; ++i)
for (j = 0; j < |q/t/p|; ++j)
md[i][j] = 0.0;
}
-----------------------------------------
// Matrix-one
vmone.q/t/p md 6/5/4 5/4/3
{
for (i = 0; i < |q/t/p|; ++i)
for (j = 0; j < |q/t/p|; ++j)
md[i][j] = 1.0;
}
-----------------------------------------
// Rotation vector
vrot.q/t/p vd, vs.s, [+c/-c/-s/+s/0,...] 2 1
{
for (i = 0; i < |q/t/p|; ++i)
vd[i] = (+1.0 | -1.0) * (cos | sin)(vs.s*PI/2.0) | 0;
}
-----------------------------------------
vt4444.q vd, vs 1 0
{
vd[0]( 0..15) = ((vs[0] & 0xF0000000) >> 16) | ((vs[0] & 0xF00000) >> 12) | ((vs[0] & 0xF000) >> 8) | ((vs[0] & 0xF0) >> 4);
vd[0](16..31) = ((vs[1] & 0xF0000000) >> 16) | ((vs[1] & 0xF00000) >> 12) | ((vs[1] & 0xF000) >> 8) | ((vs[1] & 0xF0) >> 4);
vd[1]( 0..15) = ((vs[2] & 0xF0000000) >> 16) | ((vs[2] & 0xF00000) >> 12) | ((vs[2] & 0xF000) >> 8) | ((vs[2] & 0xF0) >> 4);
vd[1](16..31) = ((vs[3] & 0xF0000000) >> 16) | ((vs[3] & 0xF00000) >> 12) | ((vs[3] & 0xF000) >> 8) | ((vs[3] & 0xF0) >> 4);
}
-----------------------------------------
vt5551.q vd, vs 1 0
{
vd[0]( 0..15) = ((vs[0] & 0x80000000) >> 16) | ((vs[0] & 0xF80000) >> 9) | ((vs[0] & 0xF800) >> 6) | ((vs[0] & 0xF8) >> 3);
vd[0](16..31) = ((vs[1] & 0x80000000) >> 16) | ((vs[1] & 0xF80000) >> 9) | ((vs[1] & 0xF800) >> 6) | ((vs[1] & 0xF8) >> 3);
vd[1]( 0..15) = ((vs[2] & 0x80000000) >> 16) | ((vs[2] & 0xF80000) >> 9) | ((vs[2] & 0xF800) >> 6) | ((vs[2] & 0xF8) >> 3);
vd[1](16..31) = ((vs[3] & 0x80000000) >> 16) | ((vs[3] & 0xF80000) >> 9) | ((vs[3] & 0xF800) >> 6) | ((vs[3] & 0xF8) >> 3);
}
-----------------------------------------
vt5650.q vd, vs 1 0
{
vd[0]( 0..15) = ((vs[0] & 0xF80000) >> 8) | ((vs[0] & 0xFC00) >> 5) | ((vs[0] & 0xF8) >> 3);
vd[0](16..31) = ((vs[1] & 0xF80000) >> 8) | ((vs[1] & 0xFC00) >> 5) | ((vs[1] & 0xF8) >> 3);
vd[1]( 0..15) = ((vs[2] & 0xF80000) >> 8) | ((vs[2] & 0xFC00) >> 5) | ((vs[2] & 0xF8) >> 3);
vd[1](16..31) = ((vs[3] & 0xF80000) >> 8) | ((vs[3] & 0xFC00) >> 5) | ((vs[3] & 0xF8) >> 3);
}
-----------------------------------------
vcrs.t vd, vs, vt 1 0
{
vd[0] = vs[1] * vt[2];
vd[1] = vs[2] * vt[0];
vd[2] = vs[0] * vt[1];
}
-----------------------------------------
// Negative reciprocal
vnrcp.q/t/p/s vd, vs (UNSURE) 4/?/?/? 3
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = -1.0 / vs[i];
}
-----------------------------------------
// Negative sine
vnsin.q/t/p/s vd, vs (UNSURE) 4/?/?/? 3
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = -sin(vs[i]*PI/2);
}
-----------------------------------------
// Reciprocal exponent to base 2
vrexp2.q/t/p/s vd, vs 4/?/?/? 3
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = 1.0 / exp2(vs[i]);
}
-----------------------------------------
// Vector cross-product
vcrsp.t vd, vs, vt 3 2
{
vd[0] = vs[1]*vt[2] - vs[2]*vt[1];
vd[1] = vs[2]*vt[0] - vs[0]*vt[2];
vd[2] = vs[0]*vt[1] - vs[1]*vt[0];
}
-----------------------------------------
// Vector determinant
vdet.p vd.s, vs, vt 1 0
{
vd.s = vs[0] * vt[1] - vs[1] * vt[0];
}
-----------------------------------------
v(u)s2i.s vd.p, vs.s 1 0
{
vd.p[0] = (vs.s[0](16..31)) << 16;
vd.p[1] = (vs.s[0]( 0..15)) << 16;
}
v(u)s2i.p vd.q, vs.p 1 0
{
vd.q[0] = (vs.p[0](16..31)) << 16;
vd.q[1] = (vs.p[0]( 0..15)) << 16;
vd.q[2] = (vs.p[1](16..31)) << 16;
vd.q[3] = (vs.p[1]( 0..15)) << 16;
}
-----------------------------------------
vi2(u)s.s vd.s, vs.p 1 0
{
vd.s[0](16..31) = vs.p[0] >> 16;
vd.s[0]( 0..15) = vs.p[1] >> 16;
}
vi2(u)s.p vd.p, vs.q 1 0
{
vd.p[0](16..31) = vs.q[0] >> 16;
vd.p[0]( 0..15) = vs.q[1] >> 16;
vd.p[1](16..31) = vs.q[2] >> 16;
vd.p[1]( 0..15) = vs.q[3] >> 16;
}
-----------------------------------------
// Nvidia Half format [S:1][E:5][M:10]
vh2f.p vd, vs 1 0
{
vd[0] = ((vs[0] & 0x8000) << 16) | ((((vs[0] >> 10) & 0x1F) + 0x70) << 23) | ((vs[0] & 0x03FF) << 13);
vd[1] = (vs[0] & 0x80000000) | ((((vs[0] >> 10) & 0x1F0000) + 0x700000) << 7) | ((vs[0] & 0x03FF0000) >> 3);
vd[2] = ((vs[1] & 0x8000) << 16) | ((((vs[1] >> 10) & 0x1F) + 0x70) << 23) | ((vs[1] & 0x03FF) << 13);
vd[3] = (vs[1] & 0x80000000) | ((((vs[1] >> 10) & 0x1F0000) + 0x700000) << 7) | ((vs[1] & 0x03FF0000) >> 3);
}
-----------------------------------------
vsocp.p/s vd.q/p, vs.p/s 1 0
{
for (i = 0; i < |p/s|; ++i)
vd[i*2+0] = 1.0 - vs[i];
vd[i*2+1] = vs[i];
}
-----------------------------------------
vsbz.s vd.s, vs.s 1 0
{
// TODO Byte To Short Extension ?
}
vsbn.s vd.s, vs.s, vt.s 1 0
{
// TODO Byte to Short Extension ?
}
vlgb.s vd.s, vs.s 1 0
{
// TODO
}
vwbn.s vd.s, vs.s, imm 1 0
{
// TODO Byte to Word Extension ?
}
-----------------------------------------
viim.s vd.s, constant integer 1 0
{
vd.s = constant integer (between -32768 and 32767 ?);
}
vfim.s vd.s, constant real 1 0
{
vd.s = constant real;
}
-----------------------------------------
vnop 1 0
{
// do nothing except eating 1 cycle
}
-----------------------------------------
vflush 5 4
{
// TODO
}
vsync 4 3
{
// TODO
}
vsync i 1 0
{
// TODO
}
NOTES:
(UNSURE) beside an op means the given C counterpart is questionable
Clock ticks are benched estimates, but should be accurate.
*The latency column is to be understood like this:
the exec cost is the (clock) ticks minus the latency and is an unavoidable cost, while latency is the 'playroom' to interleave
the code with other (independent) ops without additional costs.
Unfortunately, this does not seem to work with VFPU ops - so either the VFPU isn't pipelined or most ops with latency
just use the whole pipeline already. It works however with normal mips code (that's how it was benched). This code
interleaving is recommended especially with matrix and other costly ops.