Need help with vfpu inline assembler

Raphael · Post by **Raphael** » Sat Jun 10, 2006 1:11 am

Hi there, I hope someone is able to help me with my first try on vfpu code.

What I have is a function that performs a 8x8 iDCT (theoretically) by doing a 8x8 matrix multiply. Since the vfpu only handles up to 4x4 matrices I split this up to 8 4x4 mults and 4 matrix adds.
However my code won't compile giving the following error(s):

$ make
psp-gcc -I.. -IE:/cygwin/usr/local/pspdev/psp/sdk/../include -I. -IE:/cygwin/usr
/local/pspdev/psp/sdk/include -O3 -G0 -g -Wall -DHAVE_AV_CONFIG_H -c -o main.o
main.c
/cygdrive/c/DOKUME~1/ALEXAN~1/LOKALE~1/Temp//cc7oUgBh.s: Assembler messages:
/cygdrive/c/DOKUME~1/ALEXAN~1/LOKALE~1/Temp//cc7oUgBh.s:169: Error: bad expressi
on
/cygdrive/c/DOKUME~1/ALEXAN~1/LOKALE~1/Temp//cc7oUgBh.s:169: Error: bad expressi
on
/cygdrive/c/DOKUME~1/ALEXAN~1/LOKALE~1/Temp//cc7oUgBh.s:169: Error: illegal oper
ands `lv.q'
[... repeatedly for about 30 times]
/cygdrive/c/DOKUME~1/ALEXAN~1/LOKALE~1/Temp//cc7oUgBh.s:463: Error: VFPU registe
r conflict(M100)
/cygdrive/c/DOKUME~1/ALEXAN~1/LOKALE~1/Temp//cc7oUgBh.s:464: Error: VFPU registe
r conflict(M300)
/cygdrive/c/DOKUME~1/ALEXAN~1/LOKALE~1/Temp//cc7oUgBh.s:477: Error: VFPU registe
r conflict(M000)
/cygdrive/c/DOKUME~1/ALEXAN~1/LOKALE~1/Temp//cc7oUgBh.s:478: Error: VFPU registe
r conflict(M400)
/cygdrive/c/DOKUME~1/ALEXAN~1/LOKALE~1/Temp//cc7oUgBh.s:479: Error: VFPU registe
r conflict(M600)
/cygdrive/c/DOKUME~1/ALEXAN~1/LOKALE~1/Temp//cc7oUgBh.s:480: Error: VFPU registe
r conflict(M700)
make: *** [main.o] Error 1

And here's the function in question:

#define C2 2.0*0.99997651217454865478849954406816
#define C4 1.4142135623730950488016887242097
#define C6 2.0*0.0068538382841110272391868418010436

float __attribute__((aligned(64))) L[64] = { 1, 1, 1, 1, 1, 1, 1, 1,
1, C2-1, 1-C2+C4, C6-C4+C2-1, 1-C6+C4-C2, -1-C4+C2, 1-C2, -1,
1, -1+C4, -C4+1, -1, -1, -C4+1, -1+C4, 1,
1, C6-1, 1-C6-C4, -C2+C4+C6-1, 1+C2-C4-C6, -1+C4+C6, 1-C6, -1,
1, -1, -1, 1, 1, -1, -1, 1,
1, -C6-1, 1+C6-C4, C2+C4-C6-1, 1-C2-C4+C6, -1+C4-C6, 1+C6, -1,
1, -1-C4, C4+1, -1, -1, C4+1, -1-C4, 1,
1, -C2-1, 1+C2+C4, -C6-C4-C2-1, 1+C6+C4+C2, -1-C4-C2, 1+C2, -1 };

void fast_idct( short *block )
{
int __attribute__((aligned(64))) block32[64];
int i = 0;
for (i=0;i<64;i++)
block32 = block;

asm volatile(
/* Load the input matrix. Assumes row-major order in
memory, and each row is 16-byte aligned. Use ulv.q for
unaligned loads. */
// M100 = B(0)
"lv.q R100, 0+%0\n"
"lv.q R101, 32+%0\n"
"lv.q R102, 64+%0\n"
"lv.q R103, 96+%0\n"

// M300 = B(1)
"lv.q R300, 16+%0\n"
"lv.q R301, 48+%0\n"
"lv.q R302, 80+%0\n"
"lv.q R303, 112+%0\n"

// M000 = L(0)
"lv.q R000, 0+%1\n"
"lv.q R001, 32+%1\n"
"lv.q R002, 64+%1\n"
"lv.q R003, 96+%1\n"

// M600 = L(1)
"lv.q R600, 16+%1\n"
"lv.q R601, 48+%1\n"
"lv.q R602, 80+%1\n"
"lv.q R603, 112+%1\n"

// M400 = L(2)
"lv.q R400, 128+%1\n"
"lv.q R401, 160+%1\n"
"lv.q R402, 192+%1\n"
"lv.q R403, 224+%1\n"

// M700 = L(3)
"lv.q R700, 144+%1\n"
"lv.q R701, 176+%1\n"
"lv.q R702, 208+%1\n"
"lv.q R703, 240+%1\n"

// M200 = M100 * M000 = B(0) * L(0)
"vmmul.q M200, M100, M000\n"
// M500 = M300 * M400 = B(1) * L(2)
"vmmul.q M500, M300, M400\n"
// D(0) = M200 = M200 + M500 = B(0) * L(0) + B(1) * L(2)
"vadd.q R200, R200, R500\n"
"vadd.q R201, R201, R501\n"
"vadd.q R202, R202, R502\n"
"vadd.q R203, R203, R503\n"

"vmmul.q M100, M100, M600\n"
"vmmul.q M300, M300, M700\n"

// D(1) = M500 = M100 + M300 = B(0) * L(1) + B(1) * L(3)
"vadd.q R500, R100, R300\n"
"vadd.q R501, R101, R301\n"
"vadd.q R502, R102, R302\n"
"vadd.q R503, R103, R303\n"

// M100 = B(2)
"lv.q R100, 128+%0\n"
"lv.q R101, 160+%0\n"
"lv.q R102, 192+%0\n"
"lv.q R103, 224+%0\n"

// M300 = B(3)
"lv.q R300, 144+%0\n"
"lv.q R301, 176+%0\n"
"lv.q R302, 208+%0\n"
"lv.q R303, 240+%0\n"

"vmmul.q M000, M100, M000\n"
"vmmul.q M400, M300, M400\n"
"vmmul.q M600, M100, M600\n"
"vmmul.q M700, M300, M700\n"

// D(2)
"vadd.q R000, R000, R400\n"
"vadd.q R001, R001, R401\n"
"vadd.q R002, R002, R402\n"
"vadd.q R003, R003, R403\n"

// D(3)
"vadd.q R600, R600, R700\n"
"vadd.q R601, R601, R701\n"
"vadd.q R602, R602, R702\n"
"vadd.q R603, R603, R703\n"

/* Convert float to int (truncated) */
"vf2iz.q R200, R200, 0\n"
"vf2iz.q R201, R201, 0\n"
"vf2iz.q R202, R202, 0\n"
"vf2iz.q R203, R203, 0\n"

"vf2iz.q R500, R500, 0\n"
"vf2iz.q R501, R501, 0\n"
"vf2iz.q R502, R502, 0\n"
"vf2iz.q R503, R503, 0\n"

"vf2iz.q R000, R000, 0\n"
"vf2iz.q R001, R001, 0\n"
"vf2iz.q R002, R002, 0\n"
"vf2iz.q R003, R003, 0\n"

"vf2iz.q R600, R600, 0\n"
"vf2iz.q R601, R601, 0\n"
"vf2iz.q R602, R602, 0\n"
"vf2iz.q R603, R603, 0\n"

/* Store result; use usv.q for unaligned */
"sv.q R200, 0+%0\n"
"sv.q R201, 32+%0\n"
"sv.q R202, 64+%0\n"
"sv.q R203, 96+%0\n"

"sv.q R500, 16+%0\n"
"sv.q R501, 48+%0\n"
"sv.q R502, 80+%0\n"
"sv.q R503, 112+%0\n"

"sv.q R000, 128+%0\n"
"sv.q R001, 160+%0\n"
"sv.q R002, 192+%0\n"
"sv.q R003, 224+%0\n"

"sv.q R600, 144+%0\n"
"sv.q R601, 176+%0\n"
"sv.q R602, 208+%0\n"
"sv.q R603, 240+%0\n"
: "+m" (block32),"+m" (L));

for (i=0;i<64;i++)
block = block32;
}

So the problem seems to be with the lv.q and the operators, but I cannot see what is wrong. I also tried the offs(%op) notation, but it's the same.

I don't care about optimization for now, but if anyone has information on how to avoid stalls and such, it is appreciated nonetheless.

Thanks for every help.