I am writing a routines that loads 16 values in the VFPU.
The routine must check if the address in memory in 16-aligned and
if it is not, it must choose automatically the transferring int by int
The code is this:
#define GEN___Load16FloatsToMatrix(_mpTypeFunc_,_mpTag_,_mpNrMatrix_) \
\
\
_mpTypeFunc_ void _mpTag_##Load16FloatsToMatrix_m##_mpNrMatrix_ (float *Data) \
{ \
__asm__ volatile ( \
"and $5, %1, 0x0F\n" \
"bne $5, 0, _LOAD16FLOATSTOMATRIX_NOALLIGN_"#_mpNrMatrix_" \n" \
"lv.q C"#_mpNrMatrix_"00, 0 + %0\n" /* Provvedi a caricare usando il metodo veloce */ \
"lv.q C"#_mpNrMatrix_"10, 16 + %0\n" \
"lv.q C"#_mpNrMatrix_"20, 32 + %0\n" \
"lv.q C"#_mpNrMatrix_"30, 48 + %0\n" \
"b _LOAD16FLOATSTOMATRIX_ENDFUNC_"#_mpNrMatrix_" \n" \
\
"_LOAD16FLOATSTOMATRIX_NOALLIGN_"#_mpNrMatrix_": \n" \
\
"lv.s S"#_mpNrMatrix_"00, 0 + %0\n" /* Provvedi a caricare usando il metodo più lento */ \
"lv.s S"#_mpNrMatrix_"01, 4 + %0\n" \
"lv.s S"#_mpNrMatrix_"02, 8 + %0\n" \
"lv.s S"#_mpNrMatrix_"03, 12 + %0\n" \
\
"lv.s S"#_mpNrMatrix_"10, 16 + %0\n" \
"lv.s S"#_mpNrMatrix_"11, 20 + %0\n" \
"lv.s S"#_mpNrMatrix_"12, 24 + %0\n" \
"lv.s S"#_mpNrMatrix_"13, 28 + %0\n" \
\
"lv.s S"#_mpNrMatrix_"20, 32 + %0\n" \
"lv.s S"#_mpNrMatrix_"21, 36 + %0\n" \
"lv.s S"#_mpNrMatrix_"22, 40 + %0\n" \
"lv.s S"#_mpNrMatrix_"23, 44 + %0\n" \
\
"lv.s S"#_mpNrMatrix_"30, 48 + %0\n" \
"lv.s S"#_mpNrMatrix_"31, 52 + %0\n" \
"lv.s S"#_mpNrMatrix_"32, 56 + %0\n" \
"lv.s S"#_mpNrMatrix_"33, 60 + %0\n" \
\
"_LOAD16FLOATSTOMATRIX_ENDFUNC_"#_mpNrMatrix_": \n" \
\
: : "m"(*Data), "r"(*Data) : "$5"); \
\
return; \
}
// End macro
MACROGEN1d(inline, Load16FloatsToMatrix, ndEMI_)
The trouble is that CPU hangs. It seems that the method of AND 15
to check if the address in 16 aligned doesn't work.
Where is my error ?