Code: Select all
/* Emits CODE sixteen times back to back (manual unrolling helper). */
#define EXPAND_16_TIMES(CODE) \
    CODE CODE CODE CODE CODE CODE CODE CODE \
    CODE CODE CODE CODE CODE CODE CODE CODE

/*
 * Adjusts the values at pM by the values at pAdapt, element by element.
 *
 *   pM         - values updated in place.
 *   pAdapt     - values added to / subtracted from pM; only read.
 *   nDirection - positive: pM[i] += pAdapt[i];
 *                negative: pM[i] -= pAdapt[i];
 *                zero:     nothing is modified.
 *   nOrder     - element count; only whole groups of 16 are processed
 *                (nOrder >> 4 groups), any remainder is left untouched.
 *
 * Each result is narrowed back to short after the int arithmetic.
 */
void Adapt(short * pM, const short * pAdapt, int nDirection, int nOrder)
{
    const int flipped = -nDirection;   /* the sign is inverted before testing */
    int blocks = nOrder >> 4;
    int k;

    if (flipped < 0) {
        for (; blocks != 0; --blocks) {
            for (k = 0; k < 16; ++k) {
                pM[k] = (short)(pM[k] + pAdapt[k]);
            }
            pM += 16;
            pAdapt += 16;
        }
    } else if (flipped > 0) {
        for (; blocks != 0; --blocks) {
            for (k = 0; k < 16; ++k) {
                pM[k] = (short)(pM[k] - pAdapt[k]);
            }
            pM += 16;
            pAdapt += 16;
        }
    }
}
So I wrote this instead:
Code: Select all
/*
 * vfpuadd16 - inline-assembly statement that updates blockM32 from
 * blockAdapt32.
 *
 * This is an object-like macro: it takes no arguments and refers directly
 * to two variables named blockM32 and blockAdapt32, which must be in scope
 * wherever it is expanded. Both are bound as read-write memory operands
 * (%0 = blockM32, %1 = blockAdapt32).
 *
 * The body is four identical groups at byte offsets 0, 16, 32 and 48:
 *   lv.q from %0, lv.q from %1, vadd.q of the two registers, sv.q to %0.
 * Only %0 (blockM32) is ever stored to.
 *
 * Presumably these are PSP VFPU quad (4 x float) instructions, so the four
 * groups together would cover 16 floats - TODO confirm against VFPU docs.
 * NOTE(review): blockAdapt32 is declared "+m" although no instruction here
 * stores to it; an input-only constraint may be sufficient - confirm.
 * The expansion already ends in ';', so writing "vfpuadd16;" produces an
 * additional empty statement.
 */
#define vfpuadd16 \
__asm__ volatile( \
".set push\n" \
".set noreorder\n" \
"lv.q R100, 0+%0\n" \
"lv.q R000, 0+%1\n" \
"vadd.q R100, R100, R000\n" \
"sv.q R100, 0+%0\n" \
"lv.q R101, 16+%0\n" \
"lv.q R001, 16+%1\n" \
"vadd.q R101, R101, R001\n" \
"sv.q R101, 16+%0\n" \
"lv.q R102, 32+%0\n" \
"lv.q R002, 32+%1\n" \
"vadd.q R102, R102, R002\n" \
"sv.q R102, 32+%0\n" \
"lv.q R103, 48+%0\n" \
"lv.q R003, 48+%1\n" \
"vadd.q R103, R103, R003\n" \
"sv.q R103, 48+%0\n" \
".set pop\n" \
: "+m" (blockM32), \
"+m" (blockAdapt32) ) ;
/*
 * vfpusub16 - inline-assembly statement that updates blockM32 from
 * blockAdapt32.
 *
 * This is an object-like macro: it takes no arguments and refers directly
 * to two variables named blockM32 and blockAdapt32, which must be in scope
 * wherever it is expanded. Both are bound as read-write memory operands
 * (%0 = blockM32, %1 = blockAdapt32).
 *
 * The body is four identical groups at byte offsets 0, 16, 32 and 48:
 *   lv.q from %0, lv.q from %1, vsub.q of the two registers (the %0 value
 *   is the first source operand), sv.q to %0.
 * Only %0 (blockM32) is ever stored to.
 *
 * Presumably these are PSP VFPU quad (4 x float) instructions, so the four
 * groups together would cover 16 floats - TODO confirm against VFPU docs.
 * NOTE(review): blockAdapt32 is declared "+m" although no instruction here
 * stores to it; an input-only constraint may be sufficient - confirm.
 * The expansion already ends in ';', so writing "vfpusub16;" produces an
 * additional empty statement.
 */
#define vfpusub16 \
__asm__ volatile( \
".set push\n" \
".set noreorder\n" \
"lv.q R100, 0+%0\n" \
"lv.q R000, 0+%1\n" \
"vsub.q R100, R100, R000\n" \
"sv.q R100, 0+%0\n" \
"lv.q R101, 16+%0\n" \
"lv.q R001, 16+%1\n" \
"vsub.q R101, R101, R001\n" \
"sv.q R101, 16+%0\n" \
"lv.q R102, 32+%0\n" \
"lv.q R002, 32+%1\n" \
"vsub.q R102, R102, R002\n" \
"sv.q R102, 32+%0\n" \
"lv.q R103, 48+%0\n" \
"lv.q R003, 48+%1\n" \
"vsub.q R103, R103, R003\n" \
"sv.q R103, 48+%0\n" \
".set pop\n" \
: "+m" (blockM32), \
"+m" (blockAdapt32) ) ;
/*
 * Processes one group of 16 values: copies pM[0..15] and pAdapt[0..15] into
 * float scratch arrays, runs the vfpuadd16 inline-assembly macro on them
 * (vadd.q over the scratch data), and writes the result back to pM.
 *
 *   pM     - 16 shorts, updated in place.
 *   pAdapt - 16 shorts, only read.
 *
 * The scratch arrays must keep the names blockM32 / blockAdapt32: the
 * vfpuadd16 macro refers to them by name.
 */
static inline void AdaptVFPUAdd(short * pM, const short * pAdapt) {
    float __attribute__((aligned(64))) blockM32[16];
    float __attribute__((aligned(64))) blockAdapt32[16];
    int i;

    for (i = 0; i < 16; i++) {
        blockM32[i] = pM[i];
        blockAdapt32[i] = pAdapt[i];
    }

    vfpuadd16;

    for (i = 0; i < 16; i++) {
        /*
         * The sum of two shorts can lie outside short's range. Converting
         * such a float straight to short is undefined behaviour, so convert
         * to int first (the value always fits) and narrow from there, which
         * is the same int-to-short narrowing the scalar "*pM += *pAdapt"
         * performs.
         */
        pM[i] = (short)(int)blockM32[i];
    }
}
/*
 * Processes one group of 16 values: copies pM[0..15] and pAdapt[0..15] into
 * float scratch arrays, runs the vfpusub16 inline-assembly macro on them
 * (vsub.q over the scratch data), and writes the result back to pM.
 *
 *   pM     - 16 shorts, updated in place.
 *   pAdapt - 16 shorts, only read.
 *
 * The scratch arrays must keep the names blockM32 / blockAdapt32: the
 * vfpusub16 macro refers to them by name.
 */
static inline void AdaptVFPUSub(short * pM, const short * pAdapt) {
    float __attribute__((aligned(64))) blockM32[16];
    float __attribute__((aligned(64))) blockAdapt32[16];
    int i;

    for (i = 0; i < 16; i++) {
        blockM32[i] = pM[i];
        blockAdapt32[i] = pAdapt[i];
    }

    vfpusub16;

    for (i = 0; i < 16; i++) {
        /*
         * The difference of two shorts can lie outside short's range.
         * Converting such a float straight to short is undefined behaviour,
         * so convert to int first (the value always fits) and narrow from
         * there, which is the same int-to-short narrowing the scalar
         * "*pM -= *pAdapt" performs.
         */
        pM[i] = (short)(int)blockM32[i];
    }
}
/*
 * Adjusts the values at pM by the values at pAdapt in groups of 16,
 * delegating each group to AdaptVFPUAdd or AdaptVFPUSub.
 *
 *   pM         - values updated in place.
 *   pAdapt     - values combined into pM; only read.
 *   nDirection - positive: each group goes through AdaptVFPUAdd;
 *                negative: each group goes through AdaptVFPUSub;
 *                zero:     nothing is modified.
 *   nOrder     - element count; only whole groups of 16 are processed
 *                (nOrder >> 4 groups), any remainder is left untouched.
 */
void Adapt(short * pM, const short * pAdapt, int nDirection, int nOrder)
{
    const int flipped = -nDirection;   /* the sign is inverted before testing */
    int blocks = nOrder >> 4;

    if (flipped == 0) {
        return;
    }

    for (; blocks != 0; --blocks) {
        if (flipped < 0) {
            AdaptVFPUAdd(pM, pAdapt);
        } else {
            AdaptVFPUSub(pM, pAdapt);
        }
        pM += 16;
        pAdapt += 16;
    }
}