00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 15. July 2011 00005 * $Revision: V1.0.10 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_fast_q31.c 00009 * 00010 * Description: Q31 Convolution (fast version). 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Version 1.0.10 2011/7/15 00015 * Big Endian support added and Merged M0 and M3/M4 Source code. 00016 * 00017 * Version 1.0.3 2010/11/29 00018 * Re-organized the CMSIS folders and updated documentation. 00019 * 00020 * Version 1.0.2 2010/11/11 00021 * Documentation updated. 00022 * 00023 * Version 1.0.1 2010/10/05 00024 * Production release and review comments incorporated. 00025 * 00026 * Version 1.0.0 2010/09/20 00027 * Production release and review comments incorporated. 00028 * -------------------------------------------------------------------- */ 00029 00030 #include "arm_math.h" 00031 00068 void arm_conv_fast_q31( 00069 q31_t * pSrcA, 00070 uint32_t srcALen, 00071 q31_t * pSrcB, 00072 uint32_t srcBLen, 00073 q31_t * pDst) 00074 { 00075 q31_t *pIn1; /* inputA pointer */ 00076 q31_t *pIn2; /* inputB pointer */ 00077 q31_t *pOut = pDst; /* output pointer */ 00078 q31_t *px; /* Intermediate inputA pointer */ 00079 q31_t *py; /* Intermediate inputB pointer */ 00080 q31_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00081 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00082 q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */ 00083 uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counter */ 00084 00085 00086 /* The algorithm implementation is based on the lengths of the inputs. */ 00087 /* srcB is always made to slide across srcA. */ 00088 /* So srcBLen is always considered as shorter or equal to srcALen */ 00089 if(srcALen >= srcBLen) 00090 { 00091 /* Initialization of inputA pointer */ 00092 pIn1 = pSrcA; 00093 00094 /* Initialization of inputB pointer */ 00095 pIn2 = pSrcB; 00096 } 00097 else 00098 { 00099 /* Initialization of inputA pointer */ 00100 pIn1 = pSrcB; 00101 00102 /* Initialization of inputB pointer */ 00103 pIn2 = pSrcA; 00104 00105 /* srcBLen is always considered as shorter or equal to srcALen */ 00106 j = srcBLen; 00107 srcBLen = srcALen; 00108 srcALen = j; 00109 } 00110 00111 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00112 /* The function is internally 00113 * divided into three stages according to the number of multiplications that has to be 00114 * taken place between inputA samples and inputB samples. In the first stage of the 00115 * algorithm, the multiplications increase by one for every iteration. 00116 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00117 * In the third stage of the algorithm, the multiplications decrease by one 00118 * for every iteration. */ 00119 00120 /* The algorithm is implemented in three stages. 00121 The loop counters of each stage is initiated here. */ 00122 blockSize1 = srcBLen - 1u; 00123 blockSize2 = srcALen - (srcBLen - 1u); 00124 blockSize3 = blockSize1; 00125 00126 /* -------------------------- 00127 * Initializations of stage1 00128 * -------------------------*/ 00129 00130 /* sum = x[0] * y[0] 00131 * sum = x[0] * y[1] + x[1] * y[0] 00132 * .... 00133 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00134 */ 00135 00136 /* In this stage the MAC operations are increased by 1 for every iteration. 00137 The count variable holds the number of MAC operations performed */ 00138 count = 1u; 00139 00140 /* Working pointer of inputA */ 00141 px = pIn1; 00142 00143 /* Working pointer of inputB */ 00144 py = pIn2; 00145 00146 00147 /* ------------------------ 00148 * Stage1 process 00149 * ----------------------*/ 00150 00151 /* The first stage starts here */ 00152 while(blockSize1 > 0u) 00153 { 00154 /* Accumulator is made zero for every iteration */ 00155 sum = 0; 00156 00157 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00158 k = count >> 2u; 00159 00160 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00161 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00162 while(k > 0u) 00163 { 00164 /* x[0] * y[srcBLen - 1] */ 00165 sum = (q31_t) ((((q63_t) sum << 32) + 00166 ((q63_t) * px++ * (*py--))) >> 32); 00167 00168 /* x[1] * y[srcBLen - 2] */ 00169 sum = (q31_t) ((((q63_t) sum << 32) + 00170 ((q63_t) * px++ * (*py--))) >> 32); 00171 00172 /* x[2] * y[srcBLen - 3] */ 00173 sum = (q31_t) ((((q63_t) sum << 32) + 00174 ((q63_t) * px++ * (*py--))) >> 32); 00175 00176 /* x[3] * y[srcBLen - 4] */ 00177 sum = (q31_t) ((((q63_t) sum << 32) + 00178 ((q63_t) * px++ * (*py--))) >> 32); 00179 00180 /* Decrement the loop counter */ 00181 k--; 00182 } 00183 00184 /* If the count is not a multiple of 4, compute any remaining MACs here. 00185 ** No loop unrolling is used. */ 00186 k = count % 0x4u; 00187 00188 while(k > 0u) 00189 { 00190 /* Perform the multiply-accumulate */ 00191 sum = (q31_t) ((((q63_t) sum << 32) + 00192 ((q63_t) * px++ * (*py--))) >> 32); 00193 00194 /* Decrement the loop counter */ 00195 k--; 00196 } 00197 00198 /* Store the result in the accumulator in the destination buffer. */ 00199 *pOut++ = sum << 1; 00200 00201 /* Update the inputA and inputB pointers for next MAC calculation */ 00202 py = pIn2 + count; 00203 px = pIn1; 00204 00205 /* Increment the MAC count */ 00206 count++; 00207 00208 /* Decrement the loop counter */ 00209 blockSize1--; 00210 } 00211 00212 /* -------------------------- 00213 * Initializations of stage2 00214 * ------------------------*/ 00215 00216 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00217 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00218 * .... 00219 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00220 */ 00221 00222 /* Working pointer of inputA */ 00223 px = pIn1; 00224 00225 /* Working pointer of inputB */ 00226 pSrc2 = pIn2 + (srcBLen - 1u); 00227 py = pSrc2; 00228 00229 /* count is index by which the pointer pIn1 to be incremented */ 00230 count = 1u; 00231 00232 /* ------------------- 00233 * Stage2 process 00234 * ------------------*/ 00235 00236 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00237 * So, to loop unroll over blockSize2, 00238 * srcBLen should be greater than or equal to 4 */ 00239 if(srcBLen >= 4u) 00240 { 00241 /* Loop unroll over blockSize2, by 4 */ 00242 blkCnt = blockSize2 >> 2u; 00243 00244 while(blkCnt > 0u) 00245 { 00246 /* Set all accumulators to zero */ 00247 acc0 = 0; 00248 acc1 = 0; 00249 acc2 = 0; 00250 acc3 = 0; 00251 00252 /* read x[0], x[1], x[2] samples */ 00253 x0 = *(px++); 00254 x1 = *(px++); 00255 x2 = *(px++); 00256 00257 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00258 k = srcBLen >> 2u; 00259 00260 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00261 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00262 do 00263 { 00264 /* Read y[srcBLen - 1] sample */ 00265 c0 = *(py--); 00266 00267 /* Read x[3] sample */ 00268 x3 = *(px++); 00269 00270 /* Perform the multiply-accumulates */ 00271 /* acc0 += x[0] * y[srcBLen - 1] */ 00272 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); 00273 00274 /* acc1 += x[1] * y[srcBLen - 1] */ 00275 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); 00276 00277 /* acc2 += x[2] * y[srcBLen - 1] */ 00278 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32); 00279 00280 /* acc3 += x[3] * y[srcBLen - 1] */ 00281 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32); 00282 00283 /* Read y[srcBLen - 2] sample */ 00284 c0 = *(py--); 00285 00286 /* Read x[4] sample */ 00287 x0 = *(px++); 00288 00289 /* Perform the multiply-accumulate */ 00290 /* acc0 += x[1] * y[srcBLen - 2] */ 00291 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32); 00292 /* acc1 += x[2] * y[srcBLen - 2] */ 00293 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32); 00294 /* acc2 += x[3] * y[srcBLen - 2] */ 00295 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32); 00296 /* acc3 += x[4] * y[srcBLen - 2] */ 00297 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32); 00298 00299 /* Read y[srcBLen - 3] sample */ 00300 c0 = *(py--); 00301 00302 /* Read x[5] sample */ 00303 x1 = *(px++); 00304 00305 /* Perform the multiply-accumulates */ 00306 /* acc0 += x[2] * y[srcBLen - 3] */ 00307 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32); 00308 /* acc1 += x[3] * y[srcBLen - 2] */ 00309 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32); 00310 /* acc2 += x[4] * y[srcBLen - 2] */ 00311 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32); 00312 /* acc3 += x[5] * y[srcBLen - 2] */ 00313 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32); 00314 00315 /* Read y[srcBLen - 4] sample */ 00316 c0 = *(py--); 00317 00318 /* Read x[6] sample */ 00319 x2 = *(px++); 00320 00321 /* Perform the multiply-accumulates */ 00322 /* acc0 += x[3] * y[srcBLen - 4] */ 00323 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32); 00324 /* acc1 += x[4] * y[srcBLen - 4] */ 00325 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32); 00326 /* acc2 += x[5] * y[srcBLen - 4] */ 00327 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32); 00328 /* acc3 += x[6] * y[srcBLen - 4] */ 00329 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32); 00330 00331 00332 } while(--k); 00333 00334 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00335 ** No loop unrolling is used. */ 00336 k = srcBLen % 0x4u; 00337 00338 while(k > 0u) 00339 { 00340 /* Read y[srcBLen - 5] sample */ 00341 c0 = *(py--); 00342 00343 /* Read x[7] sample */ 00344 x3 = *(px++); 00345 00346 /* Perform the multiply-accumulates */ 00347 /* acc0 += x[4] * y[srcBLen - 5] */ 00348 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); 00349 /* acc1 += x[5] * y[srcBLen - 5] */ 00350 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); 00351 /* acc2 += x[6] * y[srcBLen - 5] */ 00352 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32); 00353 /* acc3 += x[7] * y[srcBLen - 5] */ 00354 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32); 00355 00356 /* Reuse the present samples for the next MAC */ 00357 x0 = x1; 00358 x1 = x2; 00359 x2 = x3; 00360 00361 /* Decrement the loop counter */ 00362 k--; 00363 } 00364 00365 /* Store the results in the accumulators in the destination buffer. */ 00366 *pOut++ = (q31_t) (acc0 << 1); 00367 *pOut++ = (q31_t) (acc1 << 1); 00368 *pOut++ = (q31_t) (acc2 << 1); 00369 *pOut++ = (q31_t) (acc3 << 1); 00370 00371 /* Update the inputA and inputB pointers for next MAC calculation */ 00372 px = pIn1 + (count * 4u); 00373 py = pSrc2; 00374 00375 /* Increment the pointer pIn1 index, count by 1 */ 00376 count++; 00377 00378 /* Decrement the loop counter */ 00379 blkCnt--; 00380 } 00381 00382 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00383 ** No loop unrolling is used. */ 00384 blkCnt = blockSize2 % 0x4u; 00385 00386 while(blkCnt > 0u) 00387 { 00388 /* Accumulator is made zero for every iteration */ 00389 sum = 0; 00390 00391 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00392 k = srcBLen >> 2u; 00393 00394 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00395 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00396 while(k > 0u) 00397 { 00398 /* Perform the multiply-accumulates */ 00399 sum = (q31_t) ((((q63_t) sum << 32) + 00400 ((q63_t) * px++ * (*py--))) >> 32); 00401 sum = (q31_t) ((((q63_t) sum << 32) + 00402 ((q63_t) * px++ * (*py--))) >> 32); 00403 sum = (q31_t) ((((q63_t) sum << 32) + 00404 ((q63_t) * px++ * (*py--))) >> 32); 00405 sum = (q31_t) ((((q63_t) sum << 32) + 00406 ((q63_t) * px++ * (*py--))) >> 32); 00407 00408 /* Decrement the loop counter */ 00409 k--; 00410 } 00411 00412 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00413 ** No loop unrolling is used. */ 00414 k = srcBLen % 0x4u; 00415 00416 while(k > 0u) 00417 { 00418 /* Perform the multiply-accumulate */ 00419 sum = (q31_t) ((((q63_t) sum << 32) + 00420 ((q63_t) * px++ * (*py--))) >> 32); 00421 00422 /* Decrement the loop counter */ 00423 k--; 00424 } 00425 00426 /* Store the result in the accumulator in the destination buffer. */ 00427 *pOut++ = sum << 1; 00428 00429 /* Update the inputA and inputB pointers for next MAC calculation */ 00430 px = pIn1 + count; 00431 py = pSrc2; 00432 00433 /* Increment the MAC count */ 00434 count++; 00435 00436 /* Decrement the loop counter */ 00437 blkCnt--; 00438 } 00439 } 00440 else 00441 { 00442 /* If the srcBLen is not a multiple of 4, 00443 * the blockSize2 loop cannot be unrolled by 4 */ 00444 blkCnt = blockSize2; 00445 00446 while(blkCnt > 0u) 00447 { 00448 /* Accumulator is made zero for every iteration */ 00449 sum = 0; 00450 00451 /* srcBLen number of MACS should be performed */ 00452 k = srcBLen; 00453 00454 while(k > 0u) 00455 { 00456 /* Perform the multiply-accumulate */ 00457 sum = (q31_t) ((((q63_t) sum << 32) + 00458 ((q63_t) * px++ * (*py--))) >> 32); 00459 00460 /* Decrement the loop counter */ 00461 k--; 00462 } 00463 00464 /* Store the result in the accumulator in the destination buffer. */ 00465 *pOut++ = sum << 1; 00466 00467 /* Update the inputA and inputB pointers for next MAC calculation */ 00468 px = pIn1 + count; 00469 py = pSrc2; 00470 00471 /* Increment the MAC count */ 00472 count++; 00473 00474 /* Decrement the loop counter */ 00475 blkCnt--; 00476 } 00477 } 00478 00479 00480 /* -------------------------- 00481 * Initializations of stage3 00482 * -------------------------*/ 00483 00484 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00485 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00486 * .... 00487 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00488 * sum += x[srcALen-1] * y[srcBLen-1] 00489 */ 00490 00491 /* In this stage the MAC operations are decreased by 1 for every iteration. 00492 The blockSize3 variable holds the number of MAC operations performed */ 00493 00494 /* Working pointer of inputA */ 00495 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 00496 px = pSrc1; 00497 00498 /* Working pointer of inputB */ 00499 pSrc2 = pIn2 + (srcBLen - 1u); 00500 py = pSrc2; 00501 00502 /* ------------------- 00503 * Stage3 process 00504 * ------------------*/ 00505 00506 while(blockSize3 > 0u) 00507 { 00508 /* Accumulator is made zero for every iteration */ 00509 sum = 0; 00510 00511 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00512 k = blockSize3 >> 2u; 00513 00514 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00515 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00516 while(k > 0u) 00517 { 00518 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ 00519 sum = (q31_t) ((((q63_t) sum << 32) + 00520 ((q63_t) * px++ * (*py--))) >> 32); 00521 00522 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ 00523 sum = (q31_t) ((((q63_t) sum << 32) + 00524 ((q63_t) * px++ * (*py--))) >> 32); 00525 00526 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ 00527 sum = (q31_t) ((((q63_t) sum << 32) + 00528 ((q63_t) * px++ * (*py--))) >> 32); 00529 00530 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ 00531 sum = (q31_t) ((((q63_t) sum << 32) + 00532 ((q63_t) * px++ * (*py--))) >> 32); 00533 00534 /* Decrement the loop counter */ 00535 k--; 00536 } 00537 00538 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. 00539 ** No loop unrolling is used. */ 00540 k = blockSize3 % 0x4u; 00541 00542 while(k > 0u) 00543 { 00544 /* Perform the multiply-accumulate */ 00545 sum = (q31_t) ((((q63_t) sum << 32) + 00546 ((q63_t) * px++ * (*py--))) >> 32); 00547 00548 /* Decrement the loop counter */ 00549 k--; 00550 } 00551 00552 /* Store the result in the accumulator in the destination buffer. */ 00553 *pOut++ = sum << 1; 00554 00555 /* Update the inputA and inputB pointers for next MAC calculation */ 00556 px = ++pSrc1; 00557 py = pSrc2; 00558 00559 /* Decrement the loop counter */ 00560 blockSize3--; 00561 } 00562 00563 } 00564