00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 15. July 2011 00005 * $Revision: V1.0.10 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_q31.c 00009 * 00010 * Description: Convolution of Q31 sequences. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 00013 * 00014 * Version 1.0.10 2011/7/15 00015 * Big Endian support added and Merged M0 and M3/M4 Source code. 00016 * 00017 * Version 1.0.3 2010/11/29 00018 * Re-organized the CMSIS folders and updated documentation. 00019 * 00020 * Version 1.0.2 2010/11/11 00021 * Documentation updated. 00022 * 00023 * Version 1.0.1 2010/10/05 00024 * Production release and review comments incorporated. 00025 * 00026 * Version 1.0.0 2010/09/20 00027 * Production release and review comments incorporated 00028 * 00029 * Version 0.0.7 2010/06/10 00030 * Misra-C changes done 00031 * 00032 * -------------------------------------------------------------------- */ 00033 00034 #include "arm_math.h" 00035 00071 void arm_conv_q31( 00072 q31_t * pSrcA, 00073 uint32_t srcALen, 00074 q31_t * pSrcB, 00075 uint32_t srcBLen, 00076 q31_t * pDst) 00077 { 00078 00079 00080 #ifndef ARM_MATH_CM0 00081 00082 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00083 00084 q31_t *pIn1; /* inputA pointer */ 00085 q31_t *pIn2; /* inputB pointer */ 00086 q31_t *pOut = pDst; /* output pointer */ 00087 q31_t *px; /* Intermediate inputA pointer */ 00088 q31_t *py; /* Intermediate inputB pointer */ 00089 q31_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00090 q63_t sum; /* Accumulator */ 00091 q63_t acc0, acc1, acc2, acc3; /* Accumulator */ 00092 q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */ 00093 uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counter */ 00094 00095 00096 /* The algorithm implementation is based on the lengths of the inputs. */ 00097 /* srcB is always made to slide across srcA. */ 00098 /* So srcBLen is always considered as shorter or equal to srcALen */ 00099 if(srcALen >= srcBLen) 00100 { 00101 /* Initialization of inputA pointer */ 00102 pIn1 = pSrcA; 00103 00104 /* Initialization of inputB pointer */ 00105 pIn2 = pSrcB; 00106 } 00107 else 00108 { 00109 /* Initialization of inputA pointer */ 00110 pIn1 = (q31_t *) pSrcB; 00111 00112 /* Initialization of inputB pointer */ 00113 pIn2 = (q31_t *) pSrcA; 00114 00115 /* srcBLen is always considered as shorter or equal to srcALen */ 00116 j = srcBLen; 00117 srcBLen = srcALen; 00118 srcALen = j; 00119 } 00120 00121 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00122 /* The function is internally 00123 * divided into three stages according to the number of multiplications that has to be 00124 * taken place between inputA samples and inputB samples. In the first stage of the 00125 * algorithm, the multiplications increase by one for every iteration. 00126 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00127 * In the third stage of the algorithm, the multiplications decrease by one 00128 * for every iteration. */ 00129 00130 /* The algorithm is implemented in three stages. 00131 The loop counters of each stage is initiated here. */ 00132 blockSize1 = srcBLen - 1u; 00133 blockSize2 = srcALen - (srcBLen - 1u); 00134 blockSize3 = blockSize1; 00135 00136 /* -------------------------- 00137 * Initializations of stage1 00138 * -------------------------*/ 00139 00140 /* sum = x[0] * y[0] 00141 * sum = x[0] * y[1] + x[1] * y[0] 00142 * .... 00143 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00144 */ 00145 00146 /* In this stage the MAC operations are increased by 1 for every iteration. 00147 The count variable holds the number of MAC operations performed */ 00148 count = 1u; 00149 00150 /* Working pointer of inputA */ 00151 px = pIn1; 00152 00153 /* Working pointer of inputB */ 00154 py = pIn2; 00155 00156 00157 /* ------------------------ 00158 * Stage1 process 00159 * ----------------------*/ 00160 00161 /* The first stage starts here */ 00162 while(blockSize1 > 0u) 00163 { 00164 /* Accumulator is made zero for every iteration */ 00165 sum = 0; 00166 00167 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00168 k = count >> 2u; 00169 00170 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00171 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00172 while(k > 0u) 00173 { 00174 /* x[0] * y[srcBLen - 1] */ 00175 sum += (q63_t) * px++ * (*py--); 00176 /* x[1] * y[srcBLen - 2] */ 00177 sum += (q63_t) * px++ * (*py--); 00178 /* x[2] * y[srcBLen - 3] */ 00179 sum += (q63_t) * px++ * (*py--); 00180 /* x[3] * y[srcBLen - 4] */ 00181 sum += (q63_t) * px++ * (*py--); 00182 00183 /* Decrement the loop counter */ 00184 k--; 00185 } 00186 00187 /* If the count is not a multiple of 4, compute any remaining MACs here. 00188 ** No loop unrolling is used. */ 00189 k = count % 0x4u; 00190 00191 while(k > 0u) 00192 { 00193 /* Perform the multiply-accumulate */ 00194 sum += (q63_t) * px++ * (*py--); 00195 00196 /* Decrement the loop counter */ 00197 k--; 00198 } 00199 00200 /* Store the result in the accumulator in the destination buffer. */ 00201 *pOut++ = (q31_t) (sum >> 31); 00202 00203 /* Update the inputA and inputB pointers for next MAC calculation */ 00204 py = pIn2 + count; 00205 px = pIn1; 00206 00207 /* Increment the MAC count */ 00208 count++; 00209 00210 /* Decrement the loop counter */ 00211 blockSize1--; 00212 } 00213 00214 /* -------------------------- 00215 * Initializations of stage2 00216 * ------------------------*/ 00217 00218 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00219 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00220 * .... 00221 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00222 */ 00223 00224 /* Working pointer of inputA */ 00225 px = pIn1; 00226 00227 /* Working pointer of inputB */ 00228 pSrc2 = pIn2 + (srcBLen - 1u); 00229 py = pSrc2; 00230 00231 /* count is index by which the pointer pIn1 to be incremented */ 00232 count = 1u; 00233 00234 /* ------------------- 00235 * Stage2 process 00236 * ------------------*/ 00237 00238 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00239 * So, to loop unroll over blockSize2, 00240 * srcBLen should be greater than or equal to 4 */ 00241 if(srcBLen >= 4u) 00242 { 00243 /* Loop unroll over blockSize2, by 4 */ 00244 blkCnt = blockSize2 >> 2u; 00245 00246 while(blkCnt > 0u) 00247 { 00248 /* Set all accumulators to zero */ 00249 acc0 = 0; 00250 acc1 = 0; 00251 acc2 = 0; 00252 acc3 = 0; 00253 00254 /* read x[0], x[1], x[2] samples */ 00255 x0 = *(px++); 00256 x1 = *(px++); 00257 x2 = *(px++); 00258 00259 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00260 k = srcBLen >> 2u; 00261 00262 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00263 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00264 do 00265 { 00266 /* Read y[srcBLen - 1] sample */ 00267 c0 = *(py--); 00268 00269 /* Read x[3] sample */ 00270 x3 = *(px++); 00271 00272 /* Perform the multiply-accumulates */ 00273 /* acc0 += x[0] * y[srcBLen - 1] */ 00274 acc0 += ((q63_t) x0 * c0); 00275 /* acc1 += x[1] * y[srcBLen - 1] */ 00276 acc1 += ((q63_t) x1 * c0); 00277 /* acc2 += x[2] * y[srcBLen - 1] */ 00278 acc2 += ((q63_t) x2 * c0); 00279 /* acc3 += x[3] * y[srcBLen - 1] */ 00280 acc3 += ((q63_t) x3 * c0); 00281 00282 /* Read y[srcBLen - 2] sample */ 00283 c0 = *(py--); 00284 00285 /* Read x[4] sample */ 00286 x0 = *(px++); 00287 00288 /* Perform the multiply-accumulate */ 00289 /* acc0 += x[1] * y[srcBLen - 2] */ 00290 acc0 += ((q63_t) x1 * c0); 00291 /* acc1 += x[2] * y[srcBLen - 2] */ 00292 acc1 += ((q63_t) x2 * c0); 00293 /* acc2 += x[3] * y[srcBLen - 2] */ 00294 acc2 += ((q63_t) x3 * c0); 00295 /* acc3 += x[4] * y[srcBLen - 2] */ 00296 acc3 += ((q63_t) x0 * c0); 00297 00298 /* Read y[srcBLen - 3] sample */ 00299 c0 = *(py--); 00300 00301 /* Read x[5] sample */ 00302 x1 = *(px++); 00303 00304 /* Perform the multiply-accumulates */ 00305 /* acc0 += x[2] * y[srcBLen - 3] */ 00306 acc0 += ((q63_t) x2 * c0); 00307 /* acc1 += x[3] * y[srcBLen - 2] */ 00308 acc1 += ((q63_t) x3 * c0); 00309 /* acc2 += x[4] * y[srcBLen - 2] */ 00310 acc2 += ((q63_t) x0 * c0); 00311 /* acc3 += x[5] * y[srcBLen - 2] */ 00312 acc3 += ((q63_t) x1 * c0); 00313 00314 /* Read y[srcBLen - 4] sample */ 00315 c0 = *(py--); 00316 00317 /* Read x[6] sample */ 00318 x2 = *(px++); 00319 00320 /* Perform the multiply-accumulates */ 00321 /* acc0 += x[3] * y[srcBLen - 4] */ 00322 acc0 += ((q63_t) x3 * c0); 00323 /* acc1 += x[4] * y[srcBLen - 4] */ 00324 acc1 += ((q63_t) x0 * c0); 00325 /* acc2 += x[5] * y[srcBLen - 4] */ 00326 acc2 += ((q63_t) x1 * c0); 00327 /* acc3 += x[6] * y[srcBLen - 4] */ 00328 acc3 += ((q63_t) x2 * c0); 00329 00330 } while(--k); 00331 00332 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00333 ** No loop unrolling is used. */ 00334 k = srcBLen % 0x4u; 00335 00336 while(k > 0u) 00337 { 00338 /* Read y[srcBLen - 5] sample */ 00339 c0 = *(py--); 00340 00341 /* Read x[7] sample */ 00342 x3 = *(px++); 00343 00344 /* Perform the multiply-accumulates */ 00345 /* acc0 += x[4] * y[srcBLen - 5] */ 00346 acc0 += ((q63_t) x0 * c0); 00347 /* acc1 += x[5] * y[srcBLen - 5] */ 00348 acc1 += ((q63_t) x1 * c0); 00349 /* acc2 += x[6] * y[srcBLen - 5] */ 00350 acc2 += ((q63_t) x2 * c0); 00351 /* acc3 += x[7] * y[srcBLen - 5] */ 00352 acc3 += ((q63_t) x3 * c0); 00353 00354 /* Reuse the present samples for the next MAC */ 00355 x0 = x1; 00356 x1 = x2; 00357 x2 = x3; 00358 00359 /* Decrement the loop counter */ 00360 k--; 00361 } 00362 00363 /* Store the results in the accumulators in the destination buffer. */ 00364 *pOut++ = (q31_t) (acc0 >> 31); 00365 *pOut++ = (q31_t) (acc1 >> 31); 00366 *pOut++ = (q31_t) (acc2 >> 31); 00367 *pOut++ = (q31_t) (acc3 >> 31); 00368 00369 /* Update the inputA and inputB pointers for next MAC calculation */ 00370 px = pIn1 + (count * 4u); 00371 py = pSrc2; 00372 00373 /* Increment the pointer pIn1 index, count by 1 */ 00374 count++; 00375 00376 /* Decrement the loop counter */ 00377 blkCnt--; 00378 } 00379 00380 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00381 ** No loop unrolling is used. */ 00382 blkCnt = blockSize2 % 0x4u; 00383 00384 while(blkCnt > 0u) 00385 { 00386 /* Accumulator is made zero for every iteration */ 00387 sum = 0; 00388 00389 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00390 k = srcBLen >> 2u; 00391 00392 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00393 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00394 while(k > 0u) 00395 { 00396 /* Perform the multiply-accumulates */ 00397 sum += (q63_t) * px++ * (*py--); 00398 sum += (q63_t) * px++ * (*py--); 00399 sum += (q63_t) * px++ * (*py--); 00400 sum += (q63_t) * px++ * (*py--); 00401 00402 /* Decrement the loop counter */ 00403 k--; 00404 } 00405 00406 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00407 ** No loop unrolling is used. */ 00408 k = srcBLen % 0x4u; 00409 00410 while(k > 0u) 00411 { 00412 /* Perform the multiply-accumulate */ 00413 sum += (q63_t) * px++ * (*py--); 00414 00415 /* Decrement the loop counter */ 00416 k--; 00417 } 00418 00419 /* Store the result in the accumulator in the destination buffer. */ 00420 *pOut++ = (q31_t) (sum >> 31); 00421 00422 /* Update the inputA and inputB pointers for next MAC calculation */ 00423 px = pIn1 + count; 00424 py = pSrc2; 00425 00426 /* Increment the MAC count */ 00427 count++; 00428 00429 /* Decrement the loop counter */ 00430 blkCnt--; 00431 } 00432 } 00433 else 00434 { 00435 /* If the srcBLen is not a multiple of 4, 00436 * the blockSize2 loop cannot be unrolled by 4 */ 00437 blkCnt = blockSize2; 00438 00439 while(blkCnt > 0u) 00440 { 00441 /* Accumulator is made zero for every iteration */ 00442 sum = 0; 00443 00444 /* srcBLen number of MACS should be performed */ 00445 k = srcBLen; 00446 00447 while(k > 0u) 00448 { 00449 /* Perform the multiply-accumulate */ 00450 sum += (q63_t) * px++ * (*py--); 00451 00452 /* Decrement the loop counter */ 00453 k--; 00454 } 00455 00456 /* Store the result in the accumulator in the destination buffer. */ 00457 *pOut++ = (q31_t) (sum >> 31); 00458 00459 /* Update the inputA and inputB pointers for next MAC calculation */ 00460 px = pIn1 + count; 00461 py = pSrc2; 00462 00463 /* Increment the MAC count */ 00464 count++; 00465 00466 /* Decrement the loop counter */ 00467 blkCnt--; 00468 } 00469 } 00470 00471 00472 /* -------------------------- 00473 * Initializations of stage3 00474 * -------------------------*/ 00475 00476 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00477 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00478 * .... 00479 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00480 * sum += x[srcALen-1] * y[srcBLen-1] 00481 */ 00482 00483 /* In this stage the MAC operations are decreased by 1 for every iteration. 00484 The blockSize3 variable holds the number of MAC operations performed */ 00485 00486 /* Working pointer of inputA */ 00487 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 00488 px = pSrc1; 00489 00490 /* Working pointer of inputB */ 00491 pSrc2 = pIn2 + (srcBLen - 1u); 00492 py = pSrc2; 00493 00494 /* ------------------- 00495 * Stage3 process 00496 * ------------------*/ 00497 00498 while(blockSize3 > 0u) 00499 { 00500 /* Accumulator is made zero for every iteration */ 00501 sum = 0; 00502 00503 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00504 k = blockSize3 >> 2u; 00505 00506 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00507 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00508 while(k > 0u) 00509 { 00510 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ 00511 sum += (q63_t) * px++ * (*py--); 00512 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ 00513 sum += (q63_t) * px++ * (*py--); 00514 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ 00515 sum += (q63_t) * px++ * (*py--); 00516 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ 00517 sum += (q63_t) * px++ * (*py--); 00518 00519 /* Decrement the loop counter */ 00520 k--; 00521 } 00522 00523 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. 00524 ** No loop unrolling is used. */ 00525 k = blockSize3 % 0x4u; 00526 00527 while(k > 0u) 00528 { 00529 /* Perform the multiply-accumulate */ 00530 sum += (q63_t) * px++ * (*py--); 00531 00532 /* Decrement the loop counter */ 00533 k--; 00534 } 00535 00536 /* Store the result in the accumulator in the destination buffer. */ 00537 *pOut++ = (q31_t) (sum >> 31); 00538 00539 /* Update the inputA and inputB pointers for next MAC calculation */ 00540 px = ++pSrc1; 00541 py = pSrc2; 00542 00543 /* Decrement the loop counter */ 00544 blockSize3--; 00545 } 00546 00547 #else 00548 00549 /* Run the below code for Cortex-M0 */ 00550 00551 q31_t *pIn1 = pSrcA; /* input pointer */ 00552 q31_t *pIn2 = pSrcB; /* coefficient pointer */ 00553 q63_t sum; /* Accumulator */ 00554 uint32_t i, j; /* loop counter */ 00555 00556 /* Loop to calculate output of convolution for output length number of times */ 00557 for (i = 0; i < (srcALen + srcBLen - 1); i++) 00558 { 00559 /* Initialize sum with zero to carry on MAC operations */ 00560 sum = 0; 00561 00562 /* Loop to perform MAC operations according to convolution equation */ 00563 for (j = 0; j <= i; j++) 00564 { 00565 /* Check the array limitations */ 00566 if(((i - j) < srcBLen) && (j < srcALen)) 00567 { 00568 /* z[i] += x[i-j] * y[j] */ 00569 sum += ((q63_t) pIn1[j] * (pIn2[i - j])); 00570 } 00571 } 00572 00573 /* Store the output in the destination buffer */ 00574 pDst[i] = (q31_t) (sum >> 31u); 00575 } 00576 00577 #endif /* #ifndef ARM_MATH_CM0 */ 00578 00579 } 00580