00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 15. July 2011 00005 * $Revision: V1.0.10 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_correlate_q31.c 00009 * 00010 * Description: Correlation of Q31 sequences. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 00013 * 00014 * Version 1.0.10 2011/7/15 00015 * Big Endian support added and Merged M0 and M3/M4 Source code. 00016 * 00017 * Version 1.0.3 2010/11/29 00018 * Re-organized the CMSIS folders and updated documentation. 00019 * 00020 * Version 1.0.2 2010/11/11 00021 * Documentation updated. 00022 * 00023 * Version 1.0.1 2010/10/05 00024 * Production release and review comments incorporated. 00025 * 00026 * Version 1.0.0 2010/09/20 00027 * Production release and review comments incorporated 00028 * 00029 * Version 0.0.7 2010/06/10 00030 * Misra-C changes done 00031 * 00032 * -------------------------------------------------------------------- */ 00033 00034 #include "arm_math.h" 00035 00071 void arm_correlate_q31( 00072 q31_t * pSrcA, 00073 uint32_t srcALen, 00074 q31_t * pSrcB, 00075 uint32_t srcBLen, 00076 q31_t * pDst) 00077 { 00078 00079 #ifndef ARM_MATH_CM0 00080 00081 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00082 00083 q31_t *pIn1; /* inputA pointer */ 00084 q31_t *pIn2; /* inputB pointer */ 00085 q31_t *pOut = pDst; /* output pointer */ 00086 q31_t *px; /* Intermediate inputA pointer */ 00087 q31_t *py; /* Intermediate inputB pointer */ 00088 q31_t *pSrc1; /* Intermediate pointers */ 00089 q63_t sum, acc0, acc1, acc2, acc3; /* Accumulators */ 00090 q31_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */ 00091 uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */ 00092 int32_t inc = 1; /* Destination address modifier */ 00093 00094 00095 /* The algorithm implementation is based on the lengths of the inputs. */ 00096 /* srcB is always made to slide across srcA. */ 00097 /* So srcBLen is always considered as shorter or equal to srcALen */ 00098 /* But CORR(x, y) is reverse of CORR(y, x) */ 00099 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ 00100 /* and the destination pointer modifier, inc is set to -1 */ 00101 /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */ 00102 /* But to improve the performance, 00103 * we include zeroes in the output instead of zero padding either of the the inputs*/ 00104 /* If srcALen > srcBLen, 00105 * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */ 00106 /* If srcALen < srcBLen, 00107 * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */ 00108 if(srcALen >= srcBLen) 00109 { 00110 /* Initialization of inputA pointer */ 00111 pIn1 = (pSrcA); 00112 00113 /* Initialization of inputB pointer */ 00114 pIn2 = (pSrcB); 00115 00116 /* Number of output samples is calculated */ 00117 outBlockSize = (2u * srcALen) - 1u; 00118 00119 /* When srcALen > srcBLen, zero padding is done to srcB 00120 * to make their lengths equal. 00121 * Instead, (outBlockSize - (srcALen + srcBLen - 1)) 00122 * number of output samples are made zero */ 00123 j = outBlockSize - (srcALen + (srcBLen - 1u)); 00124 00125 /* Updating the pointer position to non zero value */ 00126 pOut += j; 00127 00128 } 00129 else 00130 { 00131 /* Initialization of inputA pointer */ 00132 pIn1 = (pSrcB); 00133 00134 /* Initialization of inputB pointer */ 00135 pIn2 = (pSrcA); 00136 00137 /* srcBLen is always considered as shorter or equal to srcALen */ 00138 j = srcBLen; 00139 srcBLen = srcALen; 00140 srcALen = j; 00141 00142 /* CORR(x, y) = Reverse order(CORR(y, x)) */ 00143 /* Hence set the destination pointer to point to the last output sample */ 00144 pOut = pDst + ((srcALen + srcBLen) - 2u); 00145 00146 /* Destination address modifier is set to -1 */ 00147 inc = -1; 00148 00149 } 00150 00151 /* The function is internally 00152 * divided into three parts according to the number of multiplications that has to be 00153 * taken place between inputA samples and inputB samples. In the first part of the 00154 * algorithm, the multiplications increase by one for every iteration. 00155 * In the second part of the algorithm, srcBLen number of multiplications are done. 00156 * In the third part of the algorithm, the multiplications decrease by one 00157 * for every iteration.*/ 00158 /* The algorithm is implemented in three stages. 00159 * The loop counters of each stage is initiated here. */ 00160 blockSize1 = srcBLen - 1u; 00161 blockSize2 = srcALen - (srcBLen - 1u); 00162 blockSize3 = blockSize1; 00163 00164 /* -------------------------- 00165 * Initializations of stage1 00166 * -------------------------*/ 00167 00168 /* sum = x[0] * y[srcBlen - 1] 00169 * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1] 00170 * .... 00171 * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1] 00172 */ 00173 00174 /* In this stage the MAC operations are increased by 1 for every iteration. 00175 The count variable holds the number of MAC operations performed */ 00176 count = 1u; 00177 00178 /* Working pointer of inputA */ 00179 px = pIn1; 00180 00181 /* Working pointer of inputB */ 00182 pSrc1 = pIn2 + (srcBLen - 1u); 00183 py = pSrc1; 00184 00185 /* ------------------------ 00186 * Stage1 process 00187 * ----------------------*/ 00188 00189 /* The first stage starts here */ 00190 while(blockSize1 > 0u) 00191 { 00192 /* Accumulator is made zero for every iteration */ 00193 sum = 0; 00194 00195 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00196 k = count >> 2; 00197 00198 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00199 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00200 while(k > 0u) 00201 { 00202 /* x[0] * y[srcBLen - 4] */ 00203 sum += (q63_t) * px++ * (*py++); 00204 /* x[1] * y[srcBLen - 3] */ 00205 sum += (q63_t) * px++ * (*py++); 00206 /* x[2] * y[srcBLen - 2] */ 00207 sum += (q63_t) * px++ * (*py++); 00208 /* x[3] * y[srcBLen - 1] */ 00209 sum += (q63_t) * px++ * (*py++); 00210 00211 /* Decrement the loop counter */ 00212 k--; 00213 } 00214 00215 /* If the count is not a multiple of 4, compute any remaining MACs here. 00216 ** No loop unrolling is used. */ 00217 k = count % 0x4u; 00218 00219 while(k > 0u) 00220 { 00221 /* Perform the multiply-accumulates */ 00222 /* x[0] * y[srcBLen - 1] */ 00223 sum += (q63_t) * px++ * (*py++); 00224 00225 /* Decrement the loop counter */ 00226 k--; 00227 } 00228 00229 /* Store the result in the accumulator in the destination buffer. */ 00230 *pOut = (q31_t) (sum >> 31); 00231 /* Destination pointer is updated according to the address modifier, inc */ 00232 pOut += inc; 00233 00234 /* Update the inputA and inputB pointers for next MAC calculation */ 00235 py = pSrc1 - count; 00236 px = pIn1; 00237 00238 /* Increment the MAC count */ 00239 count++; 00240 00241 /* Decrement the loop counter */ 00242 blockSize1--; 00243 } 00244 00245 /* -------------------------- 00246 * Initializations of stage2 00247 * ------------------------*/ 00248 00249 /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1] 00250 * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1] 00251 * .... 00252 * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00253 */ 00254 00255 /* Working pointer of inputA */ 00256 px = pIn1; 00257 00258 /* Working pointer of inputB */ 00259 py = pIn2; 00260 00261 /* count is index by which the pointer pIn1 to be incremented */ 00262 count = 1u; 00263 00264 /* ------------------- 00265 * Stage2 process 00266 * ------------------*/ 00267 00268 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00269 * So, to loop unroll over blockSize2, 00270 * srcBLen should be greater than or equal to 4 */ 00271 if(srcBLen >= 4u) 00272 { 00273 /* Loop unroll over blockSize2, by 4 */ 00274 blkCnt = blockSize2 >> 2u; 00275 00276 while(blkCnt > 0u) 00277 { 00278 /* Set all accumulators to zero */ 00279 acc0 = 0; 00280 acc1 = 0; 00281 acc2 = 0; 00282 acc3 = 0; 00283 00284 /* read x[0], x[1], x[2] samples */ 00285 x0 = *(px++); 00286 x1 = *(px++); 00287 x2 = *(px++); 00288 00289 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00290 k = srcBLen >> 2u; 00291 00292 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00293 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00294 do 00295 { 00296 /* Read y[0] sample */ 00297 c0 = *(py++); 00298 00299 /* Read x[3] sample */ 00300 x3 = *(px++); 00301 00302 /* Perform the multiply-accumulate */ 00303 /* acc0 += x[0] * y[0] */ 00304 acc0 += ((q63_t) x0 * c0); 00305 /* acc1 += x[1] * y[0] */ 00306 acc1 += ((q63_t) x1 * c0); 00307 /* acc2 += x[2] * y[0] */ 00308 acc2 += ((q63_t) x2 * c0); 00309 /* acc3 += x[3] * y[0] */ 00310 acc3 += ((q63_t) x3 * c0); 00311 00312 /* Read y[1] sample */ 00313 c0 = *(py++); 00314 00315 /* Read x[4] sample */ 00316 x0 = *(px++); 00317 00318 /* Perform the multiply-accumulates */ 00319 /* acc0 += x[1] * y[1] */ 00320 acc0 += ((q63_t) x1 * c0); 00321 /* acc1 += x[2] * y[1] */ 00322 acc1 += ((q63_t) x2 * c0); 00323 /* acc2 += x[3] * y[1] */ 00324 acc2 += ((q63_t) x3 * c0); 00325 /* acc3 += x[4] * y[1] */ 00326 acc3 += ((q63_t) x0 * c0); 00327 /* Read y[2] sample */ 00328 c0 = *(py++); 00329 00330 /* Read x[5] sample */ 00331 x1 = *(px++); 00332 00333 /* Perform the multiply-accumulates */ 00334 /* acc0 += x[2] * y[2] */ 00335 acc0 += ((q63_t) x2 * c0); 00336 /* acc1 += x[3] * y[2] */ 00337 acc1 += ((q63_t) x3 * c0); 00338 /* acc2 += x[4] * y[2] */ 00339 acc2 += ((q63_t) x0 * c0); 00340 /* acc3 += x[5] * y[2] */ 00341 acc3 += ((q63_t) x1 * c0); 00342 00343 /* Read y[3] sample */ 00344 c0 = *(py++); 00345 00346 /* Read x[6] sample */ 00347 x2 = *(px++); 00348 00349 /* Perform the multiply-accumulates */ 00350 /* acc0 += x[3] * y[3] */ 00351 acc0 += ((q63_t) x3 * c0); 00352 /* acc1 += x[4] * y[3] */ 00353 acc1 += ((q63_t) x0 * c0); 00354 /* acc2 += x[5] * y[3] */ 00355 acc2 += ((q63_t) x1 * c0); 00356 /* acc3 += x[6] * y[3] */ 00357 acc3 += ((q63_t) x2 * c0); 00358 00359 00360 } while(--k); 00361 00362 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00363 ** No loop unrolling is used. */ 00364 k = srcBLen % 0x4u; 00365 00366 while(k > 0u) 00367 { 00368 /* Read y[4] sample */ 00369 c0 = *(py++); 00370 00371 /* Read x[7] sample */ 00372 x3 = *(px++); 00373 00374 /* Perform the multiply-accumulates */ 00375 /* acc0 += x[4] * y[4] */ 00376 acc0 += ((q63_t) x0 * c0); 00377 /* acc1 += x[5] * y[4] */ 00378 acc1 += ((q63_t) x1 * c0); 00379 /* acc2 += x[6] * y[4] */ 00380 acc2 += ((q63_t) x2 * c0); 00381 /* acc3 += x[7] * y[4] */ 00382 acc3 += ((q63_t) x3 * c0); 00383 00384 /* Reuse the present samples for the next MAC */ 00385 x0 = x1; 00386 x1 = x2; 00387 x2 = x3; 00388 00389 /* Decrement the loop counter */ 00390 k--; 00391 } 00392 00393 /* Store the result in the accumulator in the destination buffer. */ 00394 *pOut = (q31_t) (acc0 >> 31); 00395 /* Destination pointer is updated according to the address modifier, inc */ 00396 pOut += inc; 00397 00398 *pOut = (q31_t) (acc1 >> 31); 00399 pOut += inc; 00400 00401 *pOut = (q31_t) (acc2 >> 31); 00402 pOut += inc; 00403 00404 *pOut = (q31_t) (acc3 >> 31); 00405 pOut += inc; 00406 00407 /* Update the inputA and inputB pointers for next MAC calculation */ 00408 px = pIn1 + (count * 4u); 00409 py = pIn2; 00410 00411 /* Increment the pointer pIn1 index, count by 1 */ 00412 count++; 00413 00414 /* Decrement the loop counter */ 00415 blkCnt--; 00416 } 00417 00418 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00419 ** No loop unrolling is used. */ 00420 blkCnt = blockSize2 % 0x4u; 00421 00422 while(blkCnt > 0u) 00423 { 00424 /* Accumulator is made zero for every iteration */ 00425 sum = 0; 00426 00427 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00428 k = srcBLen >> 2u; 00429 00430 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00431 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00432 while(k > 0u) 00433 { 00434 /* Perform the multiply-accumulates */ 00435 sum += (q63_t) * px++ * (*py++); 00436 sum += (q63_t) * px++ * (*py++); 00437 sum += (q63_t) * px++ * (*py++); 00438 sum += (q63_t) * px++ * (*py++); 00439 00440 /* Decrement the loop counter */ 00441 k--; 00442 } 00443 00444 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00445 ** No loop unrolling is used. */ 00446 k = srcBLen % 0x4u; 00447 00448 while(k > 0u) 00449 { 00450 /* Perform the multiply-accumulate */ 00451 sum += (q63_t) * px++ * (*py++); 00452 00453 /* Decrement the loop counter */ 00454 k--; 00455 } 00456 00457 /* Store the result in the accumulator in the destination buffer. */ 00458 *pOut = (q31_t) (sum >> 31); 00459 /* Destination pointer is updated according to the address modifier, inc */ 00460 pOut += inc; 00461 00462 /* Update the inputA and inputB pointers for next MAC calculation */ 00463 px = pIn1 + count; 00464 py = pIn2; 00465 00466 /* Increment the MAC count */ 00467 count++; 00468 00469 /* Decrement the loop counter */ 00470 blkCnt--; 00471 } 00472 } 00473 else 00474 { 00475 /* If the srcBLen is not a multiple of 4, 00476 * the blockSize2 loop cannot be unrolled by 4 */ 00477 blkCnt = blockSize2; 00478 00479 while(blkCnt > 0u) 00480 { 00481 /* Accumulator is made zero for every iteration */ 00482 sum = 0; 00483 00484 /* Loop over srcBLen */ 00485 k = srcBLen; 00486 00487 while(k > 0u) 00488 { 00489 /* Perform the multiply-accumulate */ 00490 sum += (q63_t) * px++ * (*py++); 00491 00492 /* Decrement the loop counter */ 00493 k--; 00494 } 00495 00496 /* Store the result in the accumulator in the destination buffer. */ 00497 *pOut = (q31_t) (sum >> 31); 00498 /* Destination pointer is updated according to the address modifier, inc */ 00499 pOut += inc; 00500 00501 /* Update the inputA and inputB pointers for next MAC calculation */ 00502 px = pIn1 + count; 00503 py = pIn2; 00504 00505 /* Increment the MAC count */ 00506 count++; 00507 00508 /* Decrement the loop counter */ 00509 blkCnt--; 00510 } 00511 } 00512 00513 /* -------------------------- 00514 * Initializations of stage3 00515 * -------------------------*/ 00516 00517 /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00518 * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00519 * .... 00520 * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1] 00521 * sum += x[srcALen-1] * y[0] 00522 */ 00523 00524 /* In this stage the MAC operations are decreased by 1 for every iteration. 00525 The count variable holds the number of MAC operations performed */ 00526 count = srcBLen - 1u; 00527 00528 /* Working pointer of inputA */ 00529 pSrc1 = pIn1 + (srcALen - (srcBLen - 1u)); 00530 px = pSrc1; 00531 00532 /* Working pointer of inputB */ 00533 py = pIn2; 00534 00535 /* ------------------- 00536 * Stage3 process 00537 * ------------------*/ 00538 00539 while(blockSize3 > 0u) 00540 { 00541 /* Accumulator is made zero for every iteration */ 00542 sum = 0; 00543 00544 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00545 k = count >> 2u; 00546 00547 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00548 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00549 while(k > 0u) 00550 { 00551 /* Perform the multiply-accumulates */ 00552 /* sum += x[srcALen - srcBLen + 4] * y[3] */ 00553 sum += (q63_t) * px++ * (*py++); 00554 /* sum += x[srcALen - srcBLen + 3] * y[2] */ 00555 sum += (q63_t) * px++ * (*py++); 00556 /* sum += x[srcALen - srcBLen + 2] * y[1] */ 00557 sum += (q63_t) * px++ * (*py++); 00558 /* sum += x[srcALen - srcBLen + 1] * y[0] */ 00559 sum += (q63_t) * px++ * (*py++); 00560 00561 /* Decrement the loop counter */ 00562 k--; 00563 } 00564 00565 /* If the count is not a multiple of 4, compute any remaining MACs here. 00566 ** No loop unrolling is used. */ 00567 k = count % 0x4u; 00568 00569 while(k > 0u) 00570 { 00571 /* Perform the multiply-accumulates */ 00572 sum += (q63_t) * px++ * (*py++); 00573 00574 /* Decrement the loop counter */ 00575 k--; 00576 } 00577 00578 /* Store the result in the accumulator in the destination buffer. */ 00579 *pOut = (q31_t) (sum >> 31); 00580 /* Destination pointer is updated according to the address modifier, inc */ 00581 pOut += inc; 00582 00583 /* Update the inputA and inputB pointers for next MAC calculation */ 00584 px = ++pSrc1; 00585 py = pIn2; 00586 00587 /* Decrement the MAC count */ 00588 count--; 00589 00590 /* Decrement the loop counter */ 00591 blockSize3--; 00592 } 00593 00594 #else 00595 00596 /* Run the below code for Cortex-M0 */ 00597 00598 q31_t *pIn1 = pSrcA; /* inputA pointer */ 00599 q31_t *pIn2 = pSrcB + (srcBLen - 1u); /* inputB pointer */ 00600 q63_t sum; /* Accumulators */ 00601 uint32_t i = 0u, j; /* loop counters */ 00602 uint32_t inv = 0u; /* Reverse order flag */ 00603 uint32_t tot = 0u; /* Length */ 00604 00605 /* The algorithm implementation is based on the lengths of the inputs. */ 00606 /* srcB is always made to slide across srcA. */ 00607 /* So srcBLen is always considered as shorter or equal to srcALen */ 00608 /* But CORR(x, y) is reverse of CORR(y, x) */ 00609 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ 00610 /* and a varaible, inv is set to 1 */ 00611 /* If lengths are not equal then zero pad has to be done to make the two 00612 * inputs of same length. But to improve the performance, we include zeroes 00613 * in the output instead of zero padding either of the the inputs*/ 00614 /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the 00615 * starting of the output buffer */ 00616 /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the 00617 * ending of the output buffer */ 00618 /* Once the zero padding is done the remaining of the output is calcualted 00619 * using convolution but with the shorter signal time shifted. */ 00620 00621 /* Calculate the length of the remaining sequence */ 00622 tot = ((srcALen + srcBLen) - 2u); 00623 00624 if(srcALen > srcBLen) 00625 { 00626 /* Calculating the number of zeros to be padded to the output */ 00627 j = srcALen - srcBLen; 00628 00629 /* Initialise the pointer after zero padding */ 00630 pDst += j; 00631 } 00632 00633 else if(srcALen < srcBLen) 00634 { 00635 /* Initialization to inputB pointer */ 00636 pIn1 = pSrcB; 00637 00638 /* Initialization to the end of inputA pointer */ 00639 pIn2 = pSrcA + (srcALen - 1u); 00640 00641 /* Initialisation of the pointer after zero padding */ 00642 pDst = pDst + tot; 00643 00644 /* Swapping the lengths */ 00645 j = srcALen; 00646 srcALen = srcBLen; 00647 srcBLen = j; 00648 00649 /* Setting the reverse flag */ 00650 inv = 1; 00651 00652 } 00653 00654 /* Loop to calculate convolution for output length number of times */ 00655 for (i = 0u; i <= tot; i++) 00656 { 00657 /* Initialize sum with zero to carry on MAC operations */ 00658 sum = 0; 00659 00660 /* Loop to perform MAC operations according to convolution equation */ 00661 for (j = 0u; j <= i; j++) 00662 { 00663 /* Check the array limitations */ 00664 if((((i - j) < srcBLen) && (j < srcALen))) 00665 { 00666 /* z[i] += x[i-j] * y[j] */ 00667 sum += ((q63_t) pIn1[j] * pIn2[-((int32_t) i - j)]); 00668 } 00669 } 00670 /* Store the output in the destination buffer */ 00671 if(inv == 1) 00672 *pDst-- = (q31_t) (sum >> 31u); 00673 else 00674 *pDst++ = (q31_t) (sum >> 31u); 00675 } 00676 00677 #endif /* #ifndef ARM_MATH_CM0 */ 00678 00679 } 00680