00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 15. July 2011 00005 * $Revision: V1.0.10 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_partial_q31.c 00009 * 00010 * Description: Partial convolution of Q31 sequences. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 00013 * 00014 * Version 1.0.10 2011/7/15 00015 * Big Endian support added and Merged M0 and M3/M4 Source code. 00016 * 00017 * Version 1.0.3 2010/11/29 00018 * Re-organized the CMSIS folders and updated documentation. 00019 * 00020 * Version 1.0.2 2010/11/11 00021 * Documentation updated. 00022 * 00023 * Version 1.0.1 2010/10/05 00024 * Production release and review comments incorporated. 00025 * 00026 * Version 1.0.0 2010/09/20 00027 * Production release and review comments incorporated 00028 * 00029 * Version 0.0.7 2010/06/10 00030 * Misra-C changes done 00031 * 00032 * -------------------------------------------------------------------- */ 00033 00034 #include "arm_math.h" 00035 00059 arm_status arm_conv_partial_q31( 00060 q31_t * pSrcA, 00061 uint32_t srcALen, 00062 q31_t * pSrcB, 00063 uint32_t srcBLen, 00064 q31_t * pDst, 00065 uint32_t firstIndex, 00066 uint32_t numPoints) 00067 { 00068 00069 00070 #ifndef ARM_MATH_CM0 00071 00072 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00073 00074 q31_t *pIn1; /* inputA pointer */ 00075 q31_t *pIn2; /* inputB pointer */ 00076 q31_t *pOut = pDst; /* output pointer */ 00077 q31_t *px; /* Intermediate inputA pointer */ 00078 q31_t *py; /* Intermediate inputB pointer */ 00079 q31_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00080 q63_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00081 q31_t x0, x1, x2, x3, c0; 00082 uint32_t j, k, count, check, blkCnt; 00083 int32_t blockSize1, blockSize2, blockSize3; /* loop counter */ 00084 arm_status status; /* status of Partial convolution */ 00085 00086 00087 /* Check for range of output samples to be calculated */ 00088 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00089 { 00090 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00091 status = ARM_MATH_ARGUMENT_ERROR; 00092 } 00093 else 00094 { 00095 00096 /* The algorithm implementation is based on the lengths of the inputs. */ 00097 /* srcB is always made to slide across srcA. */ 00098 /* So srcBLen is always considered as shorter or equal to srcALen */ 00099 if(srcALen >= srcBLen) 00100 { 00101 /* Initialization of inputA pointer */ 00102 pIn1 = pSrcA; 00103 00104 /* Initialization of inputB pointer */ 00105 pIn2 = pSrcB; 00106 } 00107 else 00108 { 00109 /* Initialization of inputA pointer */ 00110 pIn1 = pSrcB; 00111 00112 /* Initialization of inputB pointer */ 00113 pIn2 = pSrcA; 00114 00115 /* srcBLen is always considered as shorter or equal to srcALen */ 00116 j = srcBLen; 00117 srcBLen = srcALen; 00118 srcALen = j; 00119 } 00120 00121 /* Conditions to check which loopCounter holds 00122 * the first and last indices of the output samples to be calculated. */ 00123 check = firstIndex + numPoints; 00124 blockSize3 = ((int32_t) check - (int32_t) srcALen); 00125 blockSize3 = (blockSize3 > 0) ? blockSize3 : 0; 00126 blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex); 00127 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 : 00128 (int32_t) numPoints) : 0; 00129 blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + 00130 (int32_t) firstIndex); 00131 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; 00132 00133 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00134 /* The function is internally 00135 * divided into three stages according to the number of multiplications that has to be 00136 * taken place between inputA samples and inputB samples. In the first stage of the 00137 * algorithm, the multiplications increase by one for every iteration. 00138 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00139 * In the third stage of the algorithm, the multiplications decrease by one 00140 * for every iteration. */ 00141 00142 /* Set the output pointer to point to the firstIndex 00143 * of the output sample to be calculated. */ 00144 pOut = pDst + firstIndex; 00145 00146 /* -------------------------- 00147 * Initializations of stage1 00148 * -------------------------*/ 00149 00150 /* sum = x[0] * y[0] 00151 * sum = x[0] * y[1] + x[1] * y[0] 00152 * .... 00153 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00154 */ 00155 00156 /* In this stage the MAC operations are increased by 1 for every iteration. 00157 The count variable holds the number of MAC operations performed. 00158 Since the partial convolution starts from firstIndex 00159 Number of Macs to be performed is firstIndex + 1 */ 00160 count = 1u + firstIndex; 00161 00162 /* Working pointer of inputA */ 00163 px = pIn1; 00164 00165 /* Working pointer of inputB */ 00166 pSrc2 = pIn2 + firstIndex; 00167 py = pSrc2; 00168 00169 /* ------------------------ 00170 * Stage1 process 00171 * ----------------------*/ 00172 00173 /* The first loop starts here */ 00174 while(blockSize1 > 0) 00175 { 00176 /* Accumulator is made zero for every iteration */ 00177 sum = 0; 00178 00179 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00180 k = count >> 2u; 00181 00182 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00183 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00184 while(k > 0u) 00185 { 00186 /* x[0] * y[srcBLen - 1] */ 00187 sum += (q63_t) * px++ * (*py--); 00188 /* x[1] * y[srcBLen - 2] */ 00189 sum += (q63_t) * px++ * (*py--); 00190 /* x[2] * y[srcBLen - 3] */ 00191 sum += (q63_t) * px++ * (*py--); 00192 /* x[3] * y[srcBLen - 4] */ 00193 sum += (q63_t) * px++ * (*py--); 00194 00195 /* Decrement the loop counter */ 00196 k--; 00197 } 00198 00199 /* If the count is not a multiple of 4, compute any remaining MACs here. 00200 ** No loop unrolling is used. */ 00201 k = count % 0x4u; 00202 00203 while(k > 0u) 00204 { 00205 /* Perform the multiply-accumulate */ 00206 sum += (q63_t) * px++ * (*py--); 00207 00208 /* Decrement the loop counter */ 00209 k--; 00210 } 00211 00212 /* Store the result in the accumulator in the destination buffer. */ 00213 *pOut++ = (q31_t) (sum >> 31); 00214 00215 /* Update the inputA and inputB pointers for next MAC calculation */ 00216 py = ++pSrc2; 00217 px = pIn1; 00218 00219 /* Increment the MAC count */ 00220 count++; 00221 00222 /* Decrement the loop counter */ 00223 blockSize1--; 00224 } 00225 00226 /* -------------------------- 00227 * Initializations of stage2 00228 * ------------------------*/ 00229 00230 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00231 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00232 * .... 00233 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00234 */ 00235 00236 /* Working pointer of inputA */ 00237 px = pIn1; 00238 00239 /* Working pointer of inputB */ 00240 pSrc2 = pIn2 + (srcBLen - 1u); 00241 py = pSrc2; 00242 00243 /* count is index by which the pointer pIn1 to be incremented */ 00244 count = 1u; 00245 00246 /* ------------------- 00247 * Stage2 process 00248 * ------------------*/ 00249 00250 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00251 * So, to loop unroll over blockSize2, 00252 * srcBLen should be greater than or equal to 4 */ 00253 if(srcBLen >= 4u) 00254 { 00255 /* Loop unroll over blockSize2 */ 00256 blkCnt = ((uint32_t) blockSize2 >> 2u); 00257 00258 while(blkCnt > 0u) 00259 { 00260 /* Set all accumulators to zero */ 00261 acc0 = 0; 00262 acc1 = 0; 00263 acc2 = 0; 00264 acc3 = 0; 00265 00266 /* read x[0], x[1], x[2] samples */ 00267 x0 = *(px++); 00268 x1 = *(px++); 00269 x2 = *(px++); 00270 00271 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00272 k = srcBLen >> 2u; 00273 00274 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00275 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00276 do 00277 { 00278 /* Read y[srcBLen - 1] sample */ 00279 c0 = *(py--); 00280 00281 /* Read x[3] sample */ 00282 x3 = *(px++); 00283 00284 /* Perform the multiply-accumulates */ 00285 /* acc0 += x[0] * y[srcBLen - 1] */ 00286 acc0 += (q63_t) x0 *c0; 00287 /* acc1 += x[1] * y[srcBLen - 1] */ 00288 acc1 += (q63_t) x1 *c0; 00289 /* acc2 += x[2] * y[srcBLen - 1] */ 00290 acc2 += (q63_t) x2 *c0; 00291 /* acc3 += x[3] * y[srcBLen - 1] */ 00292 acc3 += (q63_t) x3 *c0; 00293 00294 /* Read y[srcBLen - 2] sample */ 00295 c0 = *(py--); 00296 00297 /* Read x[4] sample */ 00298 x0 = *(px++); 00299 00300 /* Perform the multiply-accumulate */ 00301 /* acc0 += x[1] * y[srcBLen - 2] */ 00302 acc0 += (q63_t) x1 *c0; 00303 /* acc1 += x[2] * y[srcBLen - 2] */ 00304 acc1 += (q63_t) x2 *c0; 00305 /* acc2 += x[3] * y[srcBLen - 2] */ 00306 acc2 += (q63_t) x3 *c0; 00307 /* acc3 += x[4] * y[srcBLen - 2] */ 00308 acc3 += (q63_t) x0 *c0; 00309 00310 /* Read y[srcBLen - 3] sample */ 00311 c0 = *(py--); 00312 00313 /* Read x[5] sample */ 00314 x1 = *(px++); 00315 00316 /* Perform the multiply-accumulates */ 00317 /* acc0 += x[2] * y[srcBLen - 3] */ 00318 acc0 += (q63_t) x2 *c0; 00319 /* acc1 += x[3] * y[srcBLen - 2] */ 00320 acc1 += (q63_t) x3 *c0; 00321 /* acc2 += x[4] * y[srcBLen - 2] */ 00322 acc2 += (q63_t) x0 *c0; 00323 /* acc3 += x[5] * y[srcBLen - 2] */ 00324 acc3 += (q63_t) x1 *c0; 00325 00326 /* Read y[srcBLen - 4] sample */ 00327 c0 = *(py--); 00328 00329 /* Read x[6] sample */ 00330 x2 = *(px++); 00331 00332 /* Perform the multiply-accumulates */ 00333 /* acc0 += x[3] * y[srcBLen - 4] */ 00334 acc0 += (q63_t) x3 *c0; 00335 /* acc1 += x[4] * y[srcBLen - 4] */ 00336 acc1 += (q63_t) x0 *c0; 00337 /* acc2 += x[5] * y[srcBLen - 4] */ 00338 acc2 += (q63_t) x1 *c0; 00339 /* acc3 += x[6] * y[srcBLen - 4] */ 00340 acc3 += (q63_t) x2 *c0; 00341 00342 } while(--k); 00343 00344 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00345 ** No loop unrolling is used. */ 00346 k = srcBLen % 0x4u; 00347 00348 while(k > 0u) 00349 { 00350 /* Read y[srcBLen - 5] sample */ 00351 c0 = *(py--); 00352 00353 /* Read x[7] sample */ 00354 x3 = *(px++); 00355 00356 /* Perform the multiply-accumulates */ 00357 /* acc0 += x[4] * y[srcBLen - 5] */ 00358 acc0 += (q63_t) x0 *c0; 00359 /* acc1 += x[5] * y[srcBLen - 5] */ 00360 acc1 += (q63_t) x1 *c0; 00361 /* acc2 += x[6] * y[srcBLen - 5] */ 00362 acc2 += (q63_t) x2 *c0; 00363 /* acc3 += x[7] * y[srcBLen - 5] */ 00364 acc3 += (q63_t) x3 *c0; 00365 00366 /* Reuse the present samples for the next MAC */ 00367 x0 = x1; 00368 x1 = x2; 00369 x2 = x3; 00370 00371 /* Decrement the loop counter */ 00372 k--; 00373 } 00374 00375 /* Store the result in the accumulator in the destination buffer. */ 00376 *pOut++ = (q31_t) (acc0 >> 31); 00377 *pOut++ = (q31_t) (acc1 >> 31); 00378 *pOut++ = (q31_t) (acc2 >> 31); 00379 *pOut++ = (q31_t) (acc3 >> 31); 00380 00381 /* Update the inputA and inputB pointers for next MAC calculation */ 00382 px = pIn1 + (count * 4u); 00383 py = pSrc2; 00384 00385 /* Increment the pointer pIn1 index, count by 1 */ 00386 count++; 00387 00388 /* Decrement the loop counter */ 00389 blkCnt--; 00390 } 00391 00392 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00393 ** No loop unrolling is used. */ 00394 blkCnt = (uint32_t) blockSize2 % 0x4u; 00395 00396 while(blkCnt > 0u) 00397 { 00398 /* Accumulator is made zero for every iteration */ 00399 sum = 0; 00400 00401 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00402 k = srcBLen >> 2u; 00403 00404 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00405 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00406 while(k > 0u) 00407 { 00408 /* Perform the multiply-accumulates */ 00409 sum += (q63_t) * px++ * (*py--); 00410 sum += (q63_t) * px++ * (*py--); 00411 sum += (q63_t) * px++ * (*py--); 00412 sum += (q63_t) * px++ * (*py--); 00413 00414 /* Decrement the loop counter */ 00415 k--; 00416 } 00417 00418 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00419 ** No loop unrolling is used. */ 00420 k = srcBLen % 0x4u; 00421 00422 while(k > 0u) 00423 { 00424 /* Perform the multiply-accumulate */ 00425 sum += (q63_t) * px++ * (*py--); 00426 00427 /* Decrement the loop counter */ 00428 k--; 00429 } 00430 00431 /* Store the result in the accumulator in the destination buffer. */ 00432 *pOut++ = (q31_t) (sum >> 31); 00433 00434 /* Update the inputA and inputB pointers for next MAC calculation */ 00435 px = pIn1 + count; 00436 py = pSrc2; 00437 00438 /* Increment the MAC count */ 00439 count++; 00440 00441 /* Decrement the loop counter */ 00442 blkCnt--; 00443 } 00444 } 00445 else 00446 { 00447 /* If the srcBLen is not a multiple of 4, 00448 * the blockSize2 loop cannot be unrolled by 4 */ 00449 blkCnt = (uint32_t) blockSize2; 00450 00451 while(blkCnt > 0u) 00452 { 00453 /* Accumulator is made zero for every iteration */ 00454 sum = 0; 00455 00456 /* srcBLen number of MACS should be performed */ 00457 k = srcBLen; 00458 00459 while(k > 0u) 00460 { 00461 /* Perform the multiply-accumulate */ 00462 sum += (q63_t) * px++ * (*py--); 00463 00464 /* Decrement the loop counter */ 00465 k--; 00466 } 00467 00468 /* Store the result in the accumulator in the destination buffer. */ 00469 *pOut++ = (q31_t) (sum >> 31); 00470 00471 /* Update the inputA and inputB pointers for next MAC calculation */ 00472 px = pIn1 + count; 00473 py = pSrc2; 00474 00475 /* Increment the MAC count */ 00476 count++; 00477 00478 /* Decrement the loop counter */ 00479 blkCnt--; 00480 } 00481 } 00482 00483 00484 /* -------------------------- 00485 * Initializations of stage3 00486 * -------------------------*/ 00487 00488 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00489 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00490 * .... 00491 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00492 * sum += x[srcALen-1] * y[srcBLen-1] 00493 */ 00494 00495 /* In this stage the MAC operations are decreased by 1 for every iteration. 00496 The blockSize3 variable holds the number of MAC operations performed */ 00497 count = srcBLen - 1u; 00498 00499 /* Working pointer of inputA */ 00500 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 00501 px = pSrc1; 00502 00503 /* Working pointer of inputB */ 00504 pSrc2 = pIn2 + (srcBLen - 1u); 00505 py = pSrc2; 00506 00507 /* ------------------- 00508 * Stage3 process 00509 * ------------------*/ 00510 00511 while(blockSize3 > 0) 00512 { 00513 /* Accumulator is made zero for every iteration */ 00514 sum = 0; 00515 00516 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00517 k = count >> 2u; 00518 00519 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00520 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00521 while(k > 0u) 00522 { 00523 sum += (q63_t) * px++ * (*py--); 00524 sum += (q63_t) * px++ * (*py--); 00525 sum += (q63_t) * px++ * (*py--); 00526 sum += (q63_t) * px++ * (*py--); 00527 00528 /* Decrement the loop counter */ 00529 k--; 00530 } 00531 00532 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. 00533 ** No loop unrolling is used. */ 00534 k = count % 0x4u; 00535 00536 while(k > 0u) 00537 { 00538 /* Perform the multiply-accumulate */ 00539 sum += (q63_t) * px++ * (*py--); 00540 00541 /* Decrement the loop counter */ 00542 k--; 00543 } 00544 00545 /* Store the result in the accumulator in the destination buffer. */ 00546 *pOut++ = (q31_t) (sum >> 31); 00547 00548 /* Update the inputA and inputB pointers for next MAC calculation */ 00549 px = ++pSrc1; 00550 py = pSrc2; 00551 00552 /* Decrement the MAC count */ 00553 count--; 00554 00555 /* Decrement the loop counter */ 00556 blockSize3--; 00557 00558 } 00559 00560 /* set status as ARM_MATH_SUCCESS */ 00561 status = ARM_MATH_SUCCESS; 00562 } 00563 00564 /* Return to application */ 00565 return (status); 00566 00567 #else 00568 00569 /* Run the below code for Cortex-M0 */ 00570 00571 q31_t *pIn1 = pSrcA; /* inputA pointer */ 00572 q31_t *pIn2 = pSrcB; /* inputB pointer */ 00573 q63_t sum; /* Accumulator */ 00574 uint32_t i, j; /* loop counters */ 00575 arm_status status; /* status of Partial convolution */ 00576 00577 /* Check for range of output samples to be calculated */ 00578 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00579 { 00580 /* Set status as ARM_ARGUMENT_ERROR */ 00581 status = ARM_MATH_ARGUMENT_ERROR; 00582 } 00583 else 00584 { 00585 /* Loop to calculate convolution for output length number of values */ 00586 for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++) 00587 { 00588 /* Initialize sum with zero to carry on MAC operations */ 00589 sum = 0; 00590 00591 /* Loop to perform MAC operations according to convolution equation */ 00592 for (j = 0; j <= i; j++) 00593 { 00594 /* Check the array limitations */ 00595 if(((i - j) < srcBLen) && (j < srcALen)) 00596 { 00597 /* z[i] += x[i-j] * y[j] */ 00598 sum += ((q63_t) pIn1[j] * (pIn2[i - j])); 00599 } 00600 } 00601 00602 /* Store the output in the destination buffer */ 00603 pDst[i] = (q31_t) (sum >> 31u); 00604 } 00605 /* set status as ARM_SUCCESS as there are no argument errors */ 00606 status = ARM_MATH_SUCCESS; 00607 } 00608 return (status); 00609 00610 #endif /* #ifndef ARM_MATH_CM0 */ 00611 00612 } 00613