00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 15. July 2011 00005 * $Revision: V1.0.10 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_partial_fast_q31.c 00009 * 00010 * Description: Fast Q31 Partial convolution. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Version 1.0.10 2011/7/15 00015 * Big Endian support added and Merged M0 and M3/M4 Source code. 00016 * 00017 * Version 1.0.3 2010/11/29 00018 * Re-organized the CMSIS folders and updated documentation. 00019 * 00020 * Version 1.0.2 2010/11/11 00021 * Documentation updated. 00022 * 00023 * Version 1.0.1 2010/10/05 00024 * Production release and review comments incorporated. 00025 * 00026 * Version 1.0.0 2010/09/20 00027 * Production release and review comments incorporated. 00028 * -------------------------------------------------------------------- */ 00029 00030 #include "arm_math.h" 00031 00056 arm_status arm_conv_partial_fast_q31( 00057 q31_t * pSrcA, 00058 uint32_t srcALen, 00059 q31_t * pSrcB, 00060 uint32_t srcBLen, 00061 q31_t * pDst, 00062 uint32_t firstIndex, 00063 uint32_t numPoints) 00064 { 00065 q31_t *pIn1; /* inputA pointer */ 00066 q31_t *pIn2; /* inputB pointer */ 00067 q31_t *pOut = pDst; /* output pointer */ 00068 q31_t *px; /* Intermediate inputA pointer */ 00069 q31_t *py; /* Intermediate inputB pointer */ 00070 q31_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00071 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */ 00072 q31_t x0, x1, x2, x3, c0; 00073 uint32_t j, k, count, check, blkCnt; 00074 int32_t blockSize1, blockSize2, blockSize3; /* loop counters */ 00075 arm_status status; /* status of Partial convolution */ 00076 00077 00078 /* Check for range of output samples to be calculated */ 00079 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00080 { 00081 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00082 status = ARM_MATH_ARGUMENT_ERROR; 00083 } 00084 else 00085 { 00086 00087 /* The algorithm implementation is based on the lengths of the inputs. */ 00088 /* srcB is always made to slide across srcA. */ 00089 /* So srcBLen is always considered as shorter or equal to srcALen */ 00090 if(srcALen >= srcBLen) 00091 { 00092 /* Initialization of inputA pointer */ 00093 pIn1 = pSrcA; 00094 00095 /* Initialization of inputB pointer */ 00096 pIn2 = pSrcB; 00097 } 00098 else 00099 { 00100 /* Initialization of inputA pointer */ 00101 pIn1 = pSrcB; 00102 00103 /* Initialization of inputB pointer */ 00104 pIn2 = pSrcA; 00105 00106 /* srcBLen is always considered as shorter or equal to srcALen */ 00107 j = srcBLen; 00108 srcBLen = srcALen; 00109 srcALen = j; 00110 } 00111 00112 /* Conditions to check which loopCounter holds 00113 * the first and last indices of the output samples to be calculated. */ 00114 check = firstIndex + numPoints; 00115 blockSize3 = ((int32_t) check - (int32_t) srcALen); 00116 blockSize3 = (blockSize3 > 0) ? blockSize3 : 0; 00117 blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex); 00118 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 : 00119 (int32_t) numPoints) : 0; 00120 blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + 00121 (int32_t) firstIndex); 00122 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; 00123 00124 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00125 /* The function is internally 00126 * divided into three stages according to the number of multiplications that has to be 00127 * taken place between inputA samples and inputB samples. In the first stage of the 00128 * algorithm, the multiplications increase by one for every iteration. 00129 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00130 * In the third stage of the algorithm, the multiplications decrease by one 00131 * for every iteration. */ 00132 00133 /* Set the output pointer to point to the firstIndex 00134 * of the output sample to be calculated. */ 00135 pOut = pDst + firstIndex; 00136 00137 /* -------------------------- 00138 * Initializations of stage1 00139 * -------------------------*/ 00140 00141 /* sum = x[0] * y[0] 00142 * sum = x[0] * y[1] + x[1] * y[0] 00143 * .... 00144 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00145 */ 00146 00147 /* In this stage the MAC operations are increased by 1 for every iteration. 00148 The count variable holds the number of MAC operations performed. 00149 Since the partial convolution starts from firstIndex 00150 Number of Macs to be performed is firstIndex + 1 */ 00151 count = 1u + firstIndex; 00152 00153 /* Working pointer of inputA */ 00154 px = pIn1; 00155 00156 /* Working pointer of inputB */ 00157 pSrc2 = pIn2 + firstIndex; 00158 py = pSrc2; 00159 00160 /* ------------------------ 00161 * Stage1 process 00162 * ----------------------*/ 00163 00164 /* The first loop starts here */ 00165 while(blockSize1 > 0) 00166 { 00167 /* Accumulator is made zero for every iteration */ 00168 sum = 0; 00169 00170 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00171 k = count >> 2u; 00172 00173 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00174 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00175 while(k > 0u) 00176 { 00177 /* x[0] * y[srcBLen - 1] */ 00178 sum = (q31_t) ((((q63_t) sum << 32) + 00179 ((q63_t) * px++ * (*py--))) >> 32); 00180 00181 /* x[1] * y[srcBLen - 2] */ 00182 sum = (q31_t) ((((q63_t) sum << 32) + 00183 ((q63_t) * px++ * (*py--))) >> 32); 00184 00185 /* x[2] * y[srcBLen - 3] */ 00186 sum = (q31_t) ((((q63_t) sum << 32) + 00187 ((q63_t) * px++ * (*py--))) >> 32); 00188 00189 /* x[3] * y[srcBLen - 4] */ 00190 sum = (q31_t) ((((q63_t) sum << 32) + 00191 ((q63_t) * px++ * (*py--))) >> 32); 00192 00193 /* Decrement the loop counter */ 00194 k--; 00195 } 00196 00197 /* If the count is not a multiple of 4, compute any remaining MACs here. 00198 ** No loop unrolling is used. */ 00199 k = count % 0x4u; 00200 00201 while(k > 0u) 00202 { 00203 /* Perform the multiply-accumulates */ 00204 sum = (q31_t) ((((q63_t) sum << 32) + 00205 ((q63_t) * px++ * (*py--))) >> 32); 00206 00207 /* Decrement the loop counter */ 00208 k--; 00209 } 00210 00211 /* Store the result in the accumulator in the destination buffer. */ 00212 *pOut++ = sum << 1; 00213 00214 /* Update the inputA and inputB pointers for next MAC calculation */ 00215 py = ++pSrc2; 00216 px = pIn1; 00217 00218 /* Increment the MAC count */ 00219 count++; 00220 00221 /* Decrement the loop counter */ 00222 blockSize1--; 00223 } 00224 00225 /* -------------------------- 00226 * Initializations of stage2 00227 * ------------------------*/ 00228 00229 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00230 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00231 * .... 00232 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00233 */ 00234 00235 /* Working pointer of inputA */ 00236 px = pIn1; 00237 00238 /* Working pointer of inputB */ 00239 pSrc2 = pIn2 + (srcBLen - 1u); 00240 py = pSrc2; 00241 00242 /* count is index by which the pointer pIn1 to be incremented */ 00243 count = 1u; 00244 00245 /* ------------------- 00246 * Stage2 process 00247 * ------------------*/ 00248 00249 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00250 * So, to loop unroll over blockSize2, 00251 * srcBLen should be greater than or equal to 4 */ 00252 if(srcBLen >= 4u) 00253 { 00254 /* Loop unroll over blockSize2 */ 00255 blkCnt = ((uint32_t) blockSize2 >> 2u); 00256 00257 while(blkCnt > 0u) 00258 { 00259 /* Set all accumulators to zero */ 00260 acc0 = 0; 00261 acc1 = 0; 00262 acc2 = 0; 00263 acc3 = 0; 00264 00265 /* read x[0], x[1], x[2] samples */ 00266 x0 = *(px++); 00267 x1 = *(px++); 00268 x2 = *(px++); 00269 00270 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00271 k = srcBLen >> 2u; 00272 00273 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00274 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00275 do 00276 { 00277 /* Read y[srcBLen - 1] sample */ 00278 c0 = *(py--); 00279 00280 /* Read x[3] sample */ 00281 x3 = *(px++); 00282 00283 /* Perform the multiply-accumulate */ 00284 /* acc0 += x[0] * y[srcBLen - 1] */ 00285 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); 00286 00287 /* acc1 += x[1] * y[srcBLen - 1] */ 00288 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); 00289 00290 /* acc2 += x[2] * y[srcBLen - 1] */ 00291 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32); 00292 00293 /* acc3 += x[3] * y[srcBLen - 1] */ 00294 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32); 00295 00296 /* Read y[srcBLen - 2] sample */ 00297 c0 = *(py--); 00298 00299 /* Read x[4] sample */ 00300 x0 = *(px++); 00301 00302 /* Perform the multiply-accumulate */ 00303 /* acc0 += x[1] * y[srcBLen - 2] */ 00304 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32); 00305 /* acc1 += x[2] * y[srcBLen - 2] */ 00306 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32); 00307 /* acc2 += x[3] * y[srcBLen - 2] */ 00308 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32); 00309 /* acc3 += x[4] * y[srcBLen - 2] */ 00310 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32); 00311 00312 /* Read y[srcBLen - 3] sample */ 00313 c0 = *(py--); 00314 00315 /* Read x[5] sample */ 00316 x1 = *(px++); 00317 00318 /* Perform the multiply-accumulates */ 00319 /* acc0 += x[2] * y[srcBLen - 3] */ 00320 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32); 00321 /* acc1 += x[3] * y[srcBLen - 2] */ 00322 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32); 00323 /* acc2 += x[4] * y[srcBLen - 2] */ 00324 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32); 00325 /* acc3 += x[5] * y[srcBLen - 2] */ 00326 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32); 00327 00328 /* Read y[srcBLen - 4] sample */ 00329 c0 = *(py--); 00330 00331 /* Read x[6] sample */ 00332 x2 = *(px++); 00333 00334 /* Perform the multiply-accumulates */ 00335 /* acc0 += x[3] * y[srcBLen - 4] */ 00336 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32); 00337 /* acc1 += x[4] * y[srcBLen - 4] */ 00338 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32); 00339 /* acc2 += x[5] * y[srcBLen - 4] */ 00340 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32); 00341 /* acc3 += x[6] * y[srcBLen - 4] */ 00342 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32); 00343 00344 00345 } while(--k); 00346 00347 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00348 ** No loop unrolling is used. */ 00349 k = srcBLen % 0x4u; 00350 00351 while(k > 0u) 00352 { 00353 /* Read y[srcBLen - 5] sample */ 00354 c0 = *(py--); 00355 00356 /* Read x[7] sample */ 00357 x3 = *(px++); 00358 00359 /* Perform the multiply-accumulates */ 00360 /* acc0 += x[4] * y[srcBLen - 5] */ 00361 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); 00362 /* acc1 += x[5] * y[srcBLen - 5] */ 00363 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); 00364 /* acc2 += x[6] * y[srcBLen - 5] */ 00365 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32); 00366 /* acc3 += x[7] * y[srcBLen - 5] */ 00367 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32); 00368 00369 /* Reuse the present samples for the next MAC */ 00370 x0 = x1; 00371 x1 = x2; 00372 x2 = x3; 00373 00374 /* Decrement the loop counter */ 00375 k--; 00376 } 00377 00378 /* Store the result in the accumulator in the destination buffer. */ 00379 *pOut++ = (q31_t) (acc0 << 1); 00380 *pOut++ = (q31_t) (acc1 << 1); 00381 *pOut++ = (q31_t) (acc2 << 1); 00382 *pOut++ = (q31_t) (acc3 << 1); 00383 00384 /* Update the inputA and inputB pointers for next MAC calculation */ 00385 px = pIn1 + (count * 4u); 00386 py = pSrc2; 00387 00388 /* Increment the pointer pIn1 index, count by 1 */ 00389 count++; 00390 00391 /* Decrement the loop counter */ 00392 blkCnt--; 00393 } 00394 00395 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00396 ** No loop unrolling is used. */ 00397 blkCnt = (uint32_t) blockSize2 % 0x4u; 00398 00399 while(blkCnt > 0u) 00400 { 00401 /* Accumulator is made zero for every iteration */ 00402 sum = 0; 00403 00404 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00405 k = srcBLen >> 2u; 00406 00407 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00408 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00409 while(k > 0u) 00410 { 00411 /* Perform the multiply-accumulates */ 00412 sum = (q31_t) ((((q63_t) sum << 32) + 00413 ((q63_t) * px++ * (*py--))) >> 32); 00414 sum = (q31_t) ((((q63_t) sum << 32) + 00415 ((q63_t) * px++ * (*py--))) >> 32); 00416 sum = (q31_t) ((((q63_t) sum << 32) + 00417 ((q63_t) * px++ * (*py--))) >> 32); 00418 sum = (q31_t) ((((q63_t) sum << 32) + 00419 ((q63_t) * px++ * (*py--))) >> 32); 00420 00421 /* Decrement the loop counter */ 00422 k--; 00423 } 00424 00425 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00426 ** No loop unrolling is used. */ 00427 k = srcBLen % 0x4u; 00428 00429 while(k > 0u) 00430 { 00431 /* Perform the multiply-accumulate */ 00432 sum = (q31_t) ((((q63_t) sum << 32) + 00433 ((q63_t) * px++ * (*py--))) >> 32); 00434 00435 /* Decrement the loop counter */ 00436 k--; 00437 } 00438 00439 /* Store the result in the accumulator in the destination buffer. */ 00440 *pOut++ = sum << 1; 00441 00442 /* Update the inputA and inputB pointers for next MAC calculation */ 00443 px = pIn1 + count; 00444 py = pSrc2; 00445 00446 /* Increment the MAC count */ 00447 count++; 00448 00449 /* Decrement the loop counter */ 00450 blkCnt--; 00451 } 00452 } 00453 else 00454 { 00455 /* If the srcBLen is not a multiple of 4, 00456 * the blockSize2 loop cannot be unrolled by 4 */ 00457 blkCnt = (uint32_t) blockSize2; 00458 00459 while(blkCnt > 0u) 00460 { 00461 /* Accumulator is made zero for every iteration */ 00462 sum = 0; 00463 00464 /* srcBLen number of MACS should be performed */ 00465 k = srcBLen; 00466 00467 while(k > 0u) 00468 { 00469 /* Perform the multiply-accumulate */ 00470 sum = (q31_t) ((((q63_t) sum << 32) + 00471 ((q63_t) * px++ * (*py--))) >> 32); 00472 00473 /* Decrement the loop counter */ 00474 k--; 00475 } 00476 00477 /* Store the result in the accumulator in the destination buffer. */ 00478 *pOut++ = sum << 1; 00479 00480 /* Update the inputA and inputB pointers for next MAC calculation */ 00481 px = pIn1 + count; 00482 py = pSrc2; 00483 00484 /* Increment the MAC count */ 00485 count++; 00486 00487 /* Decrement the loop counter */ 00488 blkCnt--; 00489 } 00490 } 00491 00492 00493 /* -------------------------- 00494 * Initializations of stage3 00495 * -------------------------*/ 00496 00497 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00498 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00499 * .... 00500 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00501 * sum += x[srcALen-1] * y[srcBLen-1] 00502 */ 00503 00504 /* In this stage the MAC operations are decreased by 1 for every iteration. 00505 The count variable holds the number of MAC operations performed */ 00506 count = srcBLen - 1u; 00507 00508 /* Working pointer of inputA */ 00509 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 00510 px = pSrc1; 00511 00512 /* Working pointer of inputB */ 00513 pSrc2 = pIn2 + (srcBLen - 1u); 00514 py = pSrc2; 00515 00516 /* ------------------- 00517 * Stage3 process 00518 * ------------------*/ 00519 00520 while(blockSize3 > 0) 00521 { 00522 /* Accumulator is made zero for every iteration */ 00523 sum = 0; 00524 00525 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00526 k = count >> 2u; 00527 00528 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00529 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00530 while(k > 0u) 00531 { 00532 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ 00533 sum = (q31_t) ((((q63_t) sum << 32) + 00534 ((q63_t) * px++ * (*py--))) >> 32); 00535 00536 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ 00537 sum = (q31_t) ((((q63_t) sum << 32) + 00538 ((q63_t) * px++ * (*py--))) >> 32); 00539 00540 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ 00541 sum = (q31_t) ((((q63_t) sum << 32) + 00542 ((q63_t) * px++ * (*py--))) >> 32); 00543 00544 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ 00545 sum = (q31_t) ((((q63_t) sum << 32) + 00546 ((q63_t) * px++ * (*py--))) >> 32); 00547 00548 /* Decrement the loop counter */ 00549 k--; 00550 } 00551 00552 /* If the count is not a multiple of 4, compute any remaining MACs here. 00553 ** No loop unrolling is used. */ 00554 k = count % 0x4u; 00555 00556 while(k > 0u) 00557 { 00558 /* Perform the multiply-accumulates */ 00559 /* sum += x[srcALen-1] * y[srcBLen-1] */ 00560 sum = (q31_t) ((((q63_t) sum << 32) + 00561 ((q63_t) * px++ * (*py--))) >> 32); 00562 00563 /* Decrement the loop counter */ 00564 k--; 00565 } 00566 00567 /* Store the result in the accumulator in the destination buffer. */ 00568 *pOut++ = sum << 1; 00569 00570 /* Update the inputA and inputB pointers for next MAC calculation */ 00571 px = ++pSrc1; 00572 py = pSrc2; 00573 00574 /* Decrement the MAC count */ 00575 count--; 00576 00577 /* Decrement the loop counter */ 00578 blockSize3--; 00579 00580 } 00581 00582 /* set status as ARM_MATH_SUCCESS */ 00583 status = ARM_MATH_SUCCESS; 00584 } 00585 00586 /* Return to application */ 00587 return (status); 00588 00589 } 00590