00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 15. July 2011 00005 * $Revision: V1.0.10 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_mat_mult_q15.c 00009 * 00010 * Description: Q15 matrix multiplication. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 00013 * 00014 * Version 1.0.10 2011/7/15 00015 * Big Endian support added and Merged M0 and M3/M4 Source code. 00016 * 00017 * Version 1.0.3 2010/11/29 00018 * Re-organized the CMSIS folders and updated documentation. 00019 * 00020 * Version 1.0.2 2010/11/11 00021 * Documentation updated. 00022 * 00023 * Version 1.0.1 2010/10/05 00024 * Production release and review comments incorporated. 00025 * 00026 * Version 1.0.0 2010/09/20 00027 * Production release and review comments incorporated. 00028 * 00029 * Version 0.0.5 2010/04/26 00030 * incorporated review comments and updated with latest CMSIS layer 00031 * 00032 * Version 0.0.3 2010/03/10 00033 * Initial version 00034 * -------------------------------------------------------------------- */ 00035 00036 #include "arm_math.h" 00037 00074 arm_status arm_mat_mult_q15( 00075 const arm_matrix_instance_q15 * pSrcA, 00076 const arm_matrix_instance_q15 * pSrcB, 00077 arm_matrix_instance_q15 * pDst, 00078 q15_t * pState) 00079 { 00080 q63_t sum; /* accumulator */ 00081 00082 #ifndef ARM_MATH_CM0 00083 00084 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00085 00086 q31_t in; /* Temporary variable to hold the input value */ 00087 q15_t *pSrcBT = pState; /* input data matrix pointer for transpose */ 00088 q15_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q15 type */ 00089 q15_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q15 type */ 00090 q15_t *px; /* Temporary output data matrix pointer */ 00091 uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */ 00092 uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */ 00093 uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */ 00094 uint16_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */ 00095 uint16_t col, i = 0u, row = numRowsB, colCnt; /* loop counters */ 00096 arm_status status; /* status of matrix multiplication */ 00097 00098 #ifdef ARM_MATH_MATRIX_CHECK 00099 00100 00101 /* Check for matrix mismatch condition */ 00102 00103 if((pSrcA->numCols != pSrcB->numRows) || 00104 (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols)) 00105 { 00106 /* Set status as ARM_MATH_SIZE_MISMATCH */ 00107 status = ARM_MATH_SIZE_MISMATCH; 00108 } 00109 else 00110 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */ 00111 00112 { 00113 /* Matrix transpose */ 00114 do 00115 { 00116 /* Apply loop unrolling and exchange the columns with row elements */ 00117 col = numColsB >> 2; 00118 00119 /* The pointer px is set to starting address of the column being processed */ 00120 px = pSrcBT + i; 00121 00122 /* First part of the processing with loop unrolling. Compute 4 outputs at a time. 00123 ** a second loop below computes the remaining 1 to 3 samples. */ 00124 while(col > 0u) 00125 { 00126 /* Read two elements from the row */ 00127 in = *__SIMD32(pInB)++; 00128 00129 /* Unpack and store one element in the destination */ 00130 #ifndef ARM_MATH_BIG_ENDIAN 00131 00132 *px = (q15_t) in; 00133 00134 #else 00135 00136 *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16); 00137 00138 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00139 00140 /* Update the pointer px to point to the next row of the transposed matrix */ 00141 px += numRowsB; 00142 00143 /* Unpack and store the second element in the destination */ 00144 #ifndef ARM_MATH_BIG_ENDIAN 00145 00146 *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16); 00147 00148 #else 00149 00150 *px = (q15_t) in; 00151 00152 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00153 00154 00155 /* Update the pointer px to point to the next row of the transposed matrix */ 00156 px += numRowsB; 00157 00158 /* Read two elements from the row */ 00159 in = *__SIMD32(pInB)++; 00160 00161 /* Unpack and store one element in the destination */ 00162 #ifndef ARM_MATH_BIG_ENDIAN 00163 00164 *px = (q15_t) in; 00165 00166 #else 00167 00168 *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16); 00169 00170 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00171 00172 /* Update the pointer px to point to the next row of the transposed matrix */ 00173 px += numRowsB; 00174 00175 /* Unpack and store the second element in the destination */ 00176 00177 #ifndef ARM_MATH_BIG_ENDIAN 00178 00179 *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16); 00180 00181 #else 00182 00183 *px = (q15_t) in; 00184 00185 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00186 00187 /* Update the pointer px to point to the next row of the transposed matrix */ 00188 px += numRowsB; 00189 00190 /* Decrement the column loop counter */ 00191 col--; 00192 } 00193 00194 /* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here. 00195 ** No loop unrolling is used. */ 00196 col = numColsB % 0x4u; 00197 00198 while(col > 0u) 00199 { 00200 /* Read and store the input element in the destination */ 00201 *px = *pInB++; 00202 00203 /* Update the pointer px to point to the next row of the transposed matrix */ 00204 px += numRowsB; 00205 00206 /* Decrement the column loop counter */ 00207 col--; 00208 } 00209 00210 i++; 00211 00212 /* Decrement the row loop counter */ 00213 row--; 00214 00215 } while(row > 0u); 00216 00217 /* Reset the variables for the usage in the following multiplication process */ 00218 row = numRowsA; 00219 i = 0u; 00220 px = pDst->pData; 00221 00222 /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */ 00223 /* row loop */ 00224 do 00225 { 00226 /* For every row wise process, the column loop counter is to be initiated */ 00227 col = numColsB; 00228 00229 /* For every row wise process, the pIn2 pointer is set 00230 ** to the starting address of the transposed pSrcB data */ 00231 pInB = pSrcBT; 00232 00233 /* column loop */ 00234 do 00235 { 00236 /* Set the variable sum, that acts as accumulator, to zero */ 00237 sum = 0; 00238 00239 /* Apply loop unrolling and compute 2 MACs simultaneously. */ 00240 colCnt = numColsA >> 1; 00241 00242 /* Initiate the pointer pIn1 to point to the starting address of the column being processed */ 00243 pInA = pSrcA->pData + i; 00244 00245 /* matrix multiplication */ 00246 while(colCnt > 0u) 00247 { 00248 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */ 00249 sum = __SMLALD(*__SIMD32(pInA)++, *__SIMD32(pInB)++, sum); 00250 00251 /* Decrement the loop counter */ 00252 colCnt--; 00253 } 00254 00255 /* process odd column samples */ 00256 if((numColsA & 0x1u) > 0u) 00257 { 00258 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */ 00259 sum += ((q31_t) * pInA * (*pInB++)); 00260 } 00261 00262 /* Saturate and store the result in the destination buffer */ 00263 *px = (q15_t) (__SSAT((sum >> 15), 16)); 00264 px++; 00265 00266 /* Decrement the column loop counter */ 00267 col--; 00268 00269 } while(col > 0u); 00270 00271 i = i + numColsA; 00272 00273 /* Decrement the row loop counter */ 00274 row--; 00275 00276 } while(row > 0u); 00277 00278 #else 00279 00280 /* Run the below code for Cortex-M0 */ 00281 00282 q15_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */ 00283 q15_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */ 00284 q15_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q15 type */ 00285 q15_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q15 type */ 00286 q15_t *pOut = pDst->pData; /* output data matrix pointer */ 00287 q15_t *px; /* Temporary output data matrix pointer */ 00288 uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */ 00289 uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */ 00290 uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */ 00291 uint16_t col, i = 0u, row = numRowsA, colCnt; /* loop counters */ 00292 arm_status status; /* status of matrix multiplication */ 00293 00294 #ifdef ARM_MATH_MATRIX_CHECK 00295 00296 /* Check for matrix mismatch condition */ 00297 if((pSrcA->numCols != pSrcB->numRows) || 00298 (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols)) 00299 { 00300 /* Set status as ARM_MATH_SIZE_MISMATCH */ 00301 status = ARM_MATH_SIZE_MISMATCH; 00302 } 00303 else 00304 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */ 00305 00306 { 00307 /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */ 00308 /* row loop */ 00309 do 00310 { 00311 /* Output pointer is set to starting address of the row being processed */ 00312 px = pOut + i; 00313 00314 /* For every row wise process, the column loop counter is to be initiated */ 00315 col = numColsB; 00316 00317 /* For every row wise process, the pIn2 pointer is set 00318 ** to the starting address of the pSrcB data */ 00319 pIn2 = pSrcB->pData; 00320 00321 /* column loop */ 00322 do 00323 { 00324 /* Set the variable sum, that acts as accumulator, to zero */ 00325 sum = 0; 00326 00327 /* Initiate the pointer pIn1 to point to the starting address of pSrcA */ 00328 pIn1 = pInA; 00329 00330 /* Matrix A columns number of MAC operations are to be performed */ 00331 colCnt = numColsA; 00332 00333 /* matrix multiplication */ 00334 while(colCnt > 0u) 00335 { 00336 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */ 00337 /* Perform the multiply-accumulates */ 00338 sum += (q31_t) * pIn1++ * *pIn2; 00339 pIn2 += numColsB; 00340 00341 /* Decrement the loop counter */ 00342 colCnt--; 00343 } 00344 00345 /* Convert the result from 34.30 to 1.15 format and store the saturated value in destination buffer */ 00346 /* Saturate and store the result in the destination buffer */ 00347 *px++ = (q15_t) __SSAT((sum >> 15), 16); 00348 00349 /* Decrement the column loop counter */ 00350 col--; 00351 00352 /* Update the pointer pIn2 to point to the starting address of the next column */ 00353 pIn2 = pInB + (numColsB - col); 00354 00355 } while(col > 0u); 00356 00357 /* Update the pointer pSrcA to point to the starting address of the next row */ 00358 i = i + numColsB; 00359 pInA = pInA + numColsA; 00360 00361 /* Decrement the row loop counter */ 00362 row--; 00363 00364 } while(row > 0u); 00365 00366 #endif /* #ifndef ARM_MATH_CM0 */ 00367 00368 /* set status as ARM_MATH_SUCCESS */ 00369 status = ARM_MATH_SUCCESS; 00370 } 00371 00372 /* Return to application */ 00373 return (status); 00374 } 00375