diff --git a/openair1/PHY/TOOLS/cmult_vv.c b/openair1/PHY/TOOLS/cmult_vv.c
index 39bfe547e7912da809f3498c35a698accc62fea1..9d5079f52b6ad938e83d9f55762f01f515639a24 100644
--- a/openair1/PHY/TOOLS/cmult_vv.c
+++ b/openair1/PHY/TOOLS/cmult_vv.c
@@ -27,6 +27,7 @@
 
 #if defined(__x86_64__) || defined(__i386__)
 int16_t conjug[8]__attribute__((aligned(16))) = {-1,1,-1,1,-1,1,-1,1} ;
+int16_t conjug2[8]__attribute__((aligned(16))) = {1,-1,1,-1,1,-1,1,-1} ;
 #define simd_q15_t __m128i
 #define simdshort_q15_t __m64
 #elif defined(__arm__)
@@ -134,3 +135,77 @@ int mult_cpx_conj_vector(int16_t *x1,
   return(0);
 }
 
+int multadd_cpx_vector(int16_t *x1,
+                       int16_t *x2,
+                       int16_t *y,
+                       uint8_t zero_flag,
+                       uint32_t N,
+                       int output_shift)
+{
+  // Element-wise complex multiply of x1 by x2 (no conjugation), stored in or accumulated into y:
+  //   zero_flag == 1 : y[k]  = (x1[k] * x2[k]) >> output_shift
+  //   otherwise      : y[k] += (x1[k] * x2[k]) >> output_shift   (saturating 16-bit add)
+  //
+  // x1            - input 1 in the format  |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)|
+  //                 We assume x1 with a dynamic of 15 bit maximum
+  //
+  // x2            - input 2 in the format  |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)|
+  //                 We assume x2 with a dynamic of 14 bit maximum
+  //
+  // y             - output in the format  |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)|
+  //
+  // zero_flag     - 1: overwrite y with the product; any other value: add the product to the current content of y
+  //
+  // N             - number of complex samples. The loop runs N>>2 times on 4 samples each, so the
+  //                 last N%4 samples are left untouched. WARNING: N>=4 and N should be a multiple of 4
+  //
+  // output_shift  - arithmetic right shift applied to the 32-bit products before they are packed (with saturation) to 16 bit
+  //
+  // x1, x2 and y are accessed through simd_q15_t pointers, so they presumably have to be 16-byte aligned.
+  //
+  // Returns 0 on x86; returns -1 (y untouched) on __arm__, where no implementation exists yet.
+
+#if defined(__x86_64__) || defined(__i386__)
+  uint32_t i;                      // loop counter
+
+  simd_q15_t *x1_128 = (simd_q15_t *)&x1[0];
+  simd_q15_t *x2_128 = (simd_q15_t *)&x2[0];
+  simd_q15_t *y_128  = (simd_q15_t *)&y[0];
+  simd_q15_t tmp_re,tmp_im;
+  simd_q15_t tmpy0,tmpy1;
+
+  // we compute 4 cpx multiply for each loop
+  for(i=0; i<(N>>2); i++) {
+    // Re: |x1r -x1i| madd |x2r x2i| = x1r*x2r - x1i*x2i
+    tmp_re = _mm_sign_epi16(*x1_128,*(__m128i*)&conjug2[0]);
+    tmp_re = _mm_madd_epi16(tmp_re,*x2_128);
+    // Im: |x1i x1r| madd |x2r x2i| = x1i*x2r + x1r*x2i
+    tmp_im = _mm_shufflelo_epi16(*x1_128,_MM_SHUFFLE(2,3,0,1));
+    tmp_im = _mm_shufflehi_epi16(tmp_im,_MM_SHUFFLE(2,3,0,1));
+    tmp_im = _mm_madd_epi16(tmp_im,*x2_128);
+    tmp_re = _mm_srai_epi32(tmp_re,output_shift);
+    tmp_im = _mm_srai_epi32(tmp_im,output_shift);
+    // re-interleave the 32-bit results as |Re Im| pairs before packing them back to 16 bit
+    tmpy0  = _mm_unpacklo_epi32(tmp_re,tmp_im);
+    tmpy1  = _mm_unpackhi_epi32(tmp_re,tmp_im);
+
+    if (zero_flag == 1)
+      *y_128 = _mm_packs_epi32(tmpy0,tmpy1);
+    else
+      *y_128 = _mm_adds_epi16(*y_128,_mm_packs_epi32(tmpy0,tmpy1));
+
+    x1_128++;
+    x2_128++;
+    y_128++;
+  }
+
+  _mm_empty();
+  _m_empty();
+
+  return(0);
+#elif defined(__arm__)
+  // No NEON implementation yet: report it once (not once per loop iteration) and signal the failure to the caller
+  msg("multadd_cpx_vector not implemented for __arm__");
+  return(-1);
+#endif
+}
diff --git a/openair1/PHY/TOOLS/defs.h b/openair1/PHY/TOOLS/defs.h
index 273822c598a26c58686d12cdd8a8e15ff13a58f1..8f0ae402bf62dfbebd7cfa06e2162486366b0e74 100644
--- a/openair1/PHY/TOOLS/defs.h
+++ b/openair1/PHY/TOOLS/defs.h
@@ -126,6 +126,26 @@ int mult_cpx_conj_vector(int16_t *x1,
                          int output_shift,
                          int madd);
 
+/*!
+  Element-wise multiplication and accumulation of two complex vectors x1 and x2.
+  @param x1       - input 1    in the format  |Re0 Im0 Re1 Im1|,......,|Re(N-2)  Im(N-2) Re(N-1) Im(N-1)|
+            We assume x1 with a dynamic of 15 bit maximum
+  @param x2       - input 2    in the format  |Re0 Im0 Re1 Im1|,......,|Re(N-2)  Im(N-2) Re(N-1) Im(N-1)|
+            We assume x2 with a dynamic of 14 bit maximum
+  @param y        - output     in the format  |Re0 Im0 Re1 Im1|,......,|Re(N-2)  Im(N-2) Re(N-1) Im(N-1)|
+  @param zero_flag Set output (y) to zero prior to accumulation
+  @param N        - the size of the vectors (this function does N cpx mpy). WARNING: N>=4 and only N>>2 groups of 4 samples are processed
+  @param output_shift  - shift to be applied to generate output
+  @return 0 on success, -1 when no implementation exists for the target (currently __arm__)
+*/
+
+int multadd_cpx_vector(int16_t *x1,
+                       int16_t *x2,
+                       int16_t *y,
+                       uint8_t zero_flag,
+                       uint32_t N,
+                       int output_shift);
+
 // lte_dfts.c
 void init_fft(uint16_t size,
               uint8_t logsize,