adding multadd_cpx_vector

a5e6690d · Florian Kaltenberger · ab57b0dd · a5e6690d · a5e6690d
Commit a5e6690d authored 8 years ago by Florian Kaltenberger
--- a/openair1/PHY/TOOLS/cmult_vv.c
+++ b/openair1/PHY/TOOLS/cmult_vv.c
@@ -27,6 +27,7 @@

 #if defined(__x86_64__) || defined(__i386__)
 int16_t conjug[8]__attribute__((aligned(16))) = {-1,1,-1,1,-1,1,-1,1} ;
+int16_t conjug2[8]__attribute__((aligned(16))) = {1,-1,1,-1,1,-1,1,-1} ;
 #define simd_q15_t __m128i
 #define simdshort_q15_t __m64
 #elif defined(__arm__)
@@ -134,3 +135,81 @@ int mult_cpx_conj_vector(int16_t *x1,
  return(0);
 }

+int multadd_cpx_vector(int16_t *x1,
+                    int16_t *x2,
+                    int16_t *y,
+                    uint8_t zero_flag,
+                    uint32_t N,
+                    int output_shift)
+{
+  // Multiply elementwise x1 with x2 (plain complex product, no conjugation) and store or accumulate the result in y.
+  // x1       - input 1    in the format  |Re0 Im0 Re1 Im1|,......,|Re(N-2)  Im(N-2) Re(N-1) Im(N-1)|
+  //            We assume x1 with a dynamic range of 15 bits maximum
+  //
+  // x2       - input 2    in the format  |Re0 Im0 Re1 Im1|,......,|Re(N-2)  Im(N-2) Re(N-1) Im(N-1)|
+  //            We assume x2 with a dynamic range of 14 bits maximum
+  //
+  // y        - output     in the format  |Re0 Im0 Re1 Im1|,......,|Re(N-2)  Im(N-2) Re(N-1) Im(N-1)|
+  //            (also read as the accumulator input when zero_flag != 1)
+  // zero_flag - if 1, y is overwritten with the product; otherwise the product is added to y with 16-bit saturation
+  //
+  // N        - the size of the vectors (this function does N cpx mpy). WARNING: N>=4;
+  //            the loop runs N>>2 times on 4 samples each, so a remainder of N modulo 4 is not processed
+  // output_shift  - arithmetic right shift applied to the 32-bit products before they are packed to 16 bits
+
+  uint32_t i;                 // loop counter
+
+  simd_q15_t *x1_128;
+  simd_q15_t *x2_128;
+  simd_q15_t *y_128;
+#if defined(__x86_64__) || defined(__i386__)
+  simd_q15_t tmp_re,tmp_im;
+  simd_q15_t tmpy0,tmpy1;
+#elif defined(__arm__)
+  int32x4_t tmp_re,tmp_im;
+  int32x4_t tmp_re1,tmp_im1;
+  int16x4x2_t tmpy;
+  int32x4_t shift = vdupq_n_s32(-output_shift);
+#endif
+
+  x1_128 = (simd_q15_t *)&x1[0];
+  x2_128 = (simd_q15_t *)&x2[0];
+  y_128  = (simd_q15_t *)&y[0];
+
+
+  // each iteration computes 4 complex multiplications (one 128-bit vector of each operand)
+  for(i=0; i<(N>>2); i++) {
+#if defined(__x86_64__) || defined(__i386__)
+    tmp_re = _mm_sign_epi16(*x1_128,*(__m128i*)&conjug2[0]); // negate Im(x1): |Re -Im|
+    tmp_re = _mm_madd_epi16(tmp_re,*x2_128); // Re1*Re2 - Im1*Im2 (32-bit)
+    tmp_im = _mm_shufflelo_epi16(*x1_128,_MM_SHUFFLE(2,3,0,1)); // swap Re/Im of x1, low half
+    tmp_im = _mm_shufflehi_epi16(tmp_im,_MM_SHUFFLE(2,3,0,1)); // swap Re/Im of x1, high half
+    tmp_im = _mm_madd_epi16(tmp_im,*x2_128); // Im1*Re2 + Re1*Im2 (32-bit)
+    tmp_re = _mm_srai_epi32(tmp_re,output_shift);
+    tmp_im = _mm_srai_epi32(tmp_im,output_shift);
+    tmpy0  = _mm_unpacklo_epi32(tmp_re,tmp_im); // re-interleave as Re,Im for samples 0,1
+    //print_ints("unpack lo:",&tmpy0[i]);
+    tmpy1  = _mm_unpackhi_epi32(tmp_re,tmp_im); // re-interleave as Re,Im for samples 2,3
+    //print_ints("unpack hi:",&tmpy1[i]);
+
+    if (zero_flag == 1)
+      *y_128 = _mm_packs_epi32(tmpy0,tmpy1); // overwrite y with the saturated 16-bit product
+    else
+      *y_128 = _mm_adds_epi16(*y_128,_mm_packs_epi32(tmpy0,tmpy1)); // saturating accumulate into y
+    //print_shorts("*y_128:",&y_128[i]);
+
+#elif defined(__arm__)
+
+    msg("mult_cpx_vector not implemented for __arm__"); // NOTE(review): message names mult_cpx_vector, not multadd_cpx_vector; y is left unmodified on this path
+#endif
+    x1_128++;
+    x2_128++;
+    y_128++;
+  }
+
+
+  _mm_empty(); // NOTE(review): called outside the x86 #if guard; presumably mapped to something valid on ARM elsewhere - confirm
+  _m_empty();
+
+  return(0);
+}
--- a/openair1/PHY/TOOLS/defs.h
+++ b/openair1/PHY/TOOLS/defs.h
@@ -126,6 +126,25 @@ int mult_cpx_conj_vector(int16_t *x1,
                         int output_shift,
 			 int madd);

+/*!
+  Element-wise multiplication and accumulation of two complex vectors x1 and x2.
+  @param x1       - input 1    in the format  |Re0 Im0 Re1 Im1|,......,|Re(N-2)  Im(N-2) Re(N-1) Im(N-1)|
+              We assume x1 with a dynamic range of 15 bits maximum
+  @param x2       - input 2    in the format  |Re0 Im0 Re1 Im1|,......,|Re(N-2)  Im(N-2) Re(N-1) Im(N-1)|
+              We assume x2 with a dynamic range of 14 bits maximum
+  @param y        - output     in the format  |Re0 Im0 Re1 Im1|,......,|Re(N-2)  Im(N-2) Re(N-1) Im(N-1)|
+  @param zero_flag - if 1, y is overwritten with the product (no accumulation); otherwise the product is added to y
+  @param N        - the size of the vectors (this function does N cpx mpy). WARNING: N>=4
+  @param output_shift  - shift to be applied to generate output
+*/
+
+int multadd_cpx_vector(int16_t *x1,
+                    int16_t *x2,
+                    int16_t *y,
+                    uint8_t zero_flag,
+                    uint32_t N,
+		       int output_shift);
+
 // lte_dfts.c
 void init_fft(uint16_t size,
              uint8_t logsize,