diff --git a/cmake_targets/CMakeLists.txt b/cmake_targets/CMakeLists.txt index 8fc2f4fd096bada25154ef2fe4c5928358431e8f..d162337ee864ae6d5b80bb30fc00ee37fd0198a3 100644 --- a/cmake_targets/CMakeLists.txt +++ b/cmake_targets/CMakeLists.txt @@ -134,7 +134,7 @@ else (CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7l") set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -mavx2") endif() if (CPUINFO MATCHES "sse4_2") - set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -mavx2 -msse4.2") + set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -mavx2 -msse4.2 -fno-tree-vectorize") endif() if (CPUINFO MATCHES "sse4_1") set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -msse4.1") diff --git a/openair1/PHY/CODING/3gpplte_sse.c b/openair1/PHY/CODING/3gpplte_sse.c index e486c8e0b4c733b3f39a2acbbc8a94a073f623e4..ac4a1ae3536cb61491f068499902102c72e2d13c 100755 --- a/openair1/PHY/CODING/3gpplte_sse.c +++ b/openair1/PHY/CODING/3gpplte_sse.c @@ -223,6 +223,7 @@ char interleave_compact_byte(short * base_interleaver,unsigned char * input, uns 0b00000001}; #endif + #ifndef __AVX2__ if ((n&15) > 0) loop++; diff --git a/openair1/PHY/CODING/3gpplte_turbo_decoder_avx2_16bit.c b/openair1/PHY/CODING/3gpplte_turbo_decoder_avx2_16bit.c index 11720087ee81e6d01c4c90c4509925201b5e6c7d..75e8eaf126cdae33f7f6700d67dd2aca9c63914f 100644 --- a/openair1/PHY/CODING/3gpplte_turbo_decoder_avx2_16bit.c +++ b/openair1/PHY/CODING/3gpplte_turbo_decoder_avx2_16bit.c @@ -186,12 +186,16 @@ void compute_alpha16avx2(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,uint16 __m256i new0,new1,new2,new3,new4,new5,new6,new7; __m256i alpha_max; + unsigned long long timein,timeout; + l2 = L>>3; K1 = (frame_length>>3); #ifdef DEBUG_LOGMAP fprintf(fdavx2,"Compute alpha (avx2_16bit)\n"); fprintf(fdavx2b,"Compute alpha (avx2_16bit)\n"); #endif + timein = rdtsc_oai(); + for (l=K1;; l=l2,rerun_flag=1) { alpha128 = (__m256i *)alpha; @@ -378,6 +382,9 @@ void compute_alpha16avx2(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,uint16 if (rerun_flag==1) break; } + 
timeout = rdtsc_oai(); + printf("alpha: inner loop time %llu\n",timeout-timein); + } @@ -386,9 +393,10 @@ void compute_beta16avx2(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,uint16_ int k,rerun_flag=0; - __m256i m11_128,m10_128; - __m256i m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7; - __m256i new0,new1,new2,new3,new4,new5,new6,new7; + __m256i *m11p,*m10p; + register __m256i b0,b1,b2,b3,b4,b5,b6,b7; + register __m256i m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7; + register __m256i new0,new1,new2,new3,new4,new5,new6,new7; __m256i *beta128,*alpha128,*beta_ptr; __m256i beta_max; @@ -398,6 +406,8 @@ void compute_beta16avx2(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,uint16_ llr_t beta0,beta1; llr_t beta0_cw2,beta1_cw2; + unsigned long long timein,timeout; + #ifdef DEBUG_LOGMAP fprintf(fdavx2,"compute_beta (avx2_16bit), %p,%p,%p,%p,framelength %d,F %d\n", beta,m_11,m_10,alpha,frame_length,F); @@ -590,56 +600,74 @@ void compute_beta16avx2(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,uint16_ #endif int loopval=((rerun_flag==0)?0:((frame_length-L)>>3)); + printf("beta: rerun %d => loopval %d\n",rerun_flag,loopval); + + timein = rdtsc_oai(); + + m11p = (frame_length>>3)-1+(__m256i*)m_11; + m10p = (frame_length>>3)-1+(__m256i*)m_10; + for (k=(frame_length>>3)-1; k>=loopval; k--) { - m11_128=((__m256i*)m_11)[k]; - m10_128=((__m256i*)m_10)[k]; - - m_b0 = _mm256_adds_epi16(beta_ptr[4],m11_128); //m11 - m_b1 = _mm256_subs_epi16(beta_ptr[4],m11_128); //m00 - m_b2 = _mm256_subs_epi16(beta_ptr[5],m10_128); //m01 - m_b3 = _mm256_adds_epi16(beta_ptr[5],m10_128); //m10 - m_b4 = _mm256_adds_epi16(beta_ptr[6],m10_128); //m10 - m_b5 = _mm256_subs_epi16(beta_ptr[6],m10_128); //m01 - m_b6 = _mm256_subs_epi16(beta_ptr[7],m11_128); //m00 - m_b7 = _mm256_adds_epi16(beta_ptr[7],m11_128); //m11 - - new0 = _mm256_subs_epi16(beta_ptr[0],m11_128); //m00 - new1 = _mm256_adds_epi16(beta_ptr[0],m11_128); //m11 - new2 = _mm256_adds_epi16(beta_ptr[1],m10_128); //m10 - new3 = 
_mm256_subs_epi16(beta_ptr[1],m10_128); //m01 - new4 = _mm256_subs_epi16(beta_ptr[2],m10_128); //m01 - new5 = _mm256_adds_epi16(beta_ptr[2],m10_128); //m10 - new6 = _mm256_adds_epi16(beta_ptr[3],m11_128); //m11 - new7 = _mm256_subs_epi16(beta_ptr[3],m11_128); //m00 + + b4 = _mm256_load_si256(&beta_ptr[4]); + b5 = _mm256_load_si256(&beta_ptr[5]); + b6 = _mm256_load_si256(&beta_ptr[6]); + b7 = _mm256_load_si256(&beta_ptr[7]); + + m_b0 = _mm256_adds_epi16(b4,*m11p); //m11 + m_b1 = _mm256_subs_epi16(b4,*m11p); //m00 + m_b2 = _mm256_subs_epi16(b5,*m10p); //m01 + m_b3 = _mm256_adds_epi16(b5,*m10p); //m10 + m_b4 = _mm256_adds_epi16(b6,*m10p); //m10 + m_b5 = _mm256_subs_epi16(b6,*m10p); //m01 + m_b6 = _mm256_subs_epi16(b7,*m11p); //m00 + m_b7 = _mm256_adds_epi16(b7,*m11p); //m11 + + b0 = _mm256_load_si256(&beta_ptr[0]); + b1 = _mm256_load_si256(&beta_ptr[1]); + b2 = _mm256_load_si256(&beta_ptr[2]); + b3 = _mm256_load_si256(&beta_ptr[3]); + + new0 = _mm256_subs_epi16(b0,*m11p); //m00 + new1 = _mm256_adds_epi16(b0,*m11p); //m11 + new2 = _mm256_adds_epi16(b1,*m10p); //m10 + new3 = _mm256_subs_epi16(b1,*m10p); //m01 + new4 = _mm256_subs_epi16(b2,*m10p); //m01 + new5 = _mm256_adds_epi16(b2,*m10p); //m10 + new6 = _mm256_adds_epi16(b3,*m11p); //m11 + new7 = _mm256_subs_epi16(b3,*m11p); //m00 + + + b0 = _mm256_max_epi16(m_b0,new0); + b1 = _mm256_max_epi16(m_b1,new1); + b2 = _mm256_max_epi16(m_b2,new2); + b3 = _mm256_max_epi16(m_b3,new3); + b4 = _mm256_max_epi16(m_b4,new4); + b5 = _mm256_max_epi16(m_b5,new5); + b6 = _mm256_max_epi16(m_b6,new6); + b7 = _mm256_max_epi16(m_b7,new7); + + beta_max = _mm256_max_epi16(b0,b1); + beta_max = _mm256_max_epi16(beta_max ,b2); + beta_max = _mm256_max_epi16(beta_max ,b3); + beta_max = _mm256_max_epi16(beta_max ,b4); + beta_max = _mm256_max_epi16(beta_max ,b5); + beta_max = _mm256_max_epi16(beta_max ,b6); + beta_max = _mm256_max_epi16(beta_max ,b7); beta_ptr-=8; - - beta_ptr[0] = _mm256_max_epi16(m_b0,new0); - beta_ptr[1] = 
_mm256_max_epi16(m_b1,new1); - beta_ptr[2] = _mm256_max_epi16(m_b2,new2); - beta_ptr[3] = _mm256_max_epi16(m_b3,new3); - beta_ptr[4] = _mm256_max_epi16(m_b4,new4); - beta_ptr[5] = _mm256_max_epi16(m_b5,new5); - beta_ptr[6] = _mm256_max_epi16(m_b6,new6); - beta_ptr[7] = _mm256_max_epi16(m_b7,new7); - - beta_max = _mm256_max_epi16(beta_ptr[0],beta_ptr[1]); - beta_max = _mm256_max_epi16(beta_max ,beta_ptr[2]); - beta_max = _mm256_max_epi16(beta_max ,beta_ptr[3]); - beta_max = _mm256_max_epi16(beta_max ,beta_ptr[4]); - beta_max = _mm256_max_epi16(beta_max ,beta_ptr[5]); - beta_max = _mm256_max_epi16(beta_max ,beta_ptr[6]); - beta_max = _mm256_max_epi16(beta_max ,beta_ptr[7]); - - beta_ptr[0] = _mm256_subs_epi16(beta_ptr[0],beta_max); - beta_ptr[1] = _mm256_subs_epi16(beta_ptr[1],beta_max); - beta_ptr[2] = _mm256_subs_epi16(beta_ptr[2],beta_max); - beta_ptr[3] = _mm256_subs_epi16(beta_ptr[3],beta_max); - beta_ptr[4] = _mm256_subs_epi16(beta_ptr[4],beta_max); - beta_ptr[5] = _mm256_subs_epi16(beta_ptr[5],beta_max); - beta_ptr[6] = _mm256_subs_epi16(beta_ptr[6],beta_max); - beta_ptr[7] = _mm256_subs_epi16(beta_ptr[7],beta_max); + m11p--; + m10p--; + + beta_ptr[0] = _mm256_subs_epi16(b0,beta_max); + beta_ptr[1] = _mm256_subs_epi16(b1,beta_max); + beta_ptr[2] = _mm256_subs_epi16(b2,beta_max); + beta_ptr[3] = _mm256_subs_epi16(b3,beta_max); + beta_ptr[4] = _mm256_subs_epi16(b4,beta_max); + beta_ptr[5] = _mm256_subs_epi16(b5,beta_max); + beta_ptr[6] = _mm256_subs_epi16(b6,beta_max); + beta_ptr[7] = _mm256_subs_epi16(b7,beta_max); #ifdef DEBUG_LOGMAP fprintf(fdavx2,"Loop index %d, mb\n",k); @@ -658,6 +686,8 @@ void compute_beta16avx2(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,uint16_ #endif } + timeout = rdtsc_oai(); + printf("beta: inner loop time %llu\n",timeout-timein); if (rerun_flag==1) break; @@ -968,7 +998,7 @@ unsigned char phy_threegpplte_turbo_decoder16avx2(int16_t *y, yp2 = yparity2; - +#if 0 for (i=0; i<n; i+=8) { pi2_p = &pi2tab16avx2[iind][i]; @@ -1084,9 
+1114,23 @@ unsigned char phy_threegpplte_turbo_decoder16avx2(int16_t *y, yp128_cw2+=3; } - yp=(llr_t*)yp128; yp_cw2=(llr_t*)yp128_cw2; +#else + + pi2_p = &pi2tab16avx2[iind][0]; + for (i=0,j=0; i<n; i++) { + s[*pi2_p] = y[j]; + s[*pi2_p+8] = y2[j++]; + yp1[*pi2_p] = y[j]; + yp1[*pi2_p+8] = y2[j++]; + yp2[*pi2_p] = y[j]; + yp2[(*pi2_p++)+8] = y2[j++]; + } + yp=(llr_t*)&y[j]; + yp_cw2=(llr_t*)&y2[j]; +#endif + // Termination for (i=0; i<3; i++) { diff --git a/openair1/PHY/CODING/3gpplte_turbo_decoder_sse_16bit.c b/openair1/PHY/CODING/3gpplte_turbo_decoder_sse_16bit.c index b8abeda3ccb9ef08631a1edb28401d2312e49537..b70b1aee9550bef5559418bbfca2c734985124a0 100644 --- a/openair1/PHY/CODING/3gpplte_turbo_decoder_sse_16bit.c +++ b/openair1/PHY/CODING/3gpplte_turbo_decoder_sse_16bit.c @@ -144,12 +144,25 @@ void compute_gamma16(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity fprintf(fdsse4,"compute_gamma (sse_16bit), %p,%p,%p,%p,framelength %d\n",m11,m10,systematic,y_parity,frame_length); #endif +#ifndef __AVX2__ K1=frame_length>>3; +#else + if ((frame_length&15) > 0) + K1=(frame_length+1)>>4; + else + K1=frame_length>>4; +#endif for (k=0; k<K1; k++) { #if defined(__x86_64__) || defined(__i386__) +#ifndef __AVX2__ m11_128[k] = _mm_srai_epi16(_mm_adds_epi16(systematic128[k],y_parity128[k]),1); m10_128[k] = _mm_srai_epi16(_mm_subs_epi16(systematic128[k],y_parity128[k]),1); +#else + ((__m256i*)m11_128)[k] = _mm256_srai_epi16(_mm256_adds_epi16(((__m256i*)systematic128)[k],((__m256i*)y_parity128)[k]),1); + // ((__m256i*)m10_128)[k] = _mm256_srai_epi16(_mm256_subs_epi16(((__m256i*)y_parity128)[k],((__m256i*)systematic128)[k]),1); + ((__m256i*)m10_128)[k] = _mm256_srai_epi16(_mm256_subs_epi16(((__m256i*)systematic128)[k],((__m256i*)y_parity128)[k]),1); +#endif #elif defined(__arm__) m11_128[k] = vhaddq_s16(systematic128[k],y_parity128[k]); m10_128[k] = vhsubq_s16(systematic128[k],y_parity128[k]); @@ -164,13 +177,19 @@ void compute_gamma16(llr_t* m11,llr_t* m10,llr_t* 
systematic,channel_t* y_parity #endif } + k=frame_length>>3; // Termination #if defined(__x86_64__) || defined(__i386__) m11_128[k] = _mm_srai_epi16(_mm_adds_epi16(systematic128[k+term_flag],y_parity128[k]),1); + //#ifndef __AVX2__ +#if 1 m10_128[k] = _mm_srai_epi16(_mm_subs_epi16(systematic128[k+term_flag],y_parity128[k]),1); +#else + m10_128[k] = _mm_srai_epi16(_mm_subs_epi16(y_parity128[k],systematic128[k+term_flag]),1); +#endif #elif defined(__arm__) m11_128[k] = vhaddq_s16(systematic128[k+term_flag],y_parity128[k]); -m10_128[k] = vhsubq_s16(systematic128[k+term_flag],y_parity128[k]); + m10_128[k] = vhsubq_s16(systematic128[k+term_flag],y_parity128[k]); #endif #ifdef DEBUG_LOGMAP @@ -188,11 +207,21 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s { int k,l,l2,K1,rerun_flag=0; #if defined(__x86_64__) || defined(__i386__) - __m128i *alpha128=(__m128i *)alpha,*alpha_ptr; - __m128i a0,a1,a2,a3,a4,a5,a6,a7,*m11p,*m10p; + __m128i *alpha128=(__m128i *)alpha,*alpha_ptr,*m11p,*m10p; + //#ifndef __AVX2__ +#if 1 + __m128i a0,a1,a2,a3,a4,a5,a6,a7; __m128i m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7; __m128i new0,new1,new2,new3,new4,new5,new6,new7; __m128i alpha_max; +#else + __m256i *alpha256=(__m256i *)alpha,*alpha_ptr256,m11,m10; + __m256i a01,a23,a45,a67,a02,a13,a64,a75; + __m256i m_b01,m_b23,m_b45,m_b67,new01,new23,new45,new67; + __m256i m11m10_256; + __m256i alpha_max; +#endif + #elif defined(__arm__) int16x8_t *alpha128=(int16x8_t *)alpha,*alpha_ptr; int16x8_t a0,a1,a2,a3,a4,a5,a6,a7,*m11p,*m10p; @@ -208,6 +237,10 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s for (l=K1;; l=l2,rerun_flag=1) { #if defined(__x86_64__) || defined(__i386__) alpha128 = (__m128i *)alpha; + //#ifdef __AVX2__ +#if 0 + alpha256 = (__m256i *)alpha; +#endif #elif defined(__arm__) alpha128 = (int16x8_t *)alpha; #endif @@ -288,6 +321,11 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s } 
alpha_ptr = &alpha128[0]; + //#ifdef __AVX2__ +#if 0 + alpha_ptr256 = &alpha256[0]; +#endif + #if defined(__x86_64__) || defined(__i386__) m11p = (__m128i*)m_11; m10p = (__m128i*)m_10; @@ -300,6 +338,8 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s k++) { #if defined(__x86_64__) || defined(__i386__) + //#ifndef __AVX2__ +#if 1 a1=_mm_load_si128(&alpha_ptr[1]); a3=_mm_load_si128(&alpha_ptr[3]); a5=_mm_load_si128(&alpha_ptr[5]); @@ -344,6 +384,37 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s alpha_max = _mm_max_epi16(alpha_max,a5); alpha_max = _mm_max_epi16(alpha_max,a6); alpha_max = _mm_max_epi16(alpha_max,a7); +#else + a02=_mm256_load_si256(&alpha_ptr256[0]); + a13=_mm256_load_si256(&alpha_ptr256[1]); + a64=_mm256_load_si256(&alpha_ptr256[2]); + a75=_mm256_load_si256(&alpha_ptr256[3]); + m11m10_256 = _mm256_insertf128_si256(m11m10_256,*m11p,0); + m11m10_256 = _mm256_insertf128_si256(m11m10_256,*m10p,1); + + + m_b01 = _mm256_adds_epi16(a13,m11m10_256); //negative m10 + m_b23 = _mm256_subs_epi16(a75,m11m10_256); //negative m10 + m_b45 = _mm256_subs_epi16(a13,m11m10_256); //negative m10 + m_b67 = _mm256_adds_epi16(a75,m11m10_256); //negative m10 + + new01 = _mm256_subs_epi16(a02,m11m10_256); //negative m10 + new23 = _mm256_adds_epi16(a64,m11m10_256); //negative m10 + new45 = _mm256_adds_epi16(a02,m11m10_256); //negative m10 + new67 = _mm256_subs_epi16(a64,m11m10_256); //negative m10 + + a01 = _mm256_max_epi16(m_b01,new01); + a23 = _mm256_max_epi16(m_b23,new23); + a45 = _mm256_max_epi16(m_b45,new45); + a67 = _mm256_max_epi16(m_b67,new67); + + alpha_max = _mm256_max_epi16(a01,a23); + alpha_max = _mm256_max_epi16(alpha_max,a45); + alpha_max = _mm256_max_epi16(alpha_max,a67); + alpha_max = _mm256_max_epi16(alpha_max,_mm256_permutevar8x32_epi32(alpha_max,_mm256_set_epi32(3,2,1,0,7,6,5,4))); + + +#endif #elif defined(__arm__) m_b0 = vqaddq_s16(alpha_ptr[1],*m11p); // m11 m_b4 = 
vqsubq_s16(alpha_ptr[1],*m11p); // m00=-m11 @@ -383,9 +454,15 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s #endif alpha_ptr+=8; + //#ifdef __AVX2__ +#if 0 + alpha_ptr256+=4; +#endif m11p++; m10p++; #if defined(__x86_64__) || defined(__i386__) + //#ifndef __AVX2__ +#if 1 alpha_ptr[0] = _mm_subs_epi16(a0,alpha_max); alpha_ptr[1] = _mm_subs_epi16(a1,alpha_max); alpha_ptr[2] = _mm_subs_epi16(a2,alpha_max); @@ -394,6 +471,18 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s alpha_ptr[5] = _mm_subs_epi16(a5,alpha_max); alpha_ptr[6] = _mm_subs_epi16(a6,alpha_max); alpha_ptr[7] = _mm_subs_epi16(a7,alpha_max); +#else + + a01 = _mm256_subs_epi16(a01,alpha_max); + a23 = _mm256_subs_epi16(a23,alpha_max); + a45 = _mm256_subs_epi16(a45,alpha_max); + a67 = _mm256_subs_epi16(a67,alpha_max); + + alpha_ptr256[0] = _mm256_permute2x128_si256(a01,a23,0x20); //a02 + alpha_ptr256[1] = _mm256_permute2x128_si256(a01,a23,0x13); //a13 + alpha_ptr256[2] = _mm256_permute2x128_si256(a45,a67,0x02); //a64 + alpha_ptr256[3] = _mm256_permute2x128_si256(a45,a67,0x31); //a75 +#endif #elif defined(__arm__) alpha_ptr[0] = vqsubq_s16(a0,alpha_max); alpha_ptr[1] = vqsubq_s16(a1,alpha_max); @@ -488,8 +577,12 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh // fprintf(fdsse4,"beta init: offset8 %d\n",offset8_flag); m11=(int16_t)m_11[2+frame_length]; + //#ifndef __AVX2__ +#if 1 m10=(int16_t)m_10[2+frame_length]; - +#else + m10=-(int16_t)m_10[2+frame_length]; +#endif #ifdef DEBUG_LOGMAP fprintf(fdsse4,"m11,m10 %d,%d\n",m11,m10); #endif @@ -643,6 +736,9 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh m11_128=((__m128i*)m_11)[k]; m10_128=((__m128i*)m_10)[k]; + + //#ifndef __AVX2__ +#if 1 m_b0 = _mm_adds_epi16(beta_ptr[4],m11_128); //m11 m_b1 = _mm_subs_epi16(beta_ptr[4],m11_128); //m00 m_b2 = _mm_subs_epi16(beta_ptr[5],m10_128); //m01 @@ -652,6 +748,7 @@ void 
compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh m_b6 = _mm_subs_epi16(beta_ptr[7],m11_128); //m00 m_b7 = _mm_adds_epi16(beta_ptr[7],m11_128); //m11 + new0 = _mm_subs_epi16(beta_ptr[0],m11_128); //m00 new1 = _mm_adds_epi16(beta_ptr[0],m11_128); //m11 new2 = _mm_adds_epi16(beta_ptr[1],m10_128); //m10 @@ -661,8 +758,29 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh new6 = _mm_adds_epi16(beta_ptr[3],m11_128); //m11 new7 = _mm_subs_epi16(beta_ptr[3],m11_128); //m00 +#else + b01=_mm256_load_si256(&((__m256i*)beta_ptr)[0]); + b23=_mm256_load_si256(&((__m256i*)beta_ptr)[1]); + b45=_mm256_load_si256(&((__m256i*)beta_ptr)[2]); + b67=_mm256_load_si256(&((__m256i*)beta_ptr)[3]); + m11m10_256 = _mm256_insertf128_si256(m11m10_256,m11_128,0); + m11m10_256 = _mm256_insertf128_si256(m11m10_256,m10_128,1); + + + m_b02 = _mm256_adds_epi16(b45,m11m10_256); //negative m10 + m_b13 = _mm256_subs_epi16(b45,m11m10_256); //negative m10 + m_b64 = _mm256_subs_epi16(b67,m11m10_256); //negative m10 + m_b75 = _mm256_adds_epi16(b67,m11m10_256); //negative m10 + new02 = _mm256_subs_epi16(b01,m11m10_256); //negative m10 + new13 = _mm256_adds_epi16(b01,m11m10_256); //negative m10 + new64 = _mm256_adds_epi16(b23,m11m10_256); //negative m10 + new75 = _mm256_subs_epi16(b23,m11m10_256); //negative m10 +#endif + beta_ptr-=8; + //#ifndef __AVX2__ +#if 1 beta_ptr[0] = _mm_max_epi16(m_b0,new0); beta_ptr[1] = _mm_max_epi16(m_b1,new1); beta_ptr[2] = _mm_max_epi16(m_b2,new2); @@ -688,6 +806,28 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh beta_ptr[5] = _mm_subs_epi16(beta_ptr[5],beta_max); beta_ptr[6] = _mm_subs_epi16(beta_ptr[6],beta_max); beta_ptr[7] = _mm_subs_epi16(beta_ptr[7],beta_max); +#else + b02 = _mm256_max_epi16(m_b02,new02); + b13 = _mm256_max_epi16(m_b13,new13); + b64 = _mm256_max_epi16(m_b64,new64); + b75 = _mm256_max_epi16(m_b75,new75); + + beta_max = _mm256_max_epi16(b02,b13); + beta_max = 
_mm256_max_epi16(beta_max,b64); + beta_max = _mm256_max_epi16(beta_max,b75); + beta_max = _mm256_max_epi16(beta_max,_mm256_permutevar8x32_epi32(beta_max,_mm256_set_epi32(3,2,1,0,7,6,5,4))); + + b02 = _mm256_subs_epi16(b02,beta_max); + b13 = _mm256_subs_epi16(b13,beta_max); + b64 = _mm256_subs_epi16(b64,beta_max); + b75 = _mm256_subs_epi16(b75,beta_max); + + ((__m256i*)beta_ptr)[0] = _mm256_permute2x128_si256(b02,b13,0x02); //b01 + ((__m256i*)beta_ptr)[1] = _mm256_permute2x128_si256(b02,b13,0x31); //b23 + ((__m256i*)beta_ptr)[2] = _mm256_permute2x128_si256(b64,b75,0x13); //b45 + ((__m256i*)beta_ptr)[3] = _mm256_permute2x128_si256(b64,b75,0x20); //b67 +#endif + #elif defined(__arm__) m11_128=((int16x8_t*)m_11)[k]; m10_128=((int16x8_t*)m_10)[k]; @@ -820,6 +960,9 @@ void compute_ext16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext, print_shorts("b6:",&beta_ptr[6]); print_shorts("b7:",&beta_ptr[7]); */ + + //#ifndef __AVX2__ +#if 1 m00_4 = _mm_adds_epi16(alpha_ptr[7],beta_ptr[3]); //ALPHA_BETA_4m00; m11_4 = _mm_adds_epi16(alpha_ptr[7],beta_ptr[7]); //ALPHA_BETA_4m11; m00_3 = _mm_adds_epi16(alpha_ptr[6],beta_ptr[7]); //ALPHA_BETA_3m00; @@ -836,6 +979,32 @@ void compute_ext16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext, m10_2 = _mm_adds_epi16(alpha_ptr[3],beta_ptr[5]); //ALPHA_BETA_2m10; m10_1 = _mm_adds_epi16(alpha_ptr[2],beta_ptr[1]); //ALPHA_BETA_1m10; m01_1 = _mm_adds_epi16(alpha_ptr[2],beta_ptr[5]); //ALPHA_BETA_1m01; +#else + + + m00_1 = _mm_adds_epi16(alpha_ptr[0],beta_ptr[0]); //ALPHA_BETA_1m00; + m10_1 = _mm_adds_epi16(alpha_ptr[2],beta_ptr[1]); //ALPHA_BETA_1m10; + m11_1 = _mm_adds_epi16(alpha_ptr[0],beta_ptr[4]); //ALPHA_BETA_1m11; + m01_1 = _mm_adds_epi16(alpha_ptr[2],beta_ptr[5]); //ALPHA_BETA_1m01; + + m11_2 = _mm_adds_epi16(alpha_ptr[1],beta_ptr[0]); //ALPHA_BETA_2m11; + m01_2 = _mm_adds_epi16(alpha_ptr[3],beta_ptr[1]); //ALPHA_BETA_2m01; + m00_2 = _mm_adds_epi16(alpha_ptr[1],beta_ptr[4]); //ALPHA_BETA_2m00; + m10_2 = 
_mm_adds_epi16(alpha_ptr[3],beta_ptr[5]); //ALPHA_BETA_2m10; + + m11_3 = _mm_adds_epi16(alpha_ptr[6],beta_ptr[3]); //ALPHA_BETA_3m11; + m01_3 = _mm_adds_epi16(alpha_ptr[4],beta_ptr[2]); //ALPHA_BETA_3m01; + m00_3 = _mm_adds_epi16(alpha_ptr[6],beta_ptr[7]); //ALPHA_BETA_3m00; + m10_3 = _mm_adds_epi16(alpha_ptr[4],beta_ptr[6]); //ALPHA_BETA_3m10; + + m00_4 = _mm_adds_epi16(alpha_ptr[7],beta_ptr[3]); //ALPHA_BETA_4m00; + m10_4 = _mm_adds_epi16(alpha_ptr[5],beta_ptr[2]); //ALPHA_BETA_4m10; + m11_4 = _mm_adds_epi16(alpha_ptr[7],beta_ptr[7]); //ALPHA_BETA_4m11; + m01_4 = _mm_adds_epi16(alpha_ptr[5],beta_ptr[6]); //ALPHA_BETA_4m01; + + +#endif + /* print_shorts("m11_1:",&m11_1); print_shorts("m11_2:",&m11_2); @@ -1030,19 +1199,19 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y, n is the size in bits of the coded block, with the tail */ - llr_t systematic0[n+16] __attribute__ ((aligned(16))); - llr_t systematic1[n+16] __attribute__ ((aligned(16))); - llr_t systematic2[n+16] __attribute__ ((aligned(16))); - llr_t yparity1[n+16] __attribute__ ((aligned(16))); - llr_t yparity2[n+16] __attribute__ ((aligned(16))); + llr_t systematic0[n+16] __attribute__ ((aligned(32))); + llr_t systematic1[n+16] __attribute__ ((aligned(32))); + llr_t systematic2[n+16] __attribute__ ((aligned(32))); + llr_t yparity1[n+16] __attribute__ ((aligned(32))); + llr_t yparity2[n+16] __attribute__ ((aligned(32))); - llr_t ext[n+128] __attribute__((aligned(16))); - llr_t ext2[n+128] __attribute__((aligned(16))); + llr_t ext[n+128] __attribute__((aligned(32))); + llr_t ext2[n+128] __attribute__((aligned(32))); - llr_t alpha[(n+16)*8] __attribute__ ((aligned(16))); - llr_t beta[(n+16)*8] __attribute__ ((aligned(16))); - llr_t m11[n+16] __attribute__ ((aligned(16))); - llr_t m10[n+16] __attribute__ ((aligned(16))); + llr_t alpha[(n+16)*8] __attribute__ ((aligned(32))); + llr_t beta[(n+16)*8] __attribute__ ((aligned(32))); + llr_t m11[n+32] __attribute__ ((aligned(32))); + llr_t m10[n+32] 
__attribute__ ((aligned(32))); int *pi2_p,*pi4_p,*pi5_p,*pi6_p; diff --git a/openair1/PHY/LTE_REFSIG/lte_gold.c b/openair1/PHY/LTE_REFSIG/lte_gold.c index 567851a51a6cd376f2cd6841782dd7fb374b402d..bfd67166408a9e3a1e777cd30ffcbd6118812a9b 100644 --- a/openair1/PHY/LTE_REFSIG/lte_gold.c +++ b/openair1/PHY/LTE_REFSIG/lte_gold.c @@ -61,21 +61,18 @@ void lte_gold(LTE_DL_FRAME_PARMS *frame_parms,uint32_t lte_gold_table[20][2][14] x2 = Ncp + (Nid_cell<<1) + - (((1+(Nid_cell<<1))*(1 + (((frame_parms->Ncp==0)?4:3)*l) + (7*(1+ns))))<<10); //cinit + (((1+(Nid_cell<<1))*(1 + (((frame_parms->Ncp==0)?4:3)*l) + (7*(1+ns))))<<10); //cinit //x2 = frame_parms->Ncp + (Nid_cell<<1) + (1+(Nid_cell<<1))*(1 + (3*l) + (7*(1+ns))); //cinit //n = 0 - // printf("cinit (ns %d, l %d) => %d\n",ns,l,x2); x1 = 1+ (1<<31); x2=x2 ^ ((x2 ^ (x2>>1) ^ (x2>>2) ^ (x2>>3))<<31); // skip first 50 double words (1600 bits) - //printf("n=0 : x1 %x, x2 %x\n",x1,x2); for (n=1; n<50; n++) { x1 = (x1>>1) ^ (x1>>4); x1 = x1 ^ (x1<<31) ^ (x1<<28); x2 = (x2>>1) ^ (x2>>2) ^ (x2>>3) ^ (x2>>4); x2 = x2 ^ (x2<<31) ^ (x2<<30) ^ (x2<<29) ^ (x2<<28); - // printf("x1 : %x, x2 : %x\n",x1,x2); } for (n=0; n<14; n++) { @@ -84,7 +81,6 @@ void lte_gold(LTE_DL_FRAME_PARMS *frame_parms,uint32_t lte_gold_table[20][2][14] x2 = (x2>>1) ^ (x2>>2) ^ (x2>>3) ^ (x2>>4); x2 = x2 ^ (x2<<31) ^ (x2<<30) ^ (x2<<29) ^ (x2<<28); lte_gold_table[ns][l][n] = x1^x2; - // printf("n=%d : c %x\n",n,x1^x2); } } diff --git a/openair1/PHY/LTE_TRANSPORT/dlsch_decoding.c b/openair1/PHY/LTE_TRANSPORT/dlsch_decoding.c index 66df9dbb78204837036b3f15d3ad8c771fea612c..801c1a4f93ce3c93657c7e742d88dcc7fb512208 100644 --- a/openair1/PHY/LTE_TRANSPORT/dlsch_decoding.c +++ b/openair1/PHY/LTE_TRANSPORT/dlsch_decoding.c @@ -446,7 +446,8 @@ uint32_t dlsch_decoding(PHY_VARS_UE *phy_vars_ue, printf("\n"); */ -#ifndef __AVX2__ + //#ifndef __AVX2__ +#if 1 if (err_flag == 0) { start_meas(dlsch_turbo_decoding_stats); diff --git 
a/openair1/PHY/LTE_TRANSPORT/dlsch_demodulation.c b/openair1/PHY/LTE_TRANSPORT/dlsch_demodulation.c index ad464b16babd2b06c64035f0a40379bf73da58fe..264a3c3610c08e9a9438a271ca3e5da225f1fc82 100644 --- a/openair1/PHY/LTE_TRANSPORT/dlsch_demodulation.c +++ b/openair1/PHY/LTE_TRANSPORT/dlsch_demodulation.c @@ -1898,17 +1898,17 @@ void dlsch_channel_compensation_TM3(LTE_DL_FRAME_PARMS *frame_parms, for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) { - dl_ch0_128 = (__m128i *)&dl_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*12]; - dl_ch1_128 = (__m128i *)&dl_ch_estimates_ext[2+aarx][symbol*frame_parms->N_RB_DL*12]; + dl_ch0_128 = (__m128i *)&dl_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*12]; // hr,0 + dl_ch1_128 = (__m128i *)&dl_ch_estimates_ext[2+aarx][symbol*frame_parms->N_RB_DL*12]; // hr,1 dl_ch_mag0_128 = (__m128i *)&dl_ch_mag0[aarx][symbol*frame_parms->N_RB_DL*12]; dl_ch_mag0_128b = (__m128i *)&dl_ch_magb0[aarx][symbol*frame_parms->N_RB_DL*12]; dl_ch_mag1_128 = (__m128i *)&dl_ch_mag1[aarx][symbol*frame_parms->N_RB_DL*12]; dl_ch_mag1_128b = (__m128i *)&dl_ch_magb1[aarx][symbol*frame_parms->N_RB_DL*12]; - rxdataF128 = (__m128i *)&rxdataF_ext[aarx][symbol*frame_parms->N_RB_DL*12]; - rxdataF_comp0_128 = (__m128i *)&rxdataF_comp0[aarx][symbol*frame_parms->N_RB_DL*12]; - rxdataF_comp1_128 = (__m128i *)&rxdataF_comp1[aarx][symbol*frame_parms->N_RB_DL*12]; + rxdataF128 = (__m128i *)&rxdataF_ext[aarx][symbol*frame_parms->N_RB_DL*12]; // yr + rxdataF_comp0_128 = (__m128i *)&rxdataF_comp0[aarx][symbol*frame_parms->N_RB_DL*12]; // yr,0 = yr * conj(hr,0) + rxdataF_comp1_128 = (__m128i *)&rxdataF_comp1[aarx][symbol*frame_parms->N_RB_DL*12]; // yr,1 = yr * conj(hr,1) for (rb=0; rb<nb_rb; rb++) {