From 8c5e8126df86d625a3ec448542a24380e30c9b7d Mon Sep 17 00:00:00 2001 From: Florian Kaltenberger <florian.kaltenberger@eurecom.fr> Date: Mon, 8 Jun 2015 12:45:16 +0000 Subject: [PATCH] Added support for ARM NEON, lots of changes in openair1 and some in cmake_targets git-svn-id: http://svn.eurecom.fr/openair4G/trunk@7543 818b1a75-f10b-46b9-bf7c-635c3b92a50f --- cmake_targets/CMakeLists.txt | 9 +- cmake_targets/lte-simulators/CMakeLists.txt | 1 + openair1/PHY/CODING/3gpplte.c | 5 +- openair1/PHY/CODING/3gpplte_sse.c | 880 +++--- .../CODING/3gpplte_turbo_decoder_sse_16bit.c | 497 +++- .../CODING/3gpplte_turbo_decoder_sse_8bit.c | 594 +++- openair1/PHY/CODING/Makefile | 24 +- openair1/PHY/CODING/ccoding_byte_lte.c | 35 +- openair1/PHY/CODING/defs.h | 4 +- openair1/PHY/CODING/viterbi.c | 220 +- openair1/PHY/CODING/viterbi_lte.c | 264 +- openair1/PHY/LTE_ESTIMATION/filt96_32.h | 112 +- .../PHY/LTE_ESTIMATION/freq_equalization.c | 22 +- .../lte_dl_channel_estimation.c | 133 +- .../PHY/LTE_ESTIMATION/lte_est_freq_offset.c | 31 +- .../PHY/LTE_ESTIMATION/lte_sync_timefreq.c | 8 +- .../PHY/LTE_ESTIMATION/lte_ue_measurements.c | 90 +- .../lte_ul_channel_estimation.c | 92 +- openair1/PHY/LTE_REFSIG/lte_dl_cell_spec.c | 4 +- openair1/PHY/LTE_TRANSPORT/dci.c | 152 +- openair1/PHY/LTE_TRANSPORT/defs.h | 2 +- openair1/PHY/LTE_TRANSPORT/dlsch_coding.c | 18 +- .../PHY/LTE_TRANSPORT/dlsch_demodulation.c | 1274 ++++---- .../PHY/LTE_TRANSPORT/dlsch_llr_computation.c | 2291 +++++++------- openair1/PHY/LTE_TRANSPORT/dlsch_modulation.c | 4 +- openair1/PHY/LTE_TRANSPORT/pbch.c | 66 +- openair1/PHY/LTE_TRANSPORT/pmch.c | 173 +- openair1/PHY/LTE_TRANSPORT/prach.c | 5 +- openair1/PHY/LTE_TRANSPORT/proto.h | 4 +- .../PHY/LTE_TRANSPORT/ulsch_demodulation.c | 428 ++- openair1/PHY/LTE_TRANSPORT/ulsch_modulation.c | 23 +- openair1/PHY/MODULATION/slot_fep.c | 9 +- openair1/PHY/MODULATION/ul_7_5_kHz.c | 446 +-- openair1/PHY/TOOLS/cdot_prod.c | 49 +- openair1/PHY/TOOLS/cmult_sv.c | 355 +-- openair1/PHY/TOOLS/cmult_vv.c | 1787 +---------- openair1/PHY/TOOLS/defs.h | 47 +- openair1/PHY/TOOLS/lte_dfts.c | 2647 ++++++++++------- openair1/PHY/TOOLS/signal_energy.c | 132 +- openair1/PHY/TOOLS/time_meas.c | 9 - openair1/PHY/TOOLS/time_meas.h | 56 +- openair1/PHY/TOOLS/vars.h | 2 +- openair1/PHY/defs.h | 21 +- openair1/SIMULATION/LTE_PHY/dlsim.c | 24 +- openair1/SIMULATION/TOOLS/multipath_channel.c | 444 +-- 45 files changed, 6993 insertions(+), 6500 deletions(-) diff --git a/cmake_targets/CMakeLists.txt b/cmake_targets/CMakeLists.txt index ebe5c52243..9537477913 100644 --- a/cmake_targets/CMakeLists.txt +++ b/cmake_targets/CMakeLists.txt @@ -126,7 +126,7 @@ add_list_string_option(CMAKE_BUILD_TYPE "RelWithDebInfo" "Choose the type of bui Message("Architecture is ${CMAKE_SYSTEM_PROCESSOR}") if (CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7l") - set(C_FLAGS_PROCESSOR "-mfloat-abi=softfp -mfpu=neon") + set(C_FLAGS_PROCESSOR "-gdwarf-2 -mfloat-abi=hard -mfpu=neon -lgcc -lrt") else (CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7l") set(C_FLAGS_PROCESSOR "-msse4.2") endif() @@ -140,8 +140,8 @@ set(CMAKE_C_FLAGS # set a flag for changes in the source code # these changes are related to hardcoded path to include .h files add_definitions(-DCMAKER) -set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS} -ggdb -DMALLOC_CHECK_=3") -set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS} -ggdb -DMALLOC_CHECK_=3 -O2") +set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS} -g -DMALLOC_CHECK_=3") +set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS} -g -DMALLOC_CHECK_=3 -O2") # Below has been 
put in comment because does not work with # SVN authentication. @@ -778,7 +778,6 @@ set(PHY_SRC ${OPENAIR1_DIR}/PHY/TOOLS/log2_approx.c ${OPENAIR1_DIR}/PHY/TOOLS/cmult_sv.c ${OPENAIR1_DIR}/PHY/TOOLS/cmult_vv.c - ${OPENAIR1_DIR}/PHY/TOOLS/cadd_vv.c ${OPENAIR1_DIR}/PHY/TOOLS/cdot_prod.c ${OPENAIR1_DIR}/PHY/TOOLS/signal_energy.c ${OPENAIR1_DIR}/PHY/TOOLS/dB_routines.c @@ -1692,7 +1691,7 @@ foreach(myExe dlsim ulsim pbchsim scansim mbmssim pdcchsim pucchsim prachsim syn ${XFORMS_SOURCE} ) target_link_libraries (${myExe} - -Wl,--start-group SIMU UTIL SCHED_LIB PHY LFDS MSC ${ITTI_LIB} -Wl,--end-group + -Wl,--start-group SIMU UTIL SCHED_LIB PHY LFDS ${ITTI_LIB} -Wl,--end-group pthread m rt ${CONFIG_LIBRARIES} ${ATLAS_LIBRARIES} ${XFORMS_LIBRARIES} ) endforeach(myExe) diff --git a/cmake_targets/lte-simulators/CMakeLists.txt b/cmake_targets/lte-simulators/CMakeLists.txt index 7e38c9d594..50a473dd4b 100644 --- a/cmake_targets/lte-simulators/CMakeLists.txt +++ b/cmake_targets/lte-simulators/CMakeLists.txt @@ -10,5 +10,6 @@ set(RANDOM_BF False) set(PBS_SIM False) set(PERFECT_CE False) set(NAS_UE False) +set(MESSAGE_CHART_GENERATOR False) include(${CMAKE_CURRENT_SOURCE_DIR}/../CMakeLists.txt) diff --git a/openair1/PHY/CODING/3gpplte.c b/openair1/PHY/CODING/3gpplte.c index 0f7dbf7468..fd33bf999c 100644 --- a/openair1/PHY/CODING/3gpplte.c +++ b/openair1/PHY/CODING/3gpplte.c @@ -31,8 +31,9 @@ author: raymond.knopp@eurecom.fr date: 10.2009 */ -#include "defs.h" -//#include "lte_interleaver_inline.h" +#ifndef TC_MAIN +//#include "defs.h" +#endif #include "extern_3GPPinterleaver.h" diff --git a/openair1/PHY/CODING/3gpplte_sse.c b/openair1/PHY/CODING/3gpplte_sse.c index 850a8ecb5f..c66a42016d 100755 --- a/openair1/PHY/CODING/3gpplte_sse.c +++ b/openair1/PHY/CODING/3gpplte_sse.c @@ -1,349 +1,531 @@ -/******************************************************************************* - OpenAirInterface - Copyright(c) 1999 - 2014 Eurecom - - OpenAirInterface is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - - OpenAirInterface is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with OpenAirInterface.The full GNU General Public License is - included in this distribution in the file called "COPYING". If not, - see <http://www.gnu.org/licenses/>. 
- - Contact Information - OpenAirInterface Admin: openair_admin@eurecom.fr - OpenAirInterface Tech : openair_tech@eurecom.fr - OpenAirInterface Dev : openair4g-devel@eurecom.fr - - Address : Eurecom, Campus SophiaTech, 450 Route des Chappes, CS 50193 - 06904 Biot Sophia Antipolis cedex, FRANCE - - *******************************************************************************/ -/* file: 3gpplte_sse.c - purpose: Encoding routines for implementing Turbo-coded (DLSCH) transport channels from 36-212, V8.6 2009-03 - author: Laurent Thomas - maintainer: raymond.knopp@eurecom.fr - date: 09.2012 -*/ -#include "defs.h" -#include "extern_3GPPinterleaver.h" -#include <stdlib.h> - -#include "PHY/sse_intrin.h" - -//#define DEBUG_TURBO_ENCODER 1 -#define CALLGRIND 1 -unsigned short threegpplte_interleaver_output; -unsigned long long threegpplte_interleaver_tmp; - -struct treillis { - union { - __m64 systematic_64[3]; - char systematic_8[24]; - }; - union { - __m64 parity1_64[3]; - char parity1_8[24]; - }; - union { - __m64 parity2_64[3]; - char parity2_8[24]; - }; - int exit_state; -} __attribute__ ((aligned(64))); - -struct treillis all_treillis[8][256]; -int all_treillis_initialized=0; - -static inline unsigned char threegpplte_rsc(unsigned char input,unsigned char *state) -{ - unsigned char output; - output = (input ^ (*state>>2) ^ (*state>>1))&1; - *state = (((input<<2)^(*state>>1))^((*state>>1)<<2)^((*state)<<2))&7; - return(output); -} - -static inline void threegpplte_rsc_termination(unsigned char *x,unsigned char *z,unsigned char *state) -{ - *z = ((*state>>2) ^ (*state)) &1; - *x = ((*state) ^ (*state>>1)) &1; - *state = (*state)>>1; -} - -void treillis_table_init(void) -{ - //struct treillis t[][]=all_treillis; - //t=memalign(16,sizeof(struct treillis)*8*256); - int i, j,b; - unsigned char v, current_state; - - // clear all_treillis - for (i=0; i<8; i++) - bzero( all_treillis[i], sizeof(all_treillis[0]) ); - - for (i=0; i<8; i++) { //all possible initial states - for (j=0; j<=255; j++) { // all possible values of a byte - current_state=i; - - for (b=0; b<8 ; b++ ) { // pre-compute the image of the byte j in _m128i vector right place - all_treillis[i][j].systematic_8[b*3]= (j&(1<<(7-b)))>>(7-b); - v=threegpplte_rsc( all_treillis[i][j].systematic_8[b*3] , - ¤t_state); - all_treillis[i][j].parity1_8[b*3+1]=v; // for the yparity1 - all_treillis[i][j].parity2_8[b*3+2]=v; // for the yparity2 - } - - all_treillis[i][j].exit_state=current_state; - } - } - - all_treillis_initialized=1; - return ; -} - - -char interleave_compact_byte(short * base_interleaver,unsigned char * input, unsigned char * output, int n) -{ - - char expandInput[768*8] __attribute__((aligned(16))); - int i,loop=n>>4; - __m128i *i_128=(__m128i *)input, *o_128=(__m128i*)expandInput; - __m128i tmp1, tmp2, tmp3, tmp4; - __m128i BIT_MASK = _mm_set_epi8( 0b00000001, - 0b00000010, - 0b00000100, - 0b00001000, - 0b00010000, - 0b00100000, - 0b01000000, - 0b10000000, - 0b00000001, - 0b00000010, - 0b00000100, - 0b00001000, - 0b00010000, - 0b00100000, - 0b01000000, - 0b10000000); - - if ((n&15) > 0) - loop++; - - for (i=0; i<loop ; i++ ) { - /* int cur_byte=i<<3; */ - /* for (b=0;b<8;b++) */ - /* expandInput[cur_byte+b] = (input[i]&(1<<(7-b)))>>(7-b); */ - tmp1=_mm_load_si128(i_128++); - tmp2=_mm_unpacklo_epi8(tmp1,tmp1); - tmp3=_mm_unpacklo_epi16(tmp2,tmp2); - tmp4=_mm_unpacklo_epi32(tmp3,tmp3); - *o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK); - - tmp4=_mm_unpackhi_epi32(tmp3,tmp3); - 
*o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);; - - tmp3=_mm_unpackhi_epi16(tmp2,tmp2); - tmp4=_mm_unpacklo_epi32(tmp3,tmp3); - *o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);; - - tmp4=_mm_unpackhi_epi32(tmp3,tmp3); - *o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);; - - tmp2=_mm_unpackhi_epi8(tmp1,tmp1); - tmp3=_mm_unpacklo_epi16(tmp2,tmp2); - tmp4=_mm_unpacklo_epi32(tmp3,tmp3); - *o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);; - - tmp4=_mm_unpackhi_epi32(tmp3,tmp3); - *o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);; - - tmp3=_mm_unpackhi_epi16(tmp2,tmp2); - tmp4=_mm_unpacklo_epi32(tmp3,tmp3); - *o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);; - - tmp4=_mm_unpackhi_epi32(tmp3,tmp3); - *o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);; - } - - short * ptr_intl=base_interleaver; - __m128i tmp; - int input_length_words=n>>1; - unsigned short * systematic2_ptr=(unsigned short *) output; - - // int j; - for ( i=0; i< input_length_words ; i ++ ) { - - // for (j=0;j<16;j++) printf("%d(%d).",ptr_intl[j],expandInput[ptr_intl[j]]); - // printf("\n"); - tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],7); - tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],6); - tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],5); - tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],4); - tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],3); - tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],2); - tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],1); - tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],0); - tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],8+7); - tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],8+6); - tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],8+5); - tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],8+4); - tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],8+3); - tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],8+2); - tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],8+1); - tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],8+0); - *systematic2_ptr++=(unsigned short)_mm_movemask_epi8(tmp); - } - - return n; -} - - - -#define _mm_expand_si128(xmmx, out, bit_mask) \ - { \ - __m128i loc_mm; \ - loc_mm=(xmmx); \ - loc_mm=_mm_and_si128(loc_mm,bit_mask); \ - out=_mm_cmpeq_epi8(loc_mm,bit_mask); \ - } - -void threegpplte_turbo_encoder(unsigned char *input, - unsigned short input_length_bytes, - unsigned char *output, - unsigned char F, - unsigned short interleaver_f1, - unsigned short interleaver_f2) -{ - - int i; - unsigned char *x; - unsigned char state0=0,state1=0; - unsigned short input_length_bits = input_length_bytes<<3; - short * base_interleaver; - - if ( all_treillis_initialized == 0 ) - treillis_table_init(); - - // look for f1 and f2 precomputed interleaver values - for (i=0; i < 188 && f1f2mat[i].nb_bits != input_length_bits; i++); - - if ( i == 188 ) { - msg("Illegal frame length!\n"); - return; - } else { - base_interleaver=il_tb+f1f2mat[i].beg_index; - } - - - unsigned char systematic2[768]; - interleave_compact_byte(base_interleaver,input,systematic2,input_length_bytes); - - __m64 *ptr_output=(__m64*) output; - unsigned char cur_s1, cur_s2; - int code_rate; - - for ( state0=state1=i=0 ; i<input_length_bytes; i++ ) { - cur_s1=input[i]; - cur_s2=systematic2[i]; - - for ( code_rate=0; code_rate<3; code_rate++) { - *ptr_output++ = _mm_add_pi8(all_treillis[state0][cur_s1].systematic_64[code_rate], - _mm_add_pi8(all_treillis[state0][cur_s1].parity1_64[code_rate], - 
all_treillis[state1][cur_s2].parity2_64[code_rate])); - } - - state0=all_treillis[state0][cur_s1].exit_state; - state1=all_treillis[state1][cur_s2].exit_state; - } - - x=output+(input_length_bits*3); - - // Trellis termination - threegpplte_rsc_termination(&x[0],&x[1],&state0); -#ifdef DEBUG_TURBO_ENCODER - printf("term: x0 %d, x1 %d, state0 %d\n",x[0],x[1],state0); -#endif //DEBUG_TURBO_ENCODER - - threegpplte_rsc_termination(&x[2],&x[3],&state0); -#ifdef DEBUG_TURBO_ENCODER - printf("term: x0 %d, x1 %d, state0 %d\n",x[2],x[3],state0); -#endif //DEBUG_TURBO_ENCODER - - threegpplte_rsc_termination(&x[4],&x[5],&state0); -#ifdef DEBUG_TURBO_ENCODER - printf("term: x0 %d, x1 %d, state0 %d\n",x[4],x[5],state0); -#endif //DEBUG_TURBO_ENCODER - - threegpplte_rsc_termination(&x[6],&x[7],&state1); - -#ifdef DEBUG_TURBO_ENCODER - printf("term: x0 %d, x1 %d, state1 %d\n",x[6],x[7],state1); -#endif //DEBUG_TURBO_ENCODER - threegpplte_rsc_termination(&x[8],&x[9],&state1); -#ifdef DEBUG_TURBO_ENCODER - printf("term: x0 %d, x1 %d, state1 %d\n",x[8],x[9],state1); -#endif //DEBUG_TURBO_ENCODER - threegpplte_rsc_termination(&x[10],&x[11],&state1); - -#ifdef DEBUG_TURBO_ENCODER - printf("term: x0 %d, x1 %d, state1 %d\n",x[10],x[11],state1); -#endif //DEBUG_TURBO_ENCODER - - _mm_empty(); - _m_empty(); -} - - - -#ifdef MAIN - -#define INPUT_LENGTH 5 -#define F1 3 -#define F2 10 - -int main(int argc,char **argv) -{ - - unsigned char input[INPUT_LENGTH],state,state2; - unsigned char output[12+(3*(INPUT_LENGTH<<3))],x,z; - int i; - unsigned char out; - - for (state=0; state<8; state++) { - for (i=0; i<2; i++) { - state2=state; - out = threegpplte_rsc(i,&state2); - printf("State (%d->%d) : (%d,%d)\n",state,state2,i,out); - } - } - - printf("\n"); - - for (state=0; state<8; state++) { - - state2=state; - threegpplte_rsc_termination(&x,&z,&state2); - printf("Termination: (%d->%d) : (%d,%d)\n",state,state2,x,z); - } - - for (i=0; i<5; i++) { - input[i] = i*219; - printf("Input %d : %x\n",i,input[i]); - } - - threegpplte_turbo_encoder(&input[0], - 5, - &output[0], - F1, - F2); - return(0); -} - -#endif // MAIN +/******************************************************************************* + OpenAirInterface + Copyright(c) 1999 - 2014 Eurecom + + OpenAirInterface is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + + OpenAirInterface is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OpenAirInterface.The full GNU General Public License is + included in this distribution in the file called "COPYING". If not, + see <http://www.gnu.org/licenses/>. 
+ + Contact Information + OpenAirInterface Admin: openair_admin@eurecom.fr + OpenAirInterface Tech : openair_tech@eurecom.fr + OpenAirInterface Dev : openair4g-devel@eurecom.fr + + Address : Eurecom, Campus SophiaTech, 450 Route des Chappes, CS 50193 - 06904 Biot Sophia Antipolis cedex, FRANCE + + *******************************************************************************/ +/* file: 3gpplte_sse.c + purpose: Encoding routines for implementing Turbo-coded (DLSCH) transport channels from 36-212, V8.6 2009-03 + author: Laurent Thomas + maintainer: raymond.knopp@eurecom.fr + date: 09.2012 +*/ +#ifndef TC_MAIN +#include "defs.h" +#include "extern_3GPPinterleaver.h" +#else +#include "vars.h" +#endif +#include <stdio.h> +#include <string.h> +#include <stdlib.h> + +#include "PHY/sse_intrin.h" + +#define print_bytes(s,x) printf("%s %x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x\n",s,(x)[0],(x)[1],(x)[2],(x)[3],(x)[4],(x)[5],(x)[6],(x)[7],(x)[8],(x)[9],(x)[10],(x)[11],(x)[12],(x)[13],(x)[14],(x)[15]) +#define print_shorts(s,x) printf("%s %x,%x,%x,%x,%x,%x,%x,%x\n",s,(x)[0],(x)[1],(x)[2],(x)[3],(x)[4],(x)[5],(x)[6],(x)[7]) +#define print_ints(s,x) printf("%s %x %x %x %x\n",s,(x)[0],(x)[1],(x)[2],(x)[3]) + + +//#define DEBUG_TURBO_ENCODER 1 +#define CALLGRIND 1 +unsigned short threegpplte_interleaver_output; +unsigned long long threegpplte_interleaver_tmp; + +#if defined(__x86_64__) || defined(__i386__) +struct treillis { + union { + __m64 systematic_64[3]; + char systematic_8[24]; + }; + union { + __m64 parity1_64[3]; + char parity1_8[24]; + }; + union { + __m64 parity2_64[3]; + char parity2_8[24]; + }; + int exit_state; +} __attribute__ ((aligned(64))); + +#elif defined(__arm__) + +struct treillis { + union { + uint8x8_t systematic_64[3]; + char systematic_8[24]; + }__attribute__((aligned(64))); + union { + uint8x8_t parity1_64[3]; + char parity1_8[24]; + }__attribute__((aligned(64))); + union { + uint8x8_t parity2_64[3]; + char parity2_8[24]; + }__attribute__((aligned(64))); + int exit_state; +}; +#endif + +struct treillis all_treillis[8][256]; +int all_treillis_initialized=0; + +static inline unsigned char threegpplte_rsc(unsigned char input,unsigned char *state) +{ + unsigned char output; + output = (input ^ (*state>>2) ^ (*state>>1))&1; + *state = (((input<<2)^(*state>>1))^((*state>>1)<<2)^((*state)<<2))&7; + return(output); +} + +static inline void threegpplte_rsc_termination(unsigned char *x,unsigned char *z,unsigned char *state) +{ + *z = ((*state>>2) ^ (*state)) &1; + *x = ((*state) ^ (*state>>1)) &1; + *state = (*state)>>1; +} + +void treillis_table_init(void) +{ + //struct treillis t[][]=all_treillis; + //t=memalign(16,sizeof(struct treillis)*8*256); + int i, j,b; + unsigned char v, current_state; + + // clear all_treillis + for (i=0; i<8; i++) + bzero( all_treillis[i], sizeof(all_treillis[0]) ); + + for (i=0; i<8; i++) { //all possible initial states + for (j=0; j<=255; j++) { // all possible values of a byte + current_state=i; + + for (b=0; b<8 ; b++ ) { // pre-compute the image of the byte j in _m128i vector right place + all_treillis[i][j].systematic_8[b*3]= (j&(1<<(7-b)))>>(7-b); + v=threegpplte_rsc( all_treillis[i][j].systematic_8[b*3] , + ¤t_state); + all_treillis[i][j].parity1_8[b*3+1]=v; // for the yparity1 + all_treillis[i][j].parity2_8[b*3+2]=v; // for the yparity2 + } + + all_treillis[i][j].exit_state=current_state; + } + } + + all_treillis_initialized=1; + return ; +} + + +char interleave_compact_byte(short * base_interleaver,unsigned char * input, unsigned char * output, 
int n) +{ + + char expandInput[768*8] __attribute__((aligned(16))); + int i,loop=n>>4; +#if defined(__x86_64__) || defined(__i386__) + __m128i *i_128=(__m128i *)input, *o_128=(__m128i*)expandInput; + __m128i tmp1, tmp2, tmp3, tmp4; + __m128i BIT_MASK = _mm_set_epi8( 0b00000001, + 0b00000010, + 0b00000100, + 0b00001000, + 0b00010000, + 0b00100000, + 0b01000000, + 0b10000000, + 0b00000001, + 0b00000010, + 0b00000100, + 0b00001000, + 0b00010000, + 0b00100000, + 0b01000000, + 0b10000000); +#elif defined(__arm__) + uint8x16_t *i_128=(uint8x16_t *)input, *o_128=(uint8x16_t *)expandInput; + uint8x16_t tmp1,tmp2; + uint16x8_t tmp3; + uint32x4_t tmp4; + uint8x16_t and_tmp; + uint8x16_t BIT_MASK = { 0b10000000, + 0b01000000, + 0b00100000, + 0b00010000, + 0b00001000, + 0b00000100, + 0b00000010, + 0b00000001, + 0b10000000, + 0b01000000, + 0b00100000, + 0b00010000, + 0b00001000, + 0b00000100, + 0b00000010, + 0b00000001}; +#endif + if ((n&15) > 0) + loop++; + + for (i=0; i<loop ; i++ ) { + /* int cur_byte=i<<3; */ + /* for (b=0;b<8;b++) */ + /* expandInput[cur_byte+b] = (input[i]&(1<<(7-b)))>>(7-b); */ + +#if defined(__x86_64__) || defined(__i386__) + tmp1=_mm_load_si128(i_128++); + tmp2=_mm_unpacklo_epi8(tmp1,tmp1); + tmp3=_mm_unpacklo_epi16(tmp2,tmp2); + tmp4=_mm_unpacklo_epi32(tmp3,tmp3); + *o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK); + + tmp4=_mm_unpackhi_epi32(tmp3,tmp3); + *o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);; + + tmp3=_mm_unpackhi_epi16(tmp2,tmp2); + tmp4=_mm_unpacklo_epi32(tmp3,tmp3); + *o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);; + + tmp4=_mm_unpackhi_epi32(tmp3,tmp3); + *o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);; + + tmp2=_mm_unpackhi_epi8(tmp1,tmp1); + tmp3=_mm_unpacklo_epi16(tmp2,tmp2); + tmp4=_mm_unpacklo_epi32(tmp3,tmp3); + *o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);; + + tmp4=_mm_unpackhi_epi32(tmp3,tmp3); + *o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);; + + tmp3=_mm_unpackhi_epi16(tmp2,tmp2); + tmp4=_mm_unpacklo_epi32(tmp3,tmp3); + *o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);; + + tmp4=_mm_unpackhi_epi32(tmp3,tmp3); + *o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);; + +#elif defined(__arm__) + tmp1=vld1q_u8((uint8_t*)i_128); + //print_bytes("tmp1:",(uint8_t*)&tmp1); + + uint8x16x2_t temp1 = vzipq_u8(tmp1,tmp1); + tmp2 = temp1.val[0]; + + uint16x8x2_t temp2 = vzipq_u16((uint16x8_t)tmp2,(uint16x8_t)tmp2); + tmp3 = temp2.val[0]; + + uint32x4x2_t temp3 = vzipq_u32((uint32x4_t)tmp3,(uint32x4_t)tmp3); + tmp4 = temp3.val[0]; + //print_bytes("tmp4:",(uint8_t*)&tmp4); + + *o_128++=vceqq_u8(vandq_u8((uint8x16_t)tmp4,BIT_MASK),BIT_MASK); //1 + //print_bytes("o:",(uint8_t*)(o_128-1)); + + tmp4 = temp3.val[1]; + //print_bytes("tmp4:",(uint8_t*)&tmp4); + + *o_128++=vceqq_u8(vandq_u8((uint8x16_t)tmp4,BIT_MASK),BIT_MASK); //2 + //print_bytes("o:",(uint8_t*)(o_128-1)); + + tmp3 = temp2.val[1]; + temp3 = vzipq_u32((uint32x4_t)tmp3,(uint32x4_t)tmp3); + tmp4 = temp3.val[0]; + //print_bytes("tmp4:",(uint8_t*)&tmp4); + + *o_128++=vceqq_u8(vandq_u8((uint8x16_t)tmp4,BIT_MASK),BIT_MASK); //3 + //print_bytes("o:",(uint8_t*)(o_128-1)); + + tmp4 = temp3.val[1]; + //print_bytes("tmp4:",(uint8_t*)&tmp4); + + *o_128++=vceqq_u8(vandq_u8((uint8x16_t)tmp4,BIT_MASK),BIT_MASK); //4 + //and_tmp = vandq_u8((uint8x16_t)tmp4,BIT_MASK); print_bytes("and:",and_tmp); + //print_bytes("o:",(uint8_t*)(o_128-1)); + + + temp1 = vzipq_u8(tmp1,tmp1); + tmp2 = temp1.val[1]; + 
temp2 = vzipq_u16((uint16x8_t)tmp2,(uint16x8_t)tmp2); + tmp3 = temp2.val[0]; + temp3 = vzipq_u32((uint32x4_t)tmp3,(uint32x4_t)tmp3); + tmp4 = temp3.val[0]; + //print_bytes("tmp4:",(uint8_t*)&tmp4); + + *o_128++=vceqq_u8(vandq_u8((uint8x16_t)tmp4,BIT_MASK),BIT_MASK); //5 + //print_bytes("o:",(uint8_t*)(o_128-1)); + + tmp4 = temp3.val[1]; + //print_bytes("tmp4:",(uint8_t*)&tmp4); + + *o_128++=vceqq_u8(vandq_u8((uint8x16_t)tmp4,BIT_MASK),BIT_MASK); //6 + //print_bytes("o:",(uint8_t*)(o_128-1)); + + + temp2 = vzipq_u16((uint16x8_t)tmp2,(uint16x8_t)tmp2); + tmp3 = temp2.val[1]; + temp3 = vzipq_u32((uint32x4_t)tmp3,(uint32x4_t)tmp3); + tmp4 = temp3.val[0]; + //print_bytes("tmp4:",(uint8_t*)&tmp4); + + *o_128++=vceqq_u8(vandq_u8((uint8x16_t)tmp4,BIT_MASK),BIT_MASK); //7 + //print_bytes("o:",(uint8_t*)(o_128-1)); + + tmp4 = temp3.val[1]; + //print_bytes("tmp4:",(uint8_t*)&tmp4); + + *o_128++=vceqq_u8(vandq_u8((uint8x16_t)tmp4,BIT_MASK),BIT_MASK); //7 + //print_bytes("o:",(uint8_t*)(o_128-1)); + + i_128++; +#endif + } + + short * ptr_intl=base_interleaver; +#if defined(__x86_64) || defined(__i386__) + __m128i tmp; + uint16_t *systematic2_ptr=(unsigned short *) output; +#elif defined(__arm__) + uint8x16_t tmp; + const uint8_t __attribute__ ((aligned (16))) _Powers[16]= + { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 }; + +// Set the powers of 2 (do it once for all, if applicable) + uint8x16_t Powers= vld1q_u8(_Powers); + uint8_t *systematic2_ptr=(uint8_t *) output; +#endif + int input_length_words=n>>1; + + for ( i=0; i< input_length_words ; i ++ ) { + +#if defined(__x86_64__) || defined(__i386__) + tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],7); + tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],6); + tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],5); + tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],4); + tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],3); + tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],2); + tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],1); + tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],0); + tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],8+7); + tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],8+6); + tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],8+5); + tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],8+4); + tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],8+3); + tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],8+2); + tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],8+1); + tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],8+0); + *systematic2_ptr++=(unsigned short)_mm_movemask_epi8(tmp); +#elif defined(__arm__) + tmp=vsetq_lane_u8(expandInput[*ptr_intl++],tmp,7); + tmp=vsetq_lane_u8(expandInput[*ptr_intl++],tmp,6); + tmp=vsetq_lane_u8(expandInput[*ptr_intl++],tmp,5); + tmp=vsetq_lane_u8(expandInput[*ptr_intl++],tmp,4); + tmp=vsetq_lane_u8(expandInput[*ptr_intl++],tmp,3); + tmp=vsetq_lane_u8(expandInput[*ptr_intl++],tmp,2); + tmp=vsetq_lane_u8(expandInput[*ptr_intl++],tmp,1); + tmp=vsetq_lane_u8(expandInput[*ptr_intl++],tmp,0); + tmp=vsetq_lane_u8(expandInput[*ptr_intl++],tmp,8+7); + tmp=vsetq_lane_u8(expandInput[*ptr_intl++],tmp,8+6); + tmp=vsetq_lane_u8(expandInput[*ptr_intl++],tmp,8+5); + tmp=vsetq_lane_u8(expandInput[*ptr_intl++],tmp,8+4); + tmp=vsetq_lane_u8(expandInput[*ptr_intl++],tmp,8+3); + tmp=vsetq_lane_u8(expandInput[*ptr_intl++],tmp,8+2); + tmp=vsetq_lane_u8(expandInput[*ptr_intl++],tmp,8+1); + tmp=vsetq_lane_u8(expandInput[*ptr_intl++],tmp,8+0); +// Compute the mask from the input + uint64x2_t Mask= 
vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8(tmp, Powers)))); + vst1q_lane_u8(systematic2_ptr++, (uint8x16_t)Mask, 0); + vst1q_lane_u8(systematic2_ptr++, (uint8x16_t)Mask, 8); + +#endif + } + + return n; +} + + +/* +#define _mm_expand_si128(xmmx, out, bit_mask) \ + { \ + __m128i loc_mm; \ + loc_mm=(xmmx); \ + loc_mm=_mm_and_si128(loc_mm,bit_mask); \ + out=_mm_cmpeq_epi8(loc_mm,bit_mask); \ + } +*/ + +void threegpplte_turbo_encoder(unsigned char *input, + unsigned short input_length_bytes, + unsigned char *output, + unsigned char F, + unsigned short interleaver_f1, + unsigned short interleaver_f2) +{ + + int i; + unsigned char *x; + unsigned char state0=0,state1=0; + unsigned short input_length_bits = input_length_bytes<<3; + short * base_interleaver; + + if ( all_treillis_initialized == 0 ) + treillis_table_init(); + + // look for f1 and f2 precomputed interleaver values + for (i=0; i < 188 && f1f2mat[i].nb_bits != input_length_bits; i++); + + if ( i == 188 ) { + printf("Illegal frame length!\n"); + return; + } else { + base_interleaver=il_tb+f1f2mat[i].beg_index; + } + + + unsigned char systematic2[768]; + interleave_compact_byte(base_interleaver,input,systematic2,input_length_bytes); + +#if defined(__x86_64__) || defined(__i386__) + __m64 *ptr_output=(__m64*) output; +#elif defined(__arm__) + uint8x8_t *ptr_output=(uint8x8_t*)output; +#endif + unsigned char cur_s1, cur_s2; + int code_rate; + + for ( state0=state1=i=0 ; i<input_length_bytes; i++ ) { + cur_s1=input[i]; + cur_s2=systematic2[i]; + + for ( code_rate=0; code_rate<3; code_rate++) { +#if defined(__x86_64__) || defined(__i386__) + *ptr_output++ = _mm_add_pi8(all_treillis[state0][cur_s1].systematic_64[code_rate], + _mm_add_pi8(all_treillis[state0][cur_s1].parity1_64[code_rate], + all_treillis[state1][cur_s2].parity2_64[code_rate])); +#elif defined(__arm__) + uint8x8_t ptmp = vadd_u8(all_treillis[state0][cur_s1].parity1_64[code_rate], + all_treillis[state1][cur_s2].parity2_64[code_rate]); + *ptr_output++ = vadd_u8(all_treillis[state0][cur_s1].systematic_64[code_rate], + ptmp); +#endif + } + + state0=all_treillis[state0][cur_s1].exit_state; + state1=all_treillis[state1][cur_s2].exit_state; + } + + x=output+(input_length_bits*3); + + // Trellis termination + threegpplte_rsc_termination(&x[0],&x[1],&state0); +#ifdef DEBUG_TURBO_ENCODER + printf("term: x0 %d, x1 %d, state0 %d\n",x[0],x[1],state0); +#endif //DEBUG_TURBO_ENCODER + + threegpplte_rsc_termination(&x[2],&x[3],&state0); +#ifdef DEBUG_TURBO_ENCODER + printf("term: x0 %d, x1 %d, state0 %d\n",x[2],x[3],state0); +#endif //DEBUG_TURBO_ENCODER + + threegpplte_rsc_termination(&x[4],&x[5],&state0); +#ifdef DEBUG_TURBO_ENCODER + printf("term: x0 %d, x1 %d, state0 %d\n",x[4],x[5],state0); +#endif //DEBUG_TURBO_ENCODER + + threegpplte_rsc_termination(&x[6],&x[7],&state1); + +#ifdef DEBUG_TURBO_ENCODER + printf("term: x0 %d, x1 %d, state1 %d\n",x[6],x[7],state1); +#endif //DEBUG_TURBO_ENCODER + threegpplte_rsc_termination(&x[8],&x[9],&state1); +#ifdef DEBUG_TURBO_ENCODER + printf("term: x0 %d, x1 %d, state1 %d\n",x[8],x[9],state1); +#endif //DEBUG_TURBO_ENCODER + threegpplte_rsc_termination(&x[10],&x[11],&state1); + +#ifdef DEBUG_TURBO_ENCODER + printf("term: x0 %d, x1 %d, state1 %d\n",x[10],x[11],state1); +#endif //DEBUG_TURBO_ENCODER +#if defined(__x86_64__) || defined(__i386__) + _mm_empty(); + _m_empty(); +#endif +} + + + +#ifdef TC_MAIN +#define INPUT_LENGTH 20 +#define F1 21 +#define F2 120 + +int main(int argc,char **argv) +{ + + unsigned char input[INPUT_LENGTH+16],state,state2; + 
unsigned char output[12+(3*(INPUT_LENGTH<<3))],x,z; + int i; + unsigned char out; + + for (state=0; state<8; state++) { + for (i=0; i<2; i++) { + state2=state; + out = threegpplte_rsc(i,&state2); + printf("State (%d->%d) : (%d,%d)\n",state,state2,i,out); + } + } + + printf("\n"); + + for (state=0; state<8; state++) { + + state2=state; + threegpplte_rsc_termination(&x,&z,&state2); + printf("Termination: (%d->%d) : (%d,%d)\n",state,state2,x,z); + } + + memset((void*)input,0,INPUT_LENGTH+16); + for (i=0; i<INPUT_LENGTH; i++) { + input[i] = i*219; + printf("Input %d : %x\n",i,input[i]); + } + + threegpplte_turbo_encoder(&input[0], + INPUT_LENGTH, + &output[0], + 0, + F1, + F2); + + + for (i=0;i<12+(INPUT_LENGTH*24);i++) + printf("%d",output[i]); + printf("\n"); + + return(0); +} + +#endif // MAIN diff --git a/openair1/PHY/CODING/3gpplte_turbo_decoder_sse_16bit.c b/openair1/PHY/CODING/3gpplte_turbo_decoder_sse_16bit.c index 66c5e9ace7..96813c1072 100644 --- a/openair1/PHY/CODING/3gpplte_turbo_decoder_sse_16bit.c +++ b/openair1/PHY/CODING/3gpplte_turbo_decoder_sse_16bit.c @@ -64,6 +64,8 @@ #endif +#define print_shorts(s,x) printf("%s %d,%d,%d,%d,%d,%d,%d,%d\n",s,(x)[0],(x)[1],(x)[2],(x)[3],(x)[4],(x)[5],(x)[6],(x)[7]) + //#define DEBUG_LOGMAP @@ -120,10 +122,17 @@ void compute_gamma16(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity unsigned short frame_length,unsigned char term_flag) { int k,K1; +#if defined(__x86_64__)||defined(__i386__) __m128i *systematic128 = (__m128i *)systematic; __m128i *y_parity128 = (__m128i *)y_parity; __m128i *m10_128 = (__m128i *)m10; __m128i *m11_128 = (__m128i *)m11; +#elif defined(__arm__) + int16x8_t *systematic128 = (int16x8_t *)systematic; + int16x8_t *y_parity128 = (int16x8_t *)y_parity; + int16x8_t *m10_128 = (int16x8_t *)m10; + int16x8_t *m11_128 = (int16x8_t *)m11; +#endif #ifdef DEBUG_LOGMAP msg("compute_gamma, %p,%p,%p,%p,framelength %d\n",m11,m10,systematic,y_parity,frame_length); @@ -132,61 +141,31 @@ void compute_gamma16(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity K1=frame_length>>3; for (k=0; k<K1; k++) { - +#if defined(__x86_64__) || defined(__i386__) m11_128[k] = _mm_srai_epi16(_mm_adds_epi16(systematic128[k],y_parity128[k]),1); m10_128[k] = _mm_srai_epi16(_mm_subs_epi16(systematic128[k],y_parity128[k]),1); - /* - printf("gamma %d: s %d,%d,%d,%d,%d,%d,%d,%d\n", - k, - (int16_t)_mm_extract_epi16(systematic128[k],0), - (int16_t)_mm_extract_epi16(systematic128[k],1), - (int16_t)_mm_extract_epi16(systematic128[k],2), - (int16_t)_mm_extract_epi16(systematic128[k],3), - (int16_t)_mm_extract_epi16(systematic128[k],4), - (int16_t)_mm_extract_epi16(systematic128[k],5), - (int16_t)_mm_extract_epi16(systematic128[k],6), - (int16_t)_mm_extract_epi16(systematic128[k],7)); - - printf("gamma %d: yp %d,%d,%d,%d,%d,%d,%d,%d\n", - k, - (int16_t)_mm_extract_epi16(y_parity128[k],0), - (int16_t)_mm_extract_epi16(y_parity128[k],1), - (int16_t)_mm_extract_epi16(y_parity128[k],2), - (int16_t)_mm_extract_epi16(y_parity128[k],3), - (int16_t)_mm_extract_epi16(y_parity128[k],4), - (int16_t)_mm_extract_epi16(y_parity128[k],5), - (int16_t)_mm_extract_epi16(y_parity128[k],6), - (int16_t)_mm_extract_epi16(y_parity128[k],7)); - - printf("gamma %d: m11 %d,%d,%d,%d,%d,%d,%d,%d\n", - k, - (int16_t)_mm_extract_epi16(m11_128[k],0), - (int16_t)_mm_extract_epi16(m11_128[k],1), - (int16_t)_mm_extract_epi16(m11_128[k],2), - (int16_t)_mm_extract_epi16(m11_128[k],3), - (int16_t)_mm_extract_epi16(m11_128[k],4), - (int16_t)_mm_extract_epi16(m11_128[k],5), - 
(int16_t)_mm_extract_epi16(m11_128[k],6), - (int16_t)_mm_extract_epi16(m11_128[k],7)); - printf("gamma %d: m10 %d,%d,%d,%d,%d,%d,%d,%d\n", - k, - (int16_t)_mm_extract_epi16(m10_128[k],0), - (int16_t)_mm_extract_epi16(m10_128[k],1), - (int16_t)_mm_extract_epi16(m10_128[k],2), - (int16_t)_mm_extract_epi16(m10_128[k],3), - (int16_t)_mm_extract_epi16(m10_128[k],4), - (int16_t)_mm_extract_epi16(m10_128[k],5), - (int16_t)_mm_extract_epi16(m10_128[k],6), - (int16_t)_mm_extract_epi16(m10_128[k],7)); - */ +#elif defined(__arm__) + m11_128[k] = vhaddq_s16(systematic128[k],y_parity128[k]); + m10_128[k] = vhsubq_s16(systematic128[k],y_parity128[k]); +#endif +#ifdef DEBUG_LOGMAP + printf("Loop index k, m11,m10\n"); + print_shorts("sys",(int16_t*)&systematic128[k]); + print_shorts("yp",(int16_t*)&y_parity128[k]); + print_shorts("m11",(int16_t*)&m11_128[k]); + print_shorts("m10",(int16_t*)&m10_128[k]); +#endif } // Termination +#if defined(__x86_64__) || defined(__i386__) m11_128[k] = _mm_srai_epi16(_mm_adds_epi16(systematic128[k+term_flag],y_parity128[k]),1); m10_128[k] = _mm_srai_epi16(_mm_subs_epi16(systematic128[k+term_flag],y_parity128[k]),1); - - // printf("gamma (term): %d,%d, %d,%d, %d,%d\n",m11[k<<3],m10[k<<3],m11[1+(k<<3)],m10[1+(k<<3)],m11[2+(k<<3)],m10[2+(k<<3)]); +#elif defined(__arm__) + m11_128[k] = vhaddq_s16(systematic128[k+term_flag],y_parity128[k]); + m10_128[k] = vhsubq_s16(systematic128[k+term_flag],y_parity128[k]); +#endif } #define L 40 @@ -194,19 +173,31 @@ void compute_gamma16(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned short frame_length,unsigned char F) { int k,l,l2,K1,rerun_flag=0; +#if defined(__x86_64__) || defined(__i386__) __m128i *alpha128=(__m128i *)alpha,*alpha_ptr; __m128i a0,a1,a2,a3,a4,a5,a6,a7,*m11p,*m10p; __m128i m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7; __m128i new0,new1,new2,new3,new4,new5,new6,new7; __m128i alpha_max; - +#elif defined(__arm__) + int16x8_t *alpha128=(int16x8_t *)alpha,*alpha_ptr; + int16x8_t a0,a1,a2,a3,a4,a5,a6,a7,*m11p,*m10p; + int16x8_t m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7; + int16x8_t new0,new1,new2,new3,new4,new5,new6,new7; + int16x8_t alpha_max; +#endif l2 = L>>3; K1 = (frame_length>>3); for (l=K1;; l=l2,rerun_flag=1) { +#if defined(__x86_64__) || defined(__i386__) alpha128 = (__m128i *)alpha; +#elif defined(__arm__) + alpha128 = (int16x8_t *)alpha; +#endif if (rerun_flag == 0) { +#if defined(__x86_64__) || defined(__i386__) alpha128[0] = _mm_set_epi16(-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,0); alpha128[1] = _mm_set_epi16(-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2); alpha128[2] = _mm_set_epi16(-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2); @@ -215,8 +206,31 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s alpha128[5] = _mm_set_epi16(-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2); alpha128[6] = _mm_set_epi16(-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2); alpha128[7] = _mm_set_epi16(-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2); +#elif defined(__arm__) + alpha128[0] = vdupq_n_s16(-MAX/2); + alpha128[0] = vsetq_lane_s16(0,alpha128[0],0); + alpha128[1] = vdupq_n_s16(-MAX/2); + alpha128[2] = vdupq_n_s16(-MAX/2); + alpha128[3] = vdupq_n_s16(-MAX/2); + alpha128[4] = vdupq_n_s16(-MAX/2); + alpha128[5] = vdupq_n_s16(-MAX/2); + alpha128[6] = vdupq_n_s16(-MAX/2); + alpha128[7] = vdupq_n_s16(-MAX/2); +#endif +#ifdef DEBUG_LOGMAP + printf("Initial alpha\n"); 
+ print_shorts("a0",(int16_t*)&alpha128[0]); + print_shorts("a1",(int16_t*)&alpha128[1]); + print_shorts("a2",(int16_t*)&alpha128[2]); + print_shorts("a3",(int16_t*)&alpha128[3]); + print_shorts("a4",(int16_t*)&alpha128[4]); + print_shorts("a5",(int16_t*)&alpha128[5]); + print_shorts("a6",(int16_t*)&alpha128[6]); + print_shorts("a7",(int16_t*)&alpha128[7]); +#endif } else { //set initial alpha in columns 1-7 from final alpha from last run in columns 0-6 +#if defined(__x86_64__) || defined(__i386__) alpha128[0] = _mm_slli_si128(alpha128[frame_length],2); alpha128[1] = _mm_slli_si128(alpha128[1+frame_length],2); alpha128[2] = _mm_slli_si128(alpha128[2+frame_length],2); @@ -225,6 +239,16 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s alpha128[5] = _mm_slli_si128(alpha128[5+frame_length],2); alpha128[6] = _mm_slli_si128(alpha128[6+frame_length],2); alpha128[7] = _mm_slli_si128(alpha128[7+frame_length],2); +#elif defined(__arm__) + alpha128[0] = (int16x8_t)vshlq_n_s64((int64x2_t)alpha128[frame_length],16); alpha128[0] = vsetq_lane_s16(alpha[8],alpha128[0],3); + alpha128[1] = (int16x8_t)vshlq_n_s64((int64x2_t)alpha128[1+frame_length],16); alpha128[1] = vsetq_lane_s16(alpha[24],alpha128[0],3); + alpha128[2] = (int16x8_t)vshlq_n_s64((int64x2_t)alpha128[2+frame_length],16); alpha128[2] = vsetq_lane_s16(alpha[40],alpha128[0],3); + alpha128[3] = (int16x8_t)vshlq_n_s64((int64x2_t)alpha128[3+frame_length],16); alpha128[3] = vsetq_lane_s16(alpha[56],alpha128[0],3); + alpha128[4] = (int16x8_t)vshlq_n_s64((int64x2_t)alpha128[4+frame_length],16); alpha128[4] = vsetq_lane_s16(alpha[72],alpha128[0],3); + alpha128[5] = (int16x8_t)vshlq_n_s64((int64x2_t)alpha128[5+frame_length],16); alpha128[5] = vsetq_lane_s16(alpha[88],alpha128[0],3); + alpha128[6] = (int16x8_t)vshlq_n_s64((int64x2_t)alpha128[6+frame_length],16); alpha128[6] = vsetq_lane_s16(alpha[104],alpha128[0],3); + alpha128[7] = (int16x8_t)vshlq_n_s64((int64x2_t)alpha128[7+frame_length],16); alpha128[7] = vsetq_lane_s16(alpha[120],alpha128[0],3); +#endif // set initial alpha in column 0 to (0,-MAX/2,...,-MAX/2) alpha[8] = -MAX/2; alpha[16] = -MAX/2; @@ -233,17 +257,33 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s alpha[40] = -MAX/2; alpha[48] = -MAX/2; alpha[56] = -MAX/2; +#ifdef DEBUG_LOGMAP + printf("Second run\n"); + print_shorts("a0",(int16_t*)&alpha128[0]); + print_shorts("a1",(int16_t*)&alpha128[1]); + print_shorts("a2",(int16_t*)&alpha128[2]); + print_shorts("a3",(int16_t*)&alpha128[3]); + print_shorts("a4",(int16_t*)&alpha128[4]); + print_shorts("a5",(int16_t*)&alpha128[5]); + print_shorts("a6",(int16_t*)&alpha128[6]); + print_shorts("a7",(int16_t*)&alpha128[7]); +#endif + } alpha_ptr = &alpha128[0]; - +#if defined(__x86_64__) || defined(__i386__) m11p = (__m128i*)m_11; m10p = (__m128i*)m_10; - +#elif defined(__arm__) + m11p = (int16x8_t*)m_11; + m10p = (int16x8_t*)m_10; +#endif for (k=0; k<l; k++) { +#if defined(__x86_64__) || defined(__i386__) a1=_mm_load_si128(&alpha_ptr[1]); a3=_mm_load_si128(&alpha_ptr[3]); a5=_mm_load_si128(&alpha_ptr[5]); @@ -288,10 +328,48 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s alpha_max = _mm_max_epi16(alpha_max,a5); alpha_max = _mm_max_epi16(alpha_max,a6); alpha_max = _mm_max_epi16(alpha_max,a7); +#elif defined(__arm__) + m_b0 = vqaddq_s16(alpha_ptr[1],*m11p); // m11 + m_b4 = vqsubq_s16(alpha_ptr[1],*m11p); // m00=-m11 + m_b1 = vqsubq_s16(alpha_ptr[3],*m10p); // m01=-m10 + m_b5 = 
vqaddq_s16(alpha_ptr[3],*m10p); // m10 + m_b2 = vqaddq_s16(alpha_ptr[5],*m10p); // m10 + m_b6 = vqsubq_s16(alpha_ptr[5],*m10p); // m01=-m10 + m_b3 = vqsubq_s16(alpha_ptr[7],*m11p); // m00=-m11 + m_b7 = vqaddq_s16(alpha_ptr[7],*m11p); // m11 + + new0 = vqsubq_s16(alpha_ptr[0],*m11p); // m00=-m11 + new4 = vqaddq_s16(alpha_ptr[0],*m11p); // m11 + new1 = vqaddq_s16(alpha_ptr[2],*m10p); // m10 + new5 = vqsubq_s16(alpha_ptr[2],*m10p); // m01=-m10 + new2 = vqsubq_s16(alpha_ptr[4],*m10p); // m01=-m10 + new6 = vqaddq_s16(alpha_ptr[4],*m10p); // m10 + new3 = vqaddq_s16(alpha_ptr[6],*m11p); // m11 + new7 = vqsubq_s16(alpha_ptr[6],*m11p); // m00=-m11 + a0 = vmaxq_s16(m_b0,new0); + a1 = vmaxq_s16(m_b1,new1); + a2 = vmaxq_s16(m_b2,new2); + a3 = vmaxq_s16(m_b3,new3); + a4 = vmaxq_s16(m_b4,new4); + a5 = vmaxq_s16(m_b5,new5); + a6 = vmaxq_s16(m_b6,new6); + a7 = vmaxq_s16(m_b7,new7); + + // compute and subtract maxima + alpha_max = vmaxq_s16(a0,a1); + alpha_max = vmaxq_s16(alpha_max,a2); + alpha_max = vmaxq_s16(alpha_max,a3); + alpha_max = vmaxq_s16(alpha_max,a4); + alpha_max = vmaxq_s16(alpha_max,a5); + alpha_max = vmaxq_s16(alpha_max,a6); + alpha_max = vmaxq_s16(alpha_max,a7); + +#endif alpha_ptr+=8; m11p++; m10p++; +#if defined(__x86_64__) || defined(__i386__) alpha_ptr[0] = _mm_subs_epi16(a0,alpha_max); alpha_ptr[1] = _mm_subs_epi16(a1,alpha_max); alpha_ptr[2] = _mm_subs_epi16(a2,alpha_max); @@ -300,6 +378,58 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s alpha_ptr[5] = _mm_subs_epi16(a5,alpha_max); alpha_ptr[6] = _mm_subs_epi16(a6,alpha_max); alpha_ptr[7] = _mm_subs_epi16(a7,alpha_max); +#elif defined(__arm__) + alpha_ptr[0] = vqsubq_s16(a0,alpha_max); + alpha_ptr[1] = vqsubq_s16(a1,alpha_max); + alpha_ptr[2] = vqsubq_s16(a2,alpha_max); + alpha_ptr[3] = vqsubq_s16(a3,alpha_max); + alpha_ptr[4] = vqsubq_s16(a4,alpha_max); + alpha_ptr[5] = vqsubq_s16(a5,alpha_max); + alpha_ptr[6] = vqsubq_s16(a6,alpha_max); + alpha_ptr[7] = vqsubq_s16(a7,alpha_max); +#endif + +#ifdef DEBUG_LOGMAP + printf("Loop index %d, mb\n",k); + print_shorts("mb0",(int16_t*)&m_b0); + print_shorts("mb1",(int16_t*)&m_b1); + print_shorts("mb2",(int16_t*)&m_b2); + print_shorts("mb3",(int16_t*)&m_b3); + print_shorts("mb4",(int16_t*)&m_b4); + print_shorts("mb5",(int16_t*)&m_b5); + print_shorts("mb6",(int16_t*)&m_b6); + print_shorts("mb7",(int16_t*)&m_b7); + + printf("Loop index %d, new\n",k); + print_shorts("new0",(int16_t*)&new0); + print_shorts("new1",(int16_t*)&new1); + print_shorts("new2",(int16_t*)&new2); + print_shorts("new3",(int16_t*)&new3); + print_shorts("new4",(int16_t*)&new4); + print_shorts("new5",(int16_t*)&new5); + print_shorts("new6",(int16_t*)&new6); + print_shorts("new7",(int16_t*)&new7); + + printf("Loop index %d, after max\n",k); + print_shorts("a0",(int16_t*)&a0); + print_shorts("a1",(int16_t*)&a1); + print_shorts("a2",(int16_t*)&a2); + print_shorts("a3",(int16_t*)&a3); + print_shorts("a4",(int16_t*)&a4); + print_shorts("a5",(int16_t*)&a5); + print_shorts("a6",(int16_t*)&a6); + print_shorts("a7",(int16_t*)&a7); + + printf("Loop index %d\n",k); + print_shorts("a0",(int16_t*)&alpha_ptr[0]); + print_shorts("a1",(int16_t*)&alpha_ptr[1]); + print_shorts("a2",(int16_t*)&alpha_ptr[2]); + print_shorts("a3",(int16_t*)&alpha_ptr[3]); + print_shorts("a4",(int16_t*)&alpha_ptr[4]); + print_shorts("a5",(int16_t*)&alpha_ptr[5]); + print_shorts("a6",(int16_t*)&alpha_ptr[6]); + print_shorts("a7",(int16_t*)&alpha_ptr[7]); +#endif } @@ -313,12 +443,22 @@ void compute_beta16(llr_t* alpha,llr_t* 
beta,llr_t *m_11,llr_t* m_10,unsigned sh { int k,rerun_flag=0; +#if defined(__x86_64__) || defined(__i386__) __m128i m11_128,m10_128; __m128i m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7; __m128i new0,new1,new2,new3,new4,new5,new6,new7; __m128i *beta128,*alpha128,*beta_ptr; __m128i beta_max; +#elif defined(__arm__) + int16x8_t m11_128,m10_128; + int16x8_t m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7; + int16x8_t new0,new1,new2,new3,new4,new5,new6,new7; + + int16x8_t *beta128,*alpha128,*beta_ptr; + int16x8_t beta_max; +#endif + int16_t m11,m10,beta0_16,beta1_16,beta2_16,beta3_16,beta4_16,beta5_16,beta6_16,beta7_16,beta0_2,beta1_2,beta2_2,beta3_2,beta_m; llr_t beta0,beta1; @@ -380,9 +520,13 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh beta7_16=beta7_16-beta_m; for (rerun_flag=0;; rerun_flag=1) { +#if defined(__x86_64__) || defined(__i386__) beta_ptr = (__m128i*)&beta[frame_length<<3]; alpha128 = (__m128i*)&alpha[0]; - +#elif defined(__arm__) + beta_ptr = (int16x8_t*)&beta[frame_length<<3]; + alpha128 = (int16x8_t*)&alpha[0]; +#endif if (rerun_flag == 0) { beta_ptr[0] = alpha128[(frame_length)]; beta_ptr[1] = alpha128[1+(frame_length)]; @@ -393,6 +537,7 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh beta_ptr[6] = alpha128[6+(frame_length)]; beta_ptr[7] = alpha128[7+(frame_length)]; } else { +#if defined(__x86_64__) || defined(__i386__) beta128 = (__m128i*)&beta[0]; beta_ptr[0] = _mm_srli_si128(beta128[0],2); beta_ptr[1] = _mm_srli_si128(beta128[1],2); @@ -402,9 +547,22 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh beta_ptr[5] = _mm_srli_si128(beta128[5],2); beta_ptr[6] = _mm_srli_si128(beta128[6],2); beta_ptr[7] = _mm_srli_si128(beta128[7],2); +#elif defined(__arm__) + beta128 = (int16x8_t*)&beta[0]; + beta_ptr = (int16x8_t*)&beta[frame_length<<3]; + beta_ptr[0] = (int16x8_t)vshrq_n_s64((int64x2_t)beta128[0],16); beta_ptr[0] = vsetq_lane_s16(beta[3],beta_ptr[0],4); + beta_ptr[1] = (int16x8_t)vshrq_n_s64((int64x2_t)beta128[1],16); beta_ptr[1] = vsetq_lane_s16(beta[11],beta_ptr[1],4); + beta_ptr[2] = (int16x8_t)vshrq_n_s64((int64x2_t)beta128[2],16); beta_ptr[2] = vsetq_lane_s16(beta[19],beta_ptr[2],4); + beta_ptr[3] = (int16x8_t)vshrq_n_s64((int64x2_t)beta128[3],16); beta_ptr[3] = vsetq_lane_s16(beta[27],beta_ptr[3],4); + beta_ptr[4] = (int16x8_t)vshrq_n_s64((int64x2_t)beta128[4],16); beta_ptr[4] = vsetq_lane_s16(beta[35],beta_ptr[4],4); + beta_ptr[5] = (int16x8_t)vshrq_n_s64((int64x2_t)beta128[5],16); beta_ptr[5] = vsetq_lane_s16(beta[43],beta_ptr[5],4); + beta_ptr[6] = (int16x8_t)vshrq_n_s64((int64x2_t)beta128[6],16); beta_ptr[6] = vsetq_lane_s16(beta[51],beta_ptr[6],4); + beta_ptr[7] = (int16x8_t)vshrq_n_s64((int64x2_t)beta128[7],16); beta_ptr[7] = vsetq_lane_s16(beta[59],beta_ptr[7],4); +#endif } +#if defined(__x86_64__) || defined(__i386__) beta_ptr[0] = _mm_insert_epi16(beta_ptr[0],beta0_16,7); beta_ptr[1] = _mm_insert_epi16(beta_ptr[1],beta1_16,7); beta_ptr[2] = _mm_insert_epi16(beta_ptr[2],beta2_16,7); @@ -413,10 +571,21 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh beta_ptr[5] = _mm_insert_epi16(beta_ptr[5],beta5_16,7); beta_ptr[6] = _mm_insert_epi16(beta_ptr[6],beta6_16,7); beta_ptr[7] = _mm_insert_epi16(beta_ptr[7],beta7_16,7); +#elif defined(__arm__) + beta_ptr[0] = vsetq_lane_s16(beta0_16,beta_ptr[0],7); + beta_ptr[1] = vsetq_lane_s16(beta1_16,beta_ptr[1],7); + beta_ptr[2] = vsetq_lane_s16(beta2_16,beta_ptr[2],7); + beta_ptr[3] = 
vsetq_lane_s16(beta3_16,beta_ptr[3],7); + beta_ptr[4] = vsetq_lane_s16(beta4_16,beta_ptr[4],7); + beta_ptr[5] = vsetq_lane_s16(beta5_16,beta_ptr[5],7); + beta_ptr[6] = vsetq_lane_s16(beta6_16,beta_ptr[6],7); + beta_ptr[7] = vsetq_lane_s16(beta7_16,beta_ptr[7],7); +#endif int loopval=((rerun_flag==0)?0:((frame_length-L)>>3)); for (k=(frame_length>>3)-1; k>=loopval; k--) { +#if defined(__x86_64__) || defined(__i386__) m11_128=((__m128i*)m_11)[k]; m10_128=((__m128i*)m_10)[k]; @@ -465,7 +634,55 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh beta_ptr[5] = _mm_subs_epi16(beta_ptr[5],beta_max); beta_ptr[6] = _mm_subs_epi16(beta_ptr[6],beta_max); beta_ptr[7] = _mm_subs_epi16(beta_ptr[7],beta_max); +#elif defined(__arm__) + m11_128=((int16x8_t*)m_11)[k]; + m10_128=((int16x8_t*)m_10)[k]; + m_b0 = vqaddq_s16(beta_ptr[4],m11_128); //m11 + m_b1 = vqsubq_s16(beta_ptr[4],m11_128); //m00 + m_b2 = vqsubq_s16(beta_ptr[5],m10_128); //m01 + m_b3 = vqaddq_s16(beta_ptr[5],m10_128); //m10 + m_b4 = vqaddq_s16(beta_ptr[6],m10_128); //m10 + m_b5 = vqsubq_s16(beta_ptr[6],m10_128); //m01 + m_b6 = vqsubq_s16(beta_ptr[7],m11_128); //m00 + m_b7 = vqaddq_s16(beta_ptr[7],m11_128); //m11 + + new0 = vqsubq_s16(beta_ptr[0],m11_128); //m00 + new1 = vqaddq_s16(beta_ptr[0],m11_128); //m11 + new2 = vqaddq_s16(beta_ptr[1],m10_128); //m10 + new3 = vqsubq_s16(beta_ptr[1],m10_128); //m01 + new4 = vqsubq_s16(beta_ptr[2],m10_128); //m01 + new5 = vqaddq_s16(beta_ptr[2],m10_128); //m10 + new6 = vqaddq_s16(beta_ptr[3],m11_128); //m11 + new7 = vqsubq_s16(beta_ptr[3],m11_128); //m00 + + beta_ptr-=8; + beta_ptr[0] = vmaxq_s16(m_b0,new0); + beta_ptr[1] = vmaxq_s16(m_b1,new1); + beta_ptr[2] = vmaxq_s16(m_b2,new2); + beta_ptr[3] = vmaxq_s16(m_b3,new3); + beta_ptr[4] = vmaxq_s16(m_b4,new4); + beta_ptr[5] = vmaxq_s16(m_b5,new5); + beta_ptr[6] = vmaxq_s16(m_b6,new6); + beta_ptr[7] = vmaxq_s16(m_b7,new7); + + beta_max = vmaxq_s16(beta_ptr[0],beta_ptr[1]); + beta_max = vmaxq_s16(beta_max ,beta_ptr[2]); + beta_max = vmaxq_s16(beta_max ,beta_ptr[3]); + beta_max = vmaxq_s16(beta_max ,beta_ptr[4]); + beta_max = vmaxq_s16(beta_max ,beta_ptr[5]); + beta_max = vmaxq_s16(beta_max ,beta_ptr[6]); + beta_max = vmaxq_s16(beta_max ,beta_ptr[7]); + + beta_ptr[0] = vqsubq_s16(beta_ptr[0],beta_max); + beta_ptr[1] = vqsubq_s16(beta_ptr[1],beta_max); + beta_ptr[2] = vqsubq_s16(beta_ptr[2],beta_max); + beta_ptr[3] = vqsubq_s16(beta_ptr[3],beta_max); + beta_ptr[4] = vqsubq_s16(beta_ptr[4],beta_max); + beta_ptr[5] = vqsubq_s16(beta_ptr[5],beta_max); + beta_ptr[6] = vqsubq_s16(beta_ptr[6],beta_max); + beta_ptr[7] = vqsubq_s16(beta_ptr[7],beta_max); +#endif } @@ -477,6 +694,7 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh void compute_ext16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext, llr_t* systematic,unsigned short frame_length) { +#if defined(__x86_64__) || defined(__i386__) __m128i *alpha128=(__m128i *)alpha; __m128i *beta128=(__m128i *)beta; __m128i *m11_128,*m10_128,*ext_128; @@ -485,6 +703,17 @@ void compute_ext16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext, __m128i m01_1,m01_2,m01_3,m01_4; __m128i m10_1,m10_2,m10_3,m10_4; __m128i m11_1,m11_2,m11_3,m11_4; +#elif defined(__arm__) + int16x8_t *alpha128=(int16x8_t *)alpha; + int16x8_t *beta128=(int16x8_t *)beta; + int16x8_t *m11_128,*m10_128,*ext_128; + int16x8_t *alpha_ptr,*beta_ptr; + int16x8_t m00_1,m00_2,m00_3,m00_4; + int16x8_t m01_1,m01_2,m01_3,m01_4; + int16x8_t m10_1,m10_2,m10_3,m10_4; + int16x8_t 
m11_1,m11_2,m11_3,m11_4; +#endif + int k; // @@ -501,9 +730,11 @@ void compute_ext16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext, for (k=0; k<(frame_length>>3); k++) { +#if defined(__x86_64__) || defined(__i386__) m11_128 = (__m128i*)&m_11[k<<3]; m10_128 = (__m128i*)&m_10[k<<3]; ext_128 = (__m128i*)&ext[k<<3]; + /* printf("EXT %03d\n",k); print_shorts("a0:",&alpha_ptr[0]); @@ -594,7 +825,54 @@ void compute_ext16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext, print_shorts("m01_1:",&m01_1); print_shorts("syst:",systematic_128); */ - +#elif defined(__arm__) + m11_128 = (int16x8_t*)&m_11[k<<3]; + m10_128 = (int16x8_t*)&m_10[k<<3]; + ext_128 = (int16x8_t*)&ext[k<<3]; + + m00_4 = vqaddq_s16(alpha_ptr[7],beta_ptr[3]); //ALPHA_BETA_4m00; + m11_4 = vqaddq_s16(alpha_ptr[7],beta_ptr[7]); //ALPHA_BETA_4m11; + m00_3 = vqaddq_s16(alpha_ptr[6],beta_ptr[7]); //ALPHA_BETA_3m00; + m11_3 = vqaddq_s16(alpha_ptr[6],beta_ptr[3]); //ALPHA_BETA_3m11; + m00_2 = vqaddq_s16(alpha_ptr[1],beta_ptr[4]); //ALPHA_BETA_2m00; + m11_2 = vqaddq_s16(alpha_ptr[1],beta_ptr[0]); //ALPHA_BETA_2m11; + m11_1 = vqaddq_s16(alpha_ptr[0],beta_ptr[4]); //ALPHA_BETA_1m11; + m00_1 = vqaddq_s16(alpha_ptr[0],beta_ptr[0]); //ALPHA_BETA_1m00; + m01_4 = vqaddq_s16(alpha_ptr[5],beta_ptr[6]); //ALPHA_BETA_4m01; + m10_4 = vqaddq_s16(alpha_ptr[5],beta_ptr[2]); //ALPHA_BETA_4m10; + m01_3 = vqaddq_s16(alpha_ptr[4],beta_ptr[2]); //ALPHA_BETA_3m01; + m10_3 = vqaddq_s16(alpha_ptr[4],beta_ptr[6]); //ALPHA_BETA_3m10; + m01_2 = vqaddq_s16(alpha_ptr[3],beta_ptr[1]); //ALPHA_BETA_2m01; + m10_2 = vqaddq_s16(alpha_ptr[3],beta_ptr[5]); //ALPHA_BETA_2m10; + m10_1 = vqaddq_s16(alpha_ptr[2],beta_ptr[1]); //ALPHA_BETA_1m10; + m01_1 = vqaddq_s16(alpha_ptr[2],beta_ptr[5]); //ALPHA_BETA_1m01; + + m01_1 = vmaxq_s16(m01_1,m01_2); + m01_1 = vmaxq_s16(m01_1,m01_3); + m01_1 = vmaxq_s16(m01_1,m01_4); + m00_1 = vmaxq_s16(m00_1,m00_2); + m00_1 = vmaxq_s16(m00_1,m00_3); + m00_1 = vmaxq_s16(m00_1,m00_4); + m10_1 = vmaxq_s16(m10_1,m10_2); + m10_1 = vmaxq_s16(m10_1,m10_3); + m10_1 = vmaxq_s16(m10_1,m10_4); + m11_1 = vmaxq_s16(m11_1,m11_2); + m11_1 = vmaxq_s16(m11_1,m11_3); + m11_1 = vmaxq_s16(m11_1,m11_4); + + + m01_1 = vqsubq_s16(m01_1,*m10_128); + m00_1 = vqsubq_s16(m00_1,*m11_128); + m10_1 = vqaddq_s16(m10_1,*m10_128); + m11_1 = vqaddq_s16(m11_1,*m11_128); + + + m01_1 = vmaxq_s16(m01_1,m00_1); + m10_1 = vmaxq_s16(m10_1,m11_1); + + + *ext_128 = vqsubq_s16(m10_1,m01_1); +#endif alpha_ptr+=8; beta_ptr+=8; } @@ -703,15 +981,23 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y, int *pi2_p,*pi4_p,*pi5_p,*pi6_p; llr_t *s,*s1,*s2,*yp1,*yp2,*yp; - __m128i *yp128; unsigned int i,j,iind;//,pi; unsigned char iteration_cnt=0; unsigned int crc,oldcrc,crc_len; uint8_t temp; +#if defined(__x86_64__) || defined(__i386__) + __m128i *yp128; __m128i tmp, zeros=_mm_setzero_si128(); - register __m128i tmpe; +#elif defined(__arm__) + int16x8_t *yp128; +// int16x8_t tmp128[(n+8)>>3]; + int16x8_t tmp, zeros=vdupq_n_s16(0); + const uint16_t __attribute__ ((aligned (16))) _Powers[8]= + { 1, 2, 4, 8, 16, 32, 64, 128}; + uint16x8_t Powers= vld1q_u16(_Powers); +#endif int offset8_flag=0; if (crc_type > 3) { @@ -749,7 +1035,11 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y, crc_len=3; } +#if defined(__x86_64__) || defined(__i386__) yp128 = (__m128i*)y; +#elif defined(__arm__) + yp128 = (int16x8_t*)y; +#endif @@ -767,7 +1057,7 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y, j=pi2_p[0]; - +#if defined(__x86_64__) || defined(__i386__) tmpe = 
_mm_load_si128(yp128); s[j] = _mm_extract_epi16(tmpe,0); @@ -826,6 +1116,46 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y, yp2[j] = _mm_extract_epi16(tmpe,7); // printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]); +#elif defined(__arm__) + s[j] = vgetq_lane_s16(yp128[0],0); + yp1[j] = vgetq_lane_s16(yp128[0],1); + yp2[j] = vgetq_lane_s16(yp128[0],2); + + j=pi2_p[1]; + s[j] = vgetq_lane_s16(yp128[0],3); + yp1[j] = vgetq_lane_s16(yp128[0],4); + yp2[j] = vgetq_lane_s16(yp128[0],5); + + j=pi2_p[2]; + s[j] = vgetq_lane_s16(yp128[0],6); + yp1[j] = vgetq_lane_s16(yp128[0],7); + yp2[j] = vgetq_lane_s16(yp128[1],0); + + j=pi2_p[3]; + s[j] = vgetq_lane_s16(yp128[1],1); + yp1[j] = vgetq_lane_s16(yp128[1],2); + yp2[j] = vgetq_lane_s16(yp128[1],3); + + j=pi2_p[4]; + s[j] = vgetq_lane_s16(yp128[1],4); + yp1[j] = vgetq_lane_s16(yp128[1],5); + yp2[j] = vgetq_lane_s16(yp128[1],6); + + j=pi2_p[5]; + s[j] = vgetq_lane_s16(yp128[1],7); + yp1[j] = vgetq_lane_s16(yp128[2],0); + yp2[j] = vgetq_lane_s16(yp128[2],1); + + j=pi2_p[6]; + s[j] = vgetq_lane_s16(yp128[2],2); + yp1[j] = vgetq_lane_s16(yp128[2],3); + yp2[j] = vgetq_lane_s16(yp128[2],4); + + j=pi2_p[7]; + s[j] = vgetq_lane_s16(yp128[2],5); + yp1[j] = vgetq_lane_s16(yp128[2],6); + yp2[j] = vgetq_lane_s16(yp128[2],7); +#endif yp128+=3; } @@ -879,7 +1209,7 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y, pi4_p=pi4tab16[iind]; for (i=0; i<(n>>3); i++) { // steady-state portion - +#if defined(__x86_64__) || defined(__i386__) ((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],((llr_t*)ext)[*pi4_p++],0); ((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],((llr_t*)ext)[*pi4_p++],1); ((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],((llr_t*)ext)[*pi4_p++],2); @@ -888,6 +1218,16 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y, ((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],((llr_t*)ext)[*pi4_p++],5); ((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],((llr_t*)ext)[*pi4_p++],6); ((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],((llr_t*)ext)[*pi4_p++],7); +#elif defined(__arm__) + ((int16x8_t*)systematic2)[i]=vsetq_lane_s16(((llr_t*)ext)[*pi4_p++],((int16x8_t*)systematic2)[i],0); + ((int16x8_t*)systematic2)[i]=vsetq_lane_s16(((llr_t*)ext)[*pi4_p++],((int16x8_t*)systematic2)[i],1); + ((int16x8_t*)systematic2)[i]=vsetq_lane_s16(((llr_t*)ext)[*pi4_p++],((int16x8_t*)systematic2)[i],2); + ((int16x8_t*)systematic2)[i]=vsetq_lane_s16(((llr_t*)ext)[*pi4_p++],((int16x8_t*)systematic2)[i],3); + ((int16x8_t*)systematic2)[i]=vsetq_lane_s16(((llr_t*)ext)[*pi4_p++],((int16x8_t*)systematic2)[i],4); + ((int16x8_t*)systematic2)[i]=vsetq_lane_s16(((llr_t*)ext)[*pi4_p++],((int16x8_t*)systematic2)[i],5); + ((int16x8_t*)systematic2)[i]=vsetq_lane_s16(((llr_t*)ext)[*pi4_p++],((int16x8_t*)systematic2)[i],6); + ((int16x8_t*)systematic2)[i]=vsetq_lane_s16(((llr_t*)ext)[*pi4_p++],((int16x8_t*)systematic2)[i],7); +#endif } stop_meas(intl1_stats); @@ -901,6 +1241,7 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y, pi5_p=pi5tab16[iind]; for (i=0; i<(n>>3); i++) { +#if defined(__x86_64__) || defined(__i386__) tmp=_mm_insert_epi16(tmp,ext2[*pi5_p++],0); tmp=_mm_insert_epi16(tmp,ext2[*pi5_p++],1); tmp=_mm_insert_epi16(tmp,ext2[*pi5_p++],2); @@ -910,6 +1251,17 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y, tmp=_mm_insert_epi16(tmp,ext2[*pi5_p++],6); 
tmp=_mm_insert_epi16(tmp,ext2[*pi5_p++],7); ((__m128i *)systematic1)[i] = _mm_adds_epi16(_mm_subs_epi16(tmp,((__m128i*)ext)[i]),((__m128i *)systematic0)[i]); +#elif defined(__arm__) + tmp=vsetq_lane_s16(ext2[*pi5_p++],tmp,0); + tmp=vsetq_lane_s16(ext2[*pi5_p++],tmp,1); + tmp=vsetq_lane_s16(ext2[*pi5_p++],tmp,2); + tmp=vsetq_lane_s16(ext2[*pi5_p++],tmp,3); + tmp=vsetq_lane_s16(ext2[*pi5_p++],tmp,4); + tmp=vsetq_lane_s16(ext2[*pi5_p++],tmp,5); + tmp=vsetq_lane_s16(ext2[*pi5_p++],tmp,6); + tmp=vsetq_lane_s16(ext2[*pi5_p++],tmp,7); + ((int16x8_t *)systematic1)[i] = vqaddq_s16(vqsubq_s16(tmp,((int16x8_t*)ext)[i]),((int16x8_t *)systematic0)[i]); +#endif } if (iteration_cnt>1) { @@ -917,6 +1269,7 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y, pi6_p=pi6tab16[iind]; for (i=0; i<(n>>3); i++) { +#if defined(__x86_64__) || defined(__i386__) tmp=_mm_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p++],7); tmp=_mm_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p++],6); tmp=_mm_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p++],5); @@ -927,7 +1280,24 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y, tmp=_mm_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p++],0); tmp=_mm_cmpgt_epi8(_mm_packs_epi16(tmp,zeros),zeros); decoded_bytes[i]=(unsigned char)_mm_movemask_epi8(tmp); - +#elif defined(__arm__) + tmp=vsetq_lane_s16(ext2[*pi6_p++],tmp,7); + tmp=vsetq_lane_s16(ext2[*pi6_p++],tmp,6); + tmp=vsetq_lane_s16(ext2[*pi6_p++],tmp,5); + tmp=vsetq_lane_s16(ext2[*pi6_p++],tmp,4); + tmp=vsetq_lane_s16(ext2[*pi6_p++],tmp,3); + tmp=vsetq_lane_s16(ext2[*pi6_p++],tmp,2); + tmp=vsetq_lane_s16(ext2[*pi6_p++],tmp,1); + tmp=vsetq_lane_s16(ext2[*pi6_p++],tmp,0); +// This does: +// [1 2 4 8 16 32 64 128] .* I(ext_i > 0) = 2.^[b0 b1 b2 b3 b4 b5 b6 b7], where bi =I(ext_i > 0) +// [2^b0 + 2^b1 2^b2 + 2^b3 2^b4 + 2^b5 2^b6 + 2^b7] +// [2^b0 + 2^b1 + 2^b2 + 2^b3 2^b4 + 2^b5 + 2^b6 + 2^b7] +// Mask64 = 2^b0 + 2^b1 + 2^b2 + 2^b3 + 2^b4 + 2^b5 + 2^b6 + 2^b7 + uint64x2_t Mask = vpaddlq_u32(vpaddlq_u16(vandq_u16(vcgtq_s16(tmp,zeros), Powers))); + uint64x1_t Mask64 = vget_high_u64(Mask)+vget_low_u64(Mask); + decoded_bytes[i] = (uint8_t)Mask64; +#endif } } @@ -983,14 +1353,23 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y, // do log_map from first parity bit if (iteration_cnt < max_iterations) { log_map16(systematic1,yparity1,m11,m10,alpha,beta,ext,n,0,F,offset8_flag,alpha_stats,beta_stats,gamma_stats,ext_stats); +#if defined(__x86_64__) || defined(__i386__) __m128i* ext_128=(__m128i*) ext; __m128i* s1_128=(__m128i*) systematic1; __m128i* s0_128=(__m128i*) systematic0; - +#elif defined(__arm__) + int16x8_t* ext_128=(int16x8_t*) ext; + int16x8_t* s1_128=(int16x8_t*) systematic1; + int16x8_t* s0_128=(int16x8_t*) systematic0; +#endif int myloop=n>>3; for (i=0; i<myloop; i++) { +#if defined(__x86_64__) || defined(__i386__) *ext_128=_mm_adds_epi16(_mm_subs_epi16(*ext_128,*s1_128++),*s0_128++); +#elif defined(__arm__) + *ext_128=vqaddq_s16(vqsubq_s16(*ext_128,*s1_128++),*s0_128++); +#endif ext_128++; } } @@ -998,8 +1377,10 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y, // printf("crc %x, oldcrc %x\n",crc,oldcrc); +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); +#endif return(iteration_cnt); } diff --git a/openair1/PHY/CODING/3gpplte_turbo_decoder_sse_8bit.c b/openair1/PHY/CODING/3gpplte_turbo_decoder_sse_8bit.c index fecfe8fa7f..806af15086 100644 --- a/openair1/PHY/CODING/3gpplte_turbo_decoder_sse_8bit.c +++ b/openair1/PHY/CODING/3gpplte_turbo_decoder_sse_8bit.c @@ -100,14 +100,13 @@ void 
compute_beta8(llr_t*alpha, llr_t* beta,llr_t* m11,llr_t* m10, unsigned shor void compute_ext8(llr_t* alpha,llr_t* beta,llr_t* m11,llr_t* m10,llr_t* extrinsic, llr_t* ap, unsigned short frame_length); -void print_bytes(char *s, __m128i *x) +void print_bytes(char *s, int8_t *x) { - int8_t *tempb = (int8_t *)x; printf("%s : %d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d\n",s, - tempb[0],tempb[1],tempb[2],tempb[3],tempb[4],tempb[5],tempb[6],tempb[7], - tempb[8],tempb[9],tempb[10],tempb[11],tempb[12],tempb[13],tempb[14],tempb[15]); + x[0],x[1],x[2],x[3],x[4],x[5],x[6],x[7], + x[8],x[9],x[10],x[11],x[12],x[13],x[14],x[15]); } @@ -153,32 +152,47 @@ void compute_gamma8(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity, unsigned short frame_length,unsigned char term_flag) { int k,K1; +#if defined(__x86_64__)||defined(__i386__) __m128i *systematic128 = (__m128i *)systematic; __m128i *y_parity128 = (__m128i *)y_parity; __m128i *m10_128 = (__m128i *)m10; __m128i *m11_128 = (__m128i *)m11; +#elif defined(__arm__) + int8x16_t *systematic128 = (int8x16_t *)systematic; + int8x16_t *y_parity128 = (int8x16_t *)y_parity; + int8x16_t *m10_128 = (int8x16_t *)m10; + int8x16_t *m11_128 = (int8x16_t *)m11; +#endif #ifdef DEBUG_LOGMAP msg("compute_gamma, %p,%p,%p,%p,framelength %d\n",m11,m10,systematic,y_parity,frame_length); #endif +#if defined(__x86_64__) || defined(__i386__) register __m128i sl,sh,ypl,yph; //K128=_mm_set1_epi8(-128); +#endif K1 = (frame_length>>4); for (k=0; k<K1; k++) { +#if defined(__x86_64__) || defined(__i386__) sl = _mm_cvtepi8_epi16(systematic128[k]); - sh = _mm_cvtepi8_epi16(_mm_srli_si128(systematic128[k],8)); + sh = _mm_cvtepi8_epi16(_mm_srli_si128(systematic128[k],8)); ypl = _mm_cvtepi8_epi16(y_parity128[k]); yph = _mm_cvtepi8_epi16(_mm_srli_si128(y_parity128[k],8)); m11_128[k] = _mm_packs_epi16(_mm_srai_epi16(_mm_adds_epi16(sl,ypl),1), _mm_srai_epi16(_mm_adds_epi16(sh,yph),1)); m10_128[k] = _mm_packs_epi16(_mm_srai_epi16(_mm_subs_epi16(sl,ypl),1), _mm_srai_epi16(_mm_subs_epi16(sh,yph),1)); +#elif defined(__arm__) + m11_128[k] = vhaddq_s8(systematic128[k],y_parity128[k]); + m10_128[k] = vhsubq_s8(systematic128[k],y_parity128[k]); +#endif } // Termination +#if defined(__x86_64__) || defined(__i386__) sl = _mm_cvtepi8_epi16(systematic128[k+term_flag]); sh = _mm_cvtepi8_epi16(_mm_srli_si128(systematic128[k],8)); ypl = _mm_cvtepi8_epi16(y_parity128[k+term_flag]); @@ -187,7 +201,10 @@ void compute_gamma8(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity, _mm_srai_epi16(_mm_adds_epi16(sh,yph),1)); m10_128[k] = _mm_packs_epi16(_mm_srai_epi16(_mm_subs_epi16(sl,ypl),1), _mm_srai_epi16(_mm_subs_epi16(sh,yph),1)); - +#elif defined(__arm__) + m11_128[k] = vhaddq_s8(systematic128[k+term_flag],y_parity128[k]); + m10_128[k] = vhsubq_s8(systematic128[k+term_flag],y_parity128[k]); +#endif } @@ -196,14 +213,24 @@ void compute_gamma8(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity, void compute_alpha8(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned short frame_length,unsigned char F) { int k,loopval,rerun_flag; + +#if defined(__x86_64__) || defined(__i386__) __m128i *alpha128=(__m128i *)alpha,*alpha_ptr; __m128i *m11p,*m10p; __m128i m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7; __m128i new0,new1,new2,new3,new4,new5,new6,new7; __m128i alpha_max; - +#elif defined(__arm__) + int8x16_t *alpha128=(int8x16_t *)alpha,*alpha_ptr; + int8x16_t *m11p,*m10p; + int8x16_t m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7; + int8x16_t new0,new1,new2,new3,new4,new5,new6,new7; + int8x16_t 
alpha_max; +#endif // Set initial state: first colum is known // the other columns are unknown, so all states are set to same value + +#if defined(__x86_64__) || defined(__i386__) alpha128[0] = _mm_set_epi8(-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,0); alpha128[1] = _mm_set_epi8(-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2); alpha128[2] = _mm_set_epi8(-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2); @@ -212,10 +239,10 @@ void compute_alpha8(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned sh alpha128[5] = _mm_set_epi8(-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2); alpha128[6] = _mm_set_epi8(-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2); alpha128[7] = _mm_set_epi8(-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2); - for (loopval=frame_length>>4, rerun_flag=0; rerun_flag<2; loopval=L, rerun_flag++) { alpha_ptr = &alpha128[0]; + m11p = (__m128i*)m_11; m10p = (__m128i*)m_10; @@ -289,6 +316,95 @@ void compute_alpha8(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned sh alpha[112] = -MAX8/2; } +#elif defined(__arm__) + alpha128[0] = vdupq_n_s8(-MAX8/2); + alpha128[0] = vsetq_lane_s8(0,alpha128[0],0); + alpha128[1] = vdupq_n_s8(-MAX8/2); + alpha128[2] = vdupq_n_s8(-MAX8/2); + alpha128[3] = vdupq_n_s8(-MAX8/2); + alpha128[4] = vdupq_n_s8(-MAX8/2); + alpha128[5] = vdupq_n_s8(-MAX8/2); + alpha128[6] = vdupq_n_s8(-MAX8/2); + alpha128[7] = vdupq_n_s8(-MAX8/2); + for (loopval=frame_length>>4, rerun_flag=0; rerun_flag<2; loopval=L, rerun_flag++) { + + alpha_ptr = &alpha128[0]; + + m11p = (int8x16_t*)m_11; + m10p = (int8x16_t*)m_10; + + for (k=0; k<loopval; k++) { + m_b0 = vqaddq_s8(alpha_ptr[1],*m11p); // m11 + m_b4 = vqsubq_s8(alpha_ptr[1],*m11p); // m00=-m11 + m_b1 = vqsubq_s8(alpha_ptr[3],*m10p); // m01=-m10 + m_b5 = vqaddq_s8(alpha_ptr[3],*m10p); // m10 + m_b2 = vqaddq_s8(alpha_ptr[5],*m10p); // m10 + m_b6 = vqsubq_s8(alpha_ptr[5],*m10p); // m01=-m10 + m_b3 = vqsubq_s8(alpha_ptr[7],*m11p); // m00=-m11 + m_b7 = vqaddq_s8(alpha_ptr[7],*m11p); // m11 + + new0 = vqsubq_s8(alpha_ptr[0],*m11p); // m00=-m11 + new4 = vqaddq_s8(alpha_ptr[0],*m11p); // m11 + new1 = vqaddq_s8(alpha_ptr[2],*m10p); // m10 + new5 = vqsubq_s8(alpha_ptr[2],*m10p); // m01=-m10 + new2 = vqsubq_s8(alpha_ptr[4],*m10p); // m01=-m10 + new6 = vqaddq_s8(alpha_ptr[4],*m10p); // m10 + new3 = vqaddq_s8(alpha_ptr[6],*m11p); // m11 + new7 = vqsubq_s8(alpha_ptr[6],*m11p); // m00=-m11 + + alpha_ptr += 8; + m11p++; + m10p++; + alpha_ptr[0] = vmaxq_s8(m_b0,new0); + alpha_ptr[1] = vmaxq_s8(m_b1,new1); + alpha_ptr[2] = vmaxq_s8(m_b2,new2); + alpha_ptr[3] = vmaxq_s8(m_b3,new3); + alpha_ptr[4] = vmaxq_s8(m_b4,new4); + alpha_ptr[5] = vmaxq_s8(m_b5,new5); + alpha_ptr[6] = vmaxq_s8(m_b6,new6); + alpha_ptr[7] = vmaxq_s8(m_b7,new7); + + // compute and subtract maxima + alpha_max = vmaxq_s8(alpha_ptr[0],alpha_ptr[1]); + alpha_max = vmaxq_s8(alpha_max,alpha_ptr[2]); + alpha_max = vmaxq_s8(alpha_max,alpha_ptr[3]); + alpha_max = vmaxq_s8(alpha_max,alpha_ptr[4]); + alpha_max = vmaxq_s8(alpha_max,alpha_ptr[5]); + alpha_max = vmaxq_s8(alpha_max,alpha_ptr[6]); + alpha_max 
= vmaxq_s8(alpha_max,alpha_ptr[7]);
+
+      alpha_ptr[0] = vqsubq_s8(alpha_ptr[0],alpha_max);
+      alpha_ptr[1] = vqsubq_s8(alpha_ptr[1],alpha_max);
+      alpha_ptr[2] = vqsubq_s8(alpha_ptr[2],alpha_max);
+      alpha_ptr[3] = vqsubq_s8(alpha_ptr[3],alpha_max);
+      alpha_ptr[4] = vqsubq_s8(alpha_ptr[4],alpha_max);
+      alpha_ptr[5] = vqsubq_s8(alpha_ptr[5],alpha_max);
+      alpha_ptr[6] = vqsubq_s8(alpha_ptr[6],alpha_max);
+      alpha_ptr[7] = vqsubq_s8(alpha_ptr[7],alpha_max);
+    }
+
+    // Set initial state for the next iteration from the last state,
+    // as a column's end states are the first states of the next column
+    int K1= frame_length>>1;
+    alpha128[0] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[K1],8);   alpha128[0] = vsetq_lane_s8(alpha[8],alpha128[0],7);
+    alpha128[1] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[1+K1],8); alpha128[1] = vsetq_lane_s8(alpha[24],alpha128[1],7);
+    alpha128[2] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[2+K1],8); alpha128[2] = vsetq_lane_s8(alpha[40],alpha128[2],7);
+    alpha128[3] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[3+K1],8); alpha128[3] = vsetq_lane_s8(alpha[56],alpha128[3],7);
+    alpha128[4] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[4+K1],8); alpha128[4] = vsetq_lane_s8(alpha[72],alpha128[4],7);
+    alpha128[5] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[5+K1],8); alpha128[5] = vsetq_lane_s8(alpha[88],alpha128[5],7);
+    alpha128[6] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[6+K1],8); alpha128[6] = vsetq_lane_s8(alpha[104],alpha128[6],7);
+    alpha128[7] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[7+K1],8); alpha128[7] = vsetq_lane_s8(alpha[120],alpha128[7],7);
+    alpha[16] = -MAX8/2;
+    alpha[32] = -MAX8/2;
+    alpha[48] = -MAX8/2;
+    alpha[64] = -MAX8/2;
+    alpha[80] = -MAX8/2;
+    alpha[96] = -MAX8/2;
+    alpha[112] = -MAX8/2;
+
+  }
+#endif
+
 }
@@ -297,13 +413,21 @@ void compute_beta8(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sho
 {
   int k,rerun_flag, loopval;
+#if defined(__x86_64__) || defined(__i386__)
   __m128i m11_128,m10_128;
   __m128i m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7;
   __m128i new0,new1,new2,new3,new4,new5,new6,new7;
   __m128i *beta128,*alpha128,*beta_ptr;
   __m128i beta_max;
+#elif defined(__arm__)
+  int8x16_t m11_128,m10_128;
+  int8x16_t m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7;
+  int8x16_t new0,new1,new2,new3,new4,new5,new6,new7;
+  int8x16_t *beta128,*alpha128,*beta_ptr;
+  int8x16_t beta_max;
+#endif
   llr_t beta0,beta1;
   llr_t beta2,beta3,beta4,beta5,beta6,beta7;
@@ -371,8 +495,14 @@ void compute_beta8(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sho
   // we are supposed to run compute_alpha just before compute_beta
   // so the initial states of backward computation can be set from last value of alpha states (forward computation)
+
+#if defined(__x86_64__) || defined(__i386__)
   beta_ptr = (__m128i*)&beta[frame_length<<3];
   alpha128 = (__m128i*)&alpha[0];
+#elif defined(__arm__)
+  beta_ptr = (int8x16_t*)&beta[frame_length<<3];
+  alpha128 = (int8x16_t*)&alpha[0];
+#endif
   beta_ptr[0] = alpha128[(frame_length>>1)];
   beta_ptr[1] = alpha128[1+(frame_length>>1)];
   beta_ptr[2] = alpha128[2+(frame_length>>1)];
@@ -391,6 +521,7 @@
     // workaround: init with 0
     beta0 = beta1 = beta2 = beta3 = beta4 = beta5 = beta6 = beta7 = 0;
+#if defined(__x86_64__) || defined(__i386__)
     beta_ptr[0] = _mm_insert_epi8(beta_ptr[0],beta0,15);
     beta_ptr[1] = _mm_insert_epi8(beta_ptr[1],beta1,15);
     beta_ptr[2] = _mm_insert_epi8(beta_ptr[2],beta2,15);
@@ -399,12 +530,27 @@ void compute_beta8(llr_t* alpha,llr_t*
beta,llr_t *m_11,llr_t* m_10,unsigned sho beta_ptr[5] = _mm_insert_epi8(beta_ptr[5],beta5,15); beta_ptr[6] = _mm_insert_epi8(beta_ptr[6],beta6,15); beta_ptr[7] = _mm_insert_epi8(beta_ptr[7],beta7,15); +#elif defined(__arm__) + beta_ptr[0] = vsetq_lane_s8(beta0,beta_ptr[0],15); + beta_ptr[1] = vsetq_lane_s8(beta1,beta_ptr[1],15); + beta_ptr[2] = vsetq_lane_s8(beta2,beta_ptr[2],15); + beta_ptr[3] = vsetq_lane_s8(beta3,beta_ptr[3],15); + beta_ptr[4] = vsetq_lane_s8(beta4,beta_ptr[4],15); + beta_ptr[5] = vsetq_lane_s8(beta5,beta_ptr[5],15); + beta_ptr[6] = vsetq_lane_s8(beta6,beta_ptr[6],15); + beta_ptr[7] = vsetq_lane_s8(beta7,beta_ptr[7],15); +#endif } - for (k=(frame_length>>4)-1, beta_ptr = (__m128i*)&beta[frame_length<<3] ; +#if defined(__x86_64__) || defined(__i386__) + beta_ptr = (__m128i*)&beta[frame_length<<3]; +#elif defined(__arm__) + beta_ptr = (int8x16_t*)&beta[frame_length<<3]; +#endif + for (k=(frame_length>>4)-1; k>=loopval; k--) { - +#if defined(__x86_64__) || defined(__i386__) m11_128=((__m128i*)m_11)[k]; m10_128=((__m128i*)m_10)[k]; m_b0 = _mm_adds_epi8(beta_ptr[4],m11_128); //m11 @@ -452,12 +598,62 @@ void compute_beta8(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sho beta_ptr[5] = _mm_subs_epi8(beta_ptr[5],beta_max); beta_ptr[6] = _mm_subs_epi8(beta_ptr[6],beta_max); beta_ptr[7] = _mm_subs_epi8(beta_ptr[7],beta_max); +#elif defined(__arm__) + m11_128=((int8x16_t*)m_11)[k]; + m10_128=((int8x16_t*)m_10)[k]; + m_b0 = vqaddq_s8(beta_ptr[4],m11_128); //m11 + m_b1 = vqsubq_s8(beta_ptr[4],m11_128); //m00 + m_b2 = vqsubq_s8(beta_ptr[5],m10_128); //m01 + m_b3 = vqaddq_s8(beta_ptr[5],m10_128); //m10 + m_b4 = vqaddq_s8(beta_ptr[6],m10_128); //m10 + m_b5 = vqsubq_s8(beta_ptr[6],m10_128); //m01 + m_b6 = vqsubq_s8(beta_ptr[7],m11_128); //m00 + m_b7 = vqaddq_s8(beta_ptr[7],m11_128); //m11 + + new0 = vqsubq_s8(beta_ptr[0],m11_128); //m00 + new1 = vqaddq_s8(beta_ptr[0],m11_128); //m11 + new2 = vqaddq_s8(beta_ptr[1],m10_128); //m10 + new3 = vqsubq_s8(beta_ptr[1],m10_128); //m01 + new4 = vqsubq_s8(beta_ptr[2],m10_128); //m01 + new5 = vqaddq_s8(beta_ptr[2],m10_128); //m10 + new6 = vqaddq_s8(beta_ptr[3],m11_128); //m11 + new7 = vqsubq_s8(beta_ptr[3],m11_128); //m00 + + beta_ptr-=8; + beta_ptr[0] = vmaxq_s8(m_b0,new0); + beta_ptr[1] = vmaxq_s8(m_b1,new1); + beta_ptr[2] = vmaxq_s8(m_b2,new2); + beta_ptr[3] = vmaxq_s8(m_b3,new3); + beta_ptr[4] = vmaxq_s8(m_b4,new4); + beta_ptr[5] = vmaxq_s8(m_b5,new5); + beta_ptr[6] = vmaxq_s8(m_b6,new6); + beta_ptr[7] = vmaxq_s8(m_b7,new7); + + beta_max = vmaxq_s8(beta_ptr[0],beta_ptr[1]); + beta_max = vmaxq_s8(beta_max ,beta_ptr[2]); + beta_max = vmaxq_s8(beta_max ,beta_ptr[3]); + beta_max = vmaxq_s8(beta_max ,beta_ptr[4]); + beta_max = vmaxq_s8(beta_max ,beta_ptr[5]); + beta_max = vmaxq_s8(beta_max ,beta_ptr[6]); + beta_max = vmaxq_s8(beta_max ,beta_ptr[7]); + + beta_ptr[0] = vqsubq_s8(beta_ptr[0],beta_max); + beta_ptr[1] = vqsubq_s8(beta_ptr[1],beta_max); + beta_ptr[2] = vqsubq_s8(beta_ptr[2],beta_max); + beta_ptr[3] = vqsubq_s8(beta_ptr[3],beta_max); + beta_ptr[4] = vqsubq_s8(beta_ptr[4],beta_max); + beta_ptr[5] = vqsubq_s8(beta_ptr[5],beta_max); + beta_ptr[6] = vqsubq_s8(beta_ptr[6],beta_max); + beta_ptr[7] = vqsubq_s8(beta_ptr[7],beta_max); +#endif } // Set intial state for next iteration from the last state // as column last states are the first states of the next column - // The initial state of colum 0 is coming from tail bits (to be computed) + // The initial state of column 0 is coming from tail bits (to be computed) + +#if 
defined(__x86_64__) || defined(__i386__) beta128 = (__m128i*)&beta[0]; beta_ptr = (__m128i*)&beta[frame_length<<3]; beta_ptr[0] = _mm_srli_si128(beta128[0],1); @@ -468,12 +664,25 @@ void compute_beta8(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sho beta_ptr[5] = _mm_srli_si128(beta128[5],1); beta_ptr[6] = _mm_srli_si128(beta128[6],1); beta_ptr[7] = _mm_srli_si128(beta128[7],1); +#elif defined(__arm__) + beta128 = (int8x16_t*)&beta[0]; + beta_ptr = (int8x16_t*)&beta[frame_length<<3]; + beta_ptr[0] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[0],8); beta_ptr[0] = vsetq_lane_s8(beta[7],beta_ptr[0],8); + beta_ptr[1] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[1],8); beta_ptr[1] = vsetq_lane_s8(beta[23],beta_ptr[1],8); + beta_ptr[2] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[2],8); beta_ptr[2] = vsetq_lane_s8(beta[39],beta_ptr[2],8); + beta_ptr[3] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[3],8); beta_ptr[3] = vsetq_lane_s8(beta[55],beta_ptr[3],8); + beta_ptr[4] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[4],8); beta_ptr[4] = vsetq_lane_s8(beta[71],beta_ptr[4],8); + beta_ptr[5] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[5],8); beta_ptr[5] = vsetq_lane_s8(beta[87],beta_ptr[5],8); + beta_ptr[6] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[6],8); beta_ptr[6] = vsetq_lane_s8(beta[103],beta_ptr[6],8); + beta_ptr[7] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[7],8); beta_ptr[7] = vsetq_lane_s8(beta[119],beta_ptr[7],8); +#endif } } void compute_ext8(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext, llr_t* systematic,unsigned short frame_length) { +#if defined(__x86_64__) || defined(__i386__) __m128i *alpha128=(__m128i *)alpha; __m128i *beta128=(__m128i *)beta; __m128i *m11_128,*m10_128,*ext_128; @@ -482,6 +691,16 @@ void compute_ext8(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext, l __m128i m01_1,m01_2,m01_3,m01_4; __m128i m10_1,m10_2,m10_3,m10_4; __m128i m11_1,m11_2,m11_3,m11_4; +#elif defined(__arm__) + int8x16_t *alpha128=(int8x16_t *)alpha; + int8x16_t *beta128=(int8x16_t *)beta; + int8x16_t *m11_128,*m10_128,*ext_128; + int8x16_t *alpha_ptr,*beta_ptr; + int8x16_t m00_1,m00_2,m00_3,m00_4; + int8x16_t m01_1,m01_2,m01_3,m01_4; + int8x16_t m10_1,m10_2,m10_3,m10_4; + int8x16_t m11_1,m11_2,m11_3,m11_4; +#endif int k; // @@ -498,6 +717,8 @@ void compute_ext8(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext, l for (k=0; k<(frame_length>>4); k++) { +#if defined(__x86_64__) || defined(__i386__) + m11_128 = (__m128i*)&m_11[k<<4]; m10_128 = (__m128i*)&m_10[k<<4]; ext_128 = (__m128i*)&ext[k<<4]; @@ -547,6 +768,59 @@ void compute_ext8(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext, l alpha_ptr+=8; beta_ptr+=8; +#elif defined(__arm__) + + m11_128 = (int8x16_t*)&m_11[k<<4]; + m10_128 = (int8x16_t*)&m_10[k<<4]; + ext_128 = (int8x16_t*)&ext[k<<4]; + + m00_4 = vqaddq_s8(alpha_ptr[7],beta_ptr[3]); //ALPHA_BETA_4m00; + m11_4 = vqaddq_s8(alpha_ptr[7],beta_ptr[7]); //ALPHA_BETA_4m11; + m00_3 = vqaddq_s8(alpha_ptr[6],beta_ptr[7]); //ALPHA_BETA_3m00; + m11_3 = vqaddq_s8(alpha_ptr[6],beta_ptr[3]); //ALPHA_BETA_3m11; + m00_2 = vqaddq_s8(alpha_ptr[1],beta_ptr[4]); //ALPHA_BETA_2m00; + m11_2 = vqaddq_s8(alpha_ptr[1],beta_ptr[0]); //ALPHA_BETA_2m11; + m11_1 = vqaddq_s8(alpha_ptr[0],beta_ptr[4]); //ALPHA_BETA_1m11; + m00_1 = vqaddq_s8(alpha_ptr[0],beta_ptr[0]); //ALPHA_BETA_1m00; + m01_4 = vqaddq_s8(alpha_ptr[5],beta_ptr[6]); //ALPHA_BETA_4m01; + m10_4 = vqaddq_s8(alpha_ptr[5],beta_ptr[2]); //ALPHA_BETA_4m10; + m01_3 = vqaddq_s8(alpha_ptr[4],beta_ptr[2]); 
//ALPHA_BETA_3m01; + m10_3 = vqaddq_s8(alpha_ptr[4],beta_ptr[6]); //ALPHA_BETA_3m10; + m01_2 = vqaddq_s8(alpha_ptr[3],beta_ptr[1]); //ALPHA_BETA_2m01; + m10_2 = vqaddq_s8(alpha_ptr[3],beta_ptr[5]); //ALPHA_BETA_2m10; + m10_1 = vqaddq_s8(alpha_ptr[2],beta_ptr[1]); //ALPHA_BETA_1m10; + m01_1 = vqaddq_s8(alpha_ptr[2],beta_ptr[5]); //ALPHA_BETA_1m01; + + m01_1 = vmaxq_s8(m01_1,m01_2); + m01_1 = vmaxq_s8(m01_1,m01_3); + m01_1 = vmaxq_s8(m01_1,m01_4); + m00_1 = vmaxq_s8(m00_1,m00_2); + m00_1 = vmaxq_s8(m00_1,m00_3); + m00_1 = vmaxq_s8(m00_1,m00_4); + m10_1 = vmaxq_s8(m10_1,m10_2); + m10_1 = vmaxq_s8(m10_1,m10_3); + m10_1 = vmaxq_s8(m10_1,m10_4); + m11_1 = vmaxq_s8(m11_1,m11_2); + m11_1 = vmaxq_s8(m11_1,m11_3); + m11_1 = vmaxq_s8(m11_1,m11_4); + + + m01_1 = vqsubq_s8(m01_1,*m10_128); + m00_1 = vqsubq_s8(m00_1,*m11_128); + m10_1 = vqaddq_s8(m10_1,*m10_128); + m11_1 = vqaddq_s8(m11_1,*m11_128); + + + m01_1 = vmaxq_s8(m01_1,m00_1); + m10_1 = vmaxq_s8(m10_1,m11_1); + + + *ext_128 = vqsubq_s8(m10_1,m01_1); + + alpha_ptr+=8; + beta_ptr+=8; + +#endif } @@ -661,14 +935,25 @@ unsigned char phy_threegpplte_turbo_decoder8(short *y, // int *pi2_p,*pi4_p,*pi5_p,*pi6_p; int *pi4_p,*pi5_p,*pi6_p; llr_t *s,*s1,*s2,*yp1,*yp2,*yp; - __m128i *yp128; + unsigned int i,j,iind;//,pi; unsigned char iteration_cnt=0; unsigned int crc,oldcrc,crc_len; uint8_t temp; +#if defined(__x86_64__) || defined(__i386__) + __m128i *yp128; __m128i tmp128[(n+8)>>3]; __m128i tmp, zeros=_mm_setzero_si128(); - +#elif defined(__arm__) + int8x16_t *yp128; + int8x16_t tmp128[(n+8)>>3]; + int8x16_t tmp, zeros=vdupq_n_s8(0); + const uint8_t __attribute__ ((aligned (16))) _Powers[16]= + { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 }; + + // Set the powers of 2 (do it once for all, if applicable) + uint8x16_t Powers= vld1q_u8(_Powers); +#endif int offset8_flag=0; @@ -713,6 +998,8 @@ unsigned char phy_threegpplte_turbo_decoder8(short *y, crc_len=3; } +#if defined(__x86_64__) || defined(__i386__) + __m128i avg=_mm_set1_epi32(0); for (i=0; i<(3*(n>>4))+1; i++) { @@ -721,7 +1008,7 @@ unsigned char phy_threegpplte_turbo_decoder8(short *y, avg=_mm_add_epi32(_mm_cvtepi16_epi32(tmp),avg); } - int round_avg=(_mm_extract_epi32(avg,0)+_mm_extract_epi32(avg,1)+_mm_extract_epi32(avg,2)+_mm_extract_epi32(avg,3))/(n*3); + int32_t round_avg=(_mm_extract_epi32(avg,0)+_mm_extract_epi32(avg,1)+_mm_extract_epi32(avg,2)+_mm_extract_epi32(avg,3))/(n*3); //printf("avg input turbo: %d sum %d taille bloc %d\n",round_avg,round_sum,n); @@ -740,6 +1027,35 @@ unsigned char phy_threegpplte_turbo_decoder8(short *y, yp128 = (__m128i*)y8; +#elif defined(__arm__) + + int32x4_t avg=vdupq_n_s32(0); + + for (i=0; i<(3*(n>>4))+1; i++) { + int16x8_t tmp=vabsq_s16(((int16x8_t*)y)[i]); + avg = vqaddq_s32(avg,vaddl_s16(((int16x4_t*)&tmp)[0],((int16x4_t*)&tmp)[1])); + } + + int32_t round_avg=(vgetq_lane_s32(avg,0)+vgetq_lane_s32(avg,1)+vgetq_lane_s32(avg,2)+vgetq_lane_s32(avg,3))/(n*3); + + //printf("avg input turbo: %d sum %d taille bloc %d\n",round_avg,round_sum,n); + + if (round_avg < 16 ) + for (i=0,j=0; i<(3*(n2>>3))+1; i++,j+=2) + ((int8x8_t *)y8)[i] = vqmovn_s16(((int16x8_t *)y)[j]); + else if (round_avg < 32) + for (i=0,j=0; i<(3*(n2>>3))+1; i++,j+=2) + ((int8x8_t *)y8)[i] = vqmovn_s16(vshrq_n_s16(((int16x8_t *)y)[j],1)); + else if (round_avg < 64 ) + for (i=0,j=0; i<(3*(n2>>3))+1; i++,j+=2) + ((int8x8_t *)y8)[i] = vqmovn_s16(vshrq_n_s16(((int16x8_t *)y)[j],2)); + else + for (i=0,j=0; i<(3*(n2>>3))+1; i++,j+=2) + ((int8x8_t *)y8)[i] = 
vqmovn_s16(vshrq_n_s16(((int16x8_t *)y)[j],3)); + + yp128 = (int8x16_t*)y8; + +#endif s = systematic0; s1 = systematic1; @@ -764,101 +1080,198 @@ unsigned char phy_threegpplte_turbo_decoder8(short *y, pi2_p = &pi2tab8[iind][i]; j=pi2_p[0]; +#if defined(__x86_64__) || defined(__i386__) s[j] = _mm_extract_epi8(yp128[0],0); yp1[j] = _mm_extract_epi8(yp128[0],1); yp2[j] = _mm_extract_epi8(yp128[0],2); - // printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]); + j=pi2_p[1]; s[j] = _mm_extract_epi8(yp128[0],3); yp1[j] = _mm_extract_epi8(yp128[0],4); yp2[j] = _mm_extract_epi8(yp128[0],5); - // printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]); + j=pi2_p[2]; s[j] = _mm_extract_epi8(yp128[0],6); yp1[j] = _mm_extract_epi8(yp128[0],7); yp2[j] = _mm_extract_epi8(yp128[0],8); - // printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]); + j=pi2_p[3]; s[j] = _mm_extract_epi8(yp128[0],9); yp1[j] = _mm_extract_epi8(yp128[0],10); yp2[j] = _mm_extract_epi8(yp128[0],11); - // printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]); + j=pi2_p[4]; s[j] = _mm_extract_epi8(yp128[0],12); yp1[j] = _mm_extract_epi8(yp128[0],13); yp2[j] = _mm_extract_epi8(yp128[0],14); - // printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]); + j=pi2_p[5]; s[j] = _mm_extract_epi8(yp128[0],15); yp1[j] = _mm_extract_epi8(yp128[1],0); yp2[j] = _mm_extract_epi8(yp128[1],1); - // printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]); + j=pi2_p[6]; s[j] = _mm_extract_epi8(yp128[1],2); yp1[j] = _mm_extract_epi8(yp128[1],3); yp2[j] = _mm_extract_epi8(yp128[1],4); - // printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]); + j=pi2_p[7]; s[j] = _mm_extract_epi8(yp128[1],5); yp1[j] = _mm_extract_epi8(yp128[1],6); yp2[j] = _mm_extract_epi8(yp128[1],7); - // printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]); + j=pi2_p[8]; s[j] = _mm_extract_epi8(yp128[1],8); yp1[j] = _mm_extract_epi8(yp128[1],9); yp2[j] = _mm_extract_epi8(yp128[1],10); - // printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]); + j=pi2_p[9]; s[j] = _mm_extract_epi8(yp128[1],11); yp1[j] = _mm_extract_epi8(yp128[1],12); yp2[j] = _mm_extract_epi8(yp128[1],13); - // printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]); + j=pi2_p[10]; s[j] = _mm_extract_epi8(yp128[1],14); yp1[j] = _mm_extract_epi8(yp128[1],15); yp2[j] = _mm_extract_epi8(yp128[2],0); - // printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]); + j=pi2_p[11]; s[j] = _mm_extract_epi8(yp128[2],1); yp1[j] = _mm_extract_epi8(yp128[2],2); yp2[j] = _mm_extract_epi8(yp128[2],3); - // printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]); + j=pi2_p[12]; s[j] = _mm_extract_epi8(yp128[2],4); yp1[j] = _mm_extract_epi8(yp128[2],5); yp2[j] = _mm_extract_epi8(yp128[2],6); - // printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]); + j=pi2_p[13]; s[j] = _mm_extract_epi8(yp128[2],7); yp1[j] = _mm_extract_epi8(yp128[2],8); yp2[j] = _mm_extract_epi8(yp128[2],9); - // printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]); + j=pi2_p[14]; s[j] = _mm_extract_epi8(yp128[2],10); yp1[j] = _mm_extract_epi8(yp128[2],11); yp2[j] = _mm_extract_epi8(yp128[2],12); - // printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]); + j=pi2_p[15]; s[j] = _mm_extract_epi8(yp128[2],13); yp1[j] = _mm_extract_epi8(yp128[2],14); yp2[j] = _mm_extract_epi8(yp128[2],15); - // 
printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]); + +#elif defined(__arm__) + s[j] = vgetq_lane_s8(yp128[0],0); + yp1[j] = vgetq_lane_s8(yp128[0],1); + yp2[j] = vgetq_lane_s8(yp128[0],2); + + + j=pi2_p[1]; + s[j] = vgetq_lane_s8(yp128[0],3); + yp1[j] = vgetq_lane_s8(yp128[0],4); + yp2[j] = vgetq_lane_s8(yp128[0],5); + + + j=pi2_p[2]; + s[j] = vgetq_lane_s8(yp128[0],6); + yp1[j] = vgetq_lane_s8(yp128[0],7); + yp2[j] = vgetq_lane_s8(yp128[0],8); + + + j=pi2_p[3]; + s[j] = vgetq_lane_s8(yp128[0],9); + yp1[j] = vgetq_lane_s8(yp128[0],10); + yp2[j] = vgetq_lane_s8(yp128[0],11); + + + j=pi2_p[4]; + s[j] = vgetq_lane_s8(yp128[0],12); + yp1[j] = vgetq_lane_s8(yp128[0],13); + yp2[j] = vgetq_lane_s8(yp128[0],14); + + + j=pi2_p[5]; + s[j] = vgetq_lane_s8(yp128[0],15); + yp1[j] = vgetq_lane_s8(yp128[1],0); + yp2[j] = vgetq_lane_s8(yp128[1],1); + + + j=pi2_p[6]; + s[j] = vgetq_lane_s8(yp128[1],2); + yp1[j] = vgetq_lane_s8(yp128[1],3); + yp2[j] = vgetq_lane_s8(yp128[1],4); + + + j=pi2_p[7]; + s[j] = vgetq_lane_s8(yp128[1],5); + yp1[j] = vgetq_lane_s8(yp128[1],6); + yp2[j] = vgetq_lane_s8(yp128[1],7); + + + j=pi2_p[8]; + s[j] = vgetq_lane_s8(yp128[1],8); + yp1[j] = vgetq_lane_s8(yp128[1],9); + yp2[j] = vgetq_lane_s8(yp128[1],10); + + + j=pi2_p[9]; + s[j] = vgetq_lane_s8(yp128[1],11); + yp1[j] = vgetq_lane_s8(yp128[1],12); + yp2[j] = vgetq_lane_s8(yp128[1],13); + + + j=pi2_p[10]; + s[j] = vgetq_lane_s8(yp128[1],14); + yp1[j] = vgetq_lane_s8(yp128[1],15); + yp2[j] = vgetq_lane_s8(yp128[2],0); + + + j=pi2_p[11]; + s[j] = vgetq_lane_s8(yp128[2],1); + yp1[j] = vgetq_lane_s8(yp128[2],2); + yp2[j] = vgetq_lane_s8(yp128[2],3); + + + j=pi2_p[12]; + s[j] = vgetq_lane_s8(yp128[2],4); + yp1[j] = vgetq_lane_s8(yp128[2],5); + yp2[j] = vgetq_lane_s8(yp128[2],6); + + + j=pi2_p[13]; + s[j] = vgetq_lane_s8(yp128[2],7); + yp1[j] = vgetq_lane_s8(yp128[2],8); + yp2[j] = vgetq_lane_s8(yp128[2],9); + + + j=pi2_p[14]; + s[j] = vgetq_lane_s8(yp128[2],10); + yp1[j] = vgetq_lane_s8(yp128[2],11); + yp2[j] = vgetq_lane_s8(yp128[2],12); + + + j=pi2_p[15]; + s[j] = vgetq_lane_s8(yp128[2],13); + yp1[j] = vgetq_lane_s8(yp128[2],14); + yp2[j] = vgetq_lane_s8(yp128[2],15); + +#endif yp128+=3; } @@ -925,6 +1338,7 @@ unsigned char phy_threegpplte_turbo_decoder8(short *y, pi4_p=pi4tab8[iind]; for (i=0; i<(n2>>4); i++) { // steady-state portion +#if defined(__x86_64__) || defined(__i386__) tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],0); tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],1); tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],2); @@ -941,6 +1355,24 @@ unsigned char phy_threegpplte_turbo_decoder8(short *y, tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],13); tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],14); ((__m128i *)systematic2)[i]=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],15); +#elif defined(__arm__) + tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,0); + tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,1); + tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,2); + tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,3); + tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,4); + tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,5); + tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,6); + tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,7); + tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,8); + tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,9); + tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,10); + tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,11); + tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,12); + 
tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,13); + tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,14); + ((int8x16_t *)systematic2)[i]=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,15); +#endif } stop_meas(intl1_stats); @@ -956,6 +1388,7 @@ unsigned char phy_threegpplte_turbo_decoder8(short *y, if ((n2&0x7f) == 0) { // n2 is a multiple of 128 bits for (i=0; i<(n2>>4); i++) { +#if defined(__x86_64__) || defined(__i386__) tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],0); tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],1); tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],2); @@ -974,9 +1407,32 @@ unsigned char phy_threegpplte_turbo_decoder8(short *y, tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],15); decoded_bytes_interl[i]=(uint16_t) _mm_movemask_epi8(_mm_cmpgt_epi8(tmp,zeros)); ((__m128i *)systematic1)[i] = _mm_adds_epi8(_mm_subs_epi8(tmp,((__m128i*)ext)[i]),((__m128i *)systematic0)[i]); +#elif defined(__arm__) + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,0); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,1); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,2); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,3); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,4); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,5); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,6); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,7); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,8); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,9); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,10); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,11); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,12); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,13); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,14); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,15); + uint64x2_t Mask= vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8(vcgtq_s8(tmp,zeros), Powers)))); + vst1q_lane_u8(&((uint8_t*)&decoded_bytes[i])[0], (uint8x16_t)Mask, 0); + vst1q_lane_u8(&((uint8_t*)&decoded_bytes[i])[1], (uint8x16_t)Mask, 8); + ((int8x16_t *)systematic1)[i] = vqaddq_s8(vqsubq_s8(tmp,((int8x16_t*)ext)[i]),((int8x16_t *)systematic0)[i]); +#endif } } else { for (i=0; i<(n2>>4); i++) { +#if defined(__x86_64__) || defined(__i386__) tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],0); tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],1); tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],2); @@ -996,7 +1452,29 @@ unsigned char phy_threegpplte_turbo_decoder8(short *y, tmp128[i] = _mm_adds_epi8(((__m128i *)ext2)[i],((__m128i *)systematic2)[i]); ((__m128i *)systematic1)[i] = _mm_adds_epi8(_mm_subs_epi8(tmp,((__m128i*)ext)[i]),((__m128i *)systematic0)[i]); - } +#elif defined(__arm__) + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,0); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,1); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,2); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,3); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,4); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,5); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,6); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,7); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,8); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,9); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,10); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,11); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,12); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,13); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,14); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,15); + tmp128[i] = vqaddq_s8(((int8x16_t *)ext2)[i],((int8x16_t *)systematic2)[i]); + + ((int8x16_t *)systematic1)[i] = vqaddq_s8(vqsubq_s8(tmp,((int8x16_t*)ext)[i]),((int8x16_t *)systematic0)[i]); + +#endif + } } // Check if we decoded the block @@ -1007,6 +1485,7 @@ unsigned char phy_threegpplte_turbo_decoder8(short *y, // re-order the decoded bits in theregular order // as it is presently ordered as 16 
sequential columns
+#if defined(__x86_64__) || defined(__i386__)
      __m128i* dbytes=(__m128i*)decoded_bytes_interl;
      __m128i shuffle=SHUFFLE16(7,6,5,4,3,2,1,0);
      __m128i mask  __attribute__((aligned(16)));
@@ -1031,10 +1510,31 @@ unsigned char phy_threegpplte_turbo_decoder8(short *y,
          decoded_bytes[n_128*j +i]=(uint8_t) _mm_movemask_epi8(_mm_packs_epi16(tmp2,zeros));
        }
      }
+#elif defined(__arm__)
+      uint8x16_t* dbytes=(uint8x16_t*)decoded_bytes_interl;
+      uint16x8_t mask  __attribute__((aligned(16)));
+      int n_128=n2>>7;
+
+      for (i=0; i<n_128; i++) {
+        mask=vdupq_n_u16(1);
+        uint8x16_t tmp __attribute__((aligned(16)));
+        tmp=vcombine_u8(vrev64_u8(((uint8x8_t*)&dbytes[i])[1]),vrev64_u8(((uint8x8_t*)&dbytes[i])[0]));
+        vst1q_lane_u8(&decoded_bytes[n_128*0+i],(uint8x16_t)vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8(tmp, Powers)))),0);
+
+        int j;
+
+        for (j=1; j<16; j++) {
+          mask=vshlq_n_u16(mask,1);
+          vst1q_lane_u8(&decoded_bytes[n_128*j+i],(uint8x16_t)vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8(tmp, Powers)))),0);
+        }
+      }
+
+#endif
    } else {
      pi6_p=pi6tab8[iind];

      for (i=0; i<(n2>>4); i++) {
+#if defined(__x86_64__) || defined(__i386__)
        tmp=_mm_insert_epi8(tmp, ((llr_t *)tmp128)[*pi6_p++],7);
        tmp=_mm_insert_epi8(tmp, ((llr_t *)tmp128)[*pi6_p++],6);
        tmp=_mm_insert_epi8(tmp, ((llr_t *)tmp128)[*pi6_p++],5);
@@ -1053,6 +1553,27 @@ unsigned char phy_threegpplte_turbo_decoder8(short *y,
        tmp=_mm_insert_epi8(tmp, ((llr_t *)tmp128)[*pi6_p++],8);
        tmp=_mm_cmpgt_epi8(tmp,zeros);
        ((uint16_t *)decoded_bytes)[i]=(uint16_t)_mm_movemask_epi8(tmp);
+#elif defined(__arm__)
+        tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,7);
+        tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,6);
+        tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,5);
+        tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,4);
+        tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,3);
+        tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,2);
+        tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,1);
+        tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,0);
+        tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,15);
+        tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,14);
+        tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,13);
+        tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,12);
+        tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,11);
+        tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,10);
+        tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,9);
+        tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,8);
+        uint64x2_t Mask= vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8(vcgtq_s8(tmp,zeros), Powers))));
+        vst1q_lane_u8(&((uint8_t*)&decoded_bytes[i])[0], (uint8x16_t)Mask, 0);
+        vst1q_lane_u8(&((uint8_t*)&decoded_bytes[i])[1], (uint8x16_t)Mask, 8);
+#endif
      }
    }
@@ -1107,17 +1628,28 @@ unsigned char phy_threegpplte_turbo_decoder8(short *y,
    // do a new iteration if it is not yet decoded
    if (iteration_cnt < max_iterations) {
      log_map8(systematic1,yparity1,m11,m10,alpha,beta,ext,n2,0,F,offset8_flag,alpha_stats,beta_stats,gamma_stats,ext_stats);
+#if defined(__x86_64__) || defined(__i386__)
      __m128i* ext_128=(__m128i*) ext;
      __m128i* s1_128=(__m128i*) systematic1;
      __m128i* s0_128=(__m128i*) systematic0;
+#elif defined(__arm__)
+      int8x16_t* ext_128=(int8x16_t*) ext;
+      int8x16_t* s1_128=(int8x16_t*) systematic1;
+      int8x16_t* s0_128=(int8x16_t*) systematic0;
+#endif
      int myloop=n2>>4;

      for (i=0; i<myloop; i++) {
+#if defined(__x86_64__) || defined(__i386__)
        *ext_128=_mm_adds_epi8(_mm_subs_epi8(*ext_128,*s1_128++),*s0_128++);
+#elif defined(__arm__)
+
*ext_128=vqaddq_s8(vqsubq_s8(*ext_128,*s1_128++),*s0_128++); +#endif ext_128++; } } } return(iteration_cnt); + } diff --git a/openair1/PHY/CODING/Makefile b/openair1/PHY/CODING/Makefile index f9b15ebd27..b323c479d3 100644 --- a/openair1/PHY/CODING/Makefile +++ b/openair1/PHY/CODING/Makefile @@ -1,29 +1,13 @@ -TURBO_SRC = 3gpplte.c 3gpplte_turbo_decoder_sse.c crc_byte.c +TURBO_SRC = 3gpplte_sse.c 3gpplte_turbo_decoder_sse.c crc_byte.c RATE13CC_SRC = ccoding_byte_lte.c viterbi_lte.c crc_byte.c RATE12CC_SRC = ccoding_byte.c viterbi.c crc_byte.c -all: turbolte_test rate13cc_test rate12cc_test run_turbo run_rate13cc run_rate13ccdab run_rate12cc +all: 3gpplte_sse -turbolte_test: $(TURBO_SRC) - gcc -o turbo_test $(TURBO_SRC) -DTEST_DEBUG -DUSER_MODE -msse2 -mssse3 -Wall +3gpplte_sse: $(TURBO_SRC) + gcc -o 3gpplte_sse 3gpplte_sse.c -msse4 -Wall -g -ggdb -DMAIN -rate13cc_test: $(RATE13CC_SRC) - gcc -o rate13cc_test $(RATE13CC_SRC) -DTEST_DEBUG -DUSER_MODE -msse2 -mssse3 -Wall -rate12cc_test: $(RATE12CC_SRC) - gcc -o rate12cc_test $(RATE12CC_SRC) -DTEST_DEBUG -DUSER_MODE -msse2 -mssse3 -Wall - -run_turbo: turbolte_test - ./turbo_test - -run_rate13cc: rate13cc_test - ./rate13cc_test - -run_rate13ccdab: rate13cc_test - ./rate13cc_test -d - -run_rate12cc: rate12cc_test - ./rate12cc_test clean: rm *.o diff --git a/openair1/PHY/CODING/ccoding_byte_lte.c b/openair1/PHY/CODING/ccoding_byte_lte.c index d6f31b1ab4..b399d0186b 100644 --- a/openair1/PHY/CODING/ccoding_byte_lte.c +++ b/openair1/PHY/CODING/ccoding_byte_lte.c @@ -55,22 +55,22 @@ unsigned char ccodelte_table_rev[128]; // for receiver void -ccodelte_encode (unsigned int numbits, - unsigned char add_crc, - unsigned char *inPtr, - unsigned char *outPtr, - unsigned short rnti) +ccodelte_encode (int32_t numbits, + uint8_t add_crc, + uint8_t *inPtr, + uint8_t *outPtr, + uint16_t rnti) { - unsigned int state; + uint32_t state; - unsigned char c, out, first_bit; - char shiftbit=0; - unsigned short c16; - unsigned short next_last_byte=0; - unsigned int crc=0; + uint8_t c, out, first_bit; + int8_t shiftbit=0; + uint16_t c16; + uint16_t next_last_byte=0; + uint32_t crc=0; #ifdef DEBUG_CCODE - unsigned int dummy=0; + uint32_t dummy=0; #endif //DEBUG_CCODE /* The input bit is shifted in position 8 of the state. 
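For reference, this table-driven encoder implements the 36.212 rate-1/3, constraint-length-7 tail-biting convolutional code. A minimal scalar sketch of one encoding step, assuming the standard generator polynomials G0=0133, G1=0171, G2=0165 (octal) and a newest-bit-in-bit-6 register layout; this is illustrative only and does not mirror the byte-table layout used by ccodelte_encode:

    #include <stdint.h>

    /* One step of the LTE rate-1/3, K=7 convolutional code (36.212 5.1.3.1).
       's' is a 7-bit window, newest bit in bit 6, oldest in bit 0; for
       tail-biting it is pre-loaded from the last 6 information bits.
       Returns the three coded bits packed as d0 | d1<<1 | d2<<2. */
    static inline uint8_t conv_step(uint8_t *s, uint8_t bit)
    {
      *s = (uint8_t)(((bit & 1) << 6) | (*s >> 1));
      uint8_t d0 = (uint8_t)__builtin_parity(*s & 0133); /* 1+D^2+D^3+D^5+D^6 */
      uint8_t d1 = (uint8_t)__builtin_parity(*s & 0171); /* 1+D+D^2+D^3+D^6   */
      uint8_t d2 = (uint8_t)__builtin_parity(*s & 0165); /* 1+D+D^2+D^4+D^6   */
      return (uint8_t)(d0 | (d1 << 1) | (d2 << 2));
    }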
@@ -80,20 +80,19 @@ ccodelte_encode (unsigned int numbits, if (add_crc == 1) { crc = crc8(inPtr,numbits); first_bit = 2; - c = (unsigned char)(crc>>24); + c = (uint8_t)(crc>>24); } else if (add_crc == 2) { crc = crc16(inPtr,numbits); #ifdef DEBUG_CCODE printf("ccode_lte : crc %x\n",crc); #endif // scramble with RNTI - crc ^= (((unsigned int)rnti)<<16); + crc ^= (((uint32_t)rnti)<<16); #ifdef DEBUG_CCODE printf("ccode_lte : crc %x (rnti %x)\n",crc,rnti); #endif first_bit = 2; - // c = (unsigned char)(crc>>24); - c = (unsigned char)((crc>>16)&0xff); + c = (uint8_t)((crc>>16)&0xff); } else { next_last_byte = numbits>>3; first_bit = (numbits-6)&7; @@ -182,7 +181,7 @@ ccodelte_encode (unsigned int numbits, // now code 8-bit CRC for UCI if (add_crc == 1) { - c = (unsigned char)(crc>>24); + c = (uint8_t)(crc>>24); // for (shiftbit = 0; (shiftbit<8);shiftbit++) { for (shiftbit = 7; (shiftbit>=0); shiftbit--) { @@ -209,7 +208,7 @@ ccodelte_encode (unsigned int numbits, // now code 16-bit CRC for DCI if (add_crc == 2) { - c16 = (unsigned short)(crc>>16); + c16 = (uint16_t)(crc>>16); // for (shiftbit = 0; (shiftbit<16);shiftbit++) { for (shiftbit = 15; (shiftbit>=0); shiftbit--) { diff --git a/openair1/PHY/CODING/defs.h b/openair1/PHY/CODING/defs.h index dbd2e4790d..21767a1f33 100644 --- a/openair1/PHY/CODING/defs.h +++ b/openair1/PHY/CODING/defs.h @@ -320,7 +320,7 @@ void threegpplte_turbo_encoder(uint8_t *input, uint16_t interleaver_f2); -/** \fn void ccodelte_encode(uint32_t numbits,uint8_t add_crc, uint8_t *inPtr,uint8_t *outPtr,uint16_t rnti) +/** \fn void ccodelte_encode(int32_t numbits,uint8_t add_crc, uint8_t *inPtr,uint8_t *outPtr,uint16_t rnti) \brief This function implements the LTE convolutional code of rate 1/3 with a constraint length of 7 bits. The inputs are bit packed in octets (from MSB to LSB). Trellis tail-biting is included here. 
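A usage sketch for the prototype documented above, with hypothetical sizes; the one-coded-bit-per-output-byte convention for outPtr is an assumption based on how the d-sequence is consumed downstream, not something this header states:

    uint8_t a[8];               /* 57-bit DCI payload, packed MSB-first      */
    uint8_t d[3 * (57 + 16)];   /* rate-1/3 output: payload plus 16-bit CRC  */
    uint16_t rnti = 0x1234;     /* the 16-bit CRC is scrambled with the RNTI */

    ccodelte_encode(57, 2, a, d, rnti);  /* add_crc==2 selects the DCI CRC16 */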
@@ -331,7 +331,7 @@ void threegpplte_turbo_encoder(uint8_t *input, @param rnti RNTI for CRC scrambling */ void -ccodelte_encode (uint32_t numbits, +ccodelte_encode (int32_t numbits, uint8_t add_crc, uint8_t *inPtr, uint8_t *outPtr, diff --git a/openair1/PHY/CODING/viterbi.c b/openair1/PHY/CODING/viterbi.c index 4b0fb0c70a..118e0ef2d8 100755 --- a/openair1/PHY/CODING/viterbi.c +++ b/openair1/PHY/CODING/viterbi.c @@ -33,9 +33,8 @@ */ -#ifndef EXPRESSMIMO_TARGET + #include "PHY/sse_intrin.h" -#endif //EXPRESSMIMO_TARGET extern unsigned char ccodedot11_table[128],ccodedot11_table_rev[128]; @@ -46,12 +45,6 @@ static unsigned char inputs[64][2048]; static unsigned short survivors[64][2048]; static short partial_metrics[64],partial_metrics_new[64]; -#ifdef __KERNEL__ -#define printf rt_printk -#endif - -#ifndef EXPRESSMIMO_TARGET - void phy_viterbi_dot11(char *y,unsigned char *decoded_bytes,unsigned short n) { @@ -191,22 +184,34 @@ void phy_generate_viterbi_tables(void) #define INIT0 0x00000080 -#define RESCALE 0x00000040 - - -static __m128i __attribute__((aligned(16))) TB[4*4095*8]; -static __m128i metrics0_15,metrics16_31,metrics32_47,metrics48_63,even0_30a,even0_30b,even32_62a,even32_62b,odd1_31a,odd1_31b,odd33_63a,odd33_63b,TBeven0_30,TBeven32_62,TBodd1_31, - TBodd33_63 __attribute__((aligned(16))); -static __m128i rescale,min_state,min_state2 __attribute__((aligned(16))); void phy_viterbi_dot11_sse2(char *y,unsigned char *decoded_bytes,unsigned short n,int offset, int traceback ) { +#if defined(__x86_64__) || defined(__i386__) + __m128i TB[4*4095*8]; // 4 __m128i per input bit (64 states, 8-bits per state = 16-way), 4095 is largest packet size in bytes, 8 bits/byte + + __m128i metrics0_15,metrics16_31,metrics32_47,metrics48_63,even0_30a,even0_30b,even32_62a,even32_62b,odd1_31a,odd1_31b,odd33_63a,odd33_63b,TBeven0_30,TBeven32_62,TBodd1_31,TBodd33_63; + + __m128i min_state,min_state2; + __m128i *m0_ptr,*m1_ptr,*TB_ptr = &TB[offset<<2]; +#elif defined(__arm__) + uint8x16x2_t TB[2*4095*8]; // 2 int8x16_t per input bit, 8 bits / byte, 4095 is largest packet size in bytes + + uint8x16_t even0_30a,even0_30b,even32_62a,even32_62b,odd1_31a,odd1_31b,odd33_63a,odd33_63b,TBeven0_30,TBeven32_62,TBodd1_31,TBodd33_63; + uint8x16x2_t metrics0_31,metrics32_63; + + uint8x16_t min_state; + + uint8x16_t *m0_ptr,*m1_ptr; + uint8x16x2_t *TB_ptr = &TB[offset<<1]; + +#endif char *in = y; unsigned char prev_state0; @@ -216,6 +221,7 @@ void phy_viterbi_dot11_sse2(char *y,unsigned char *decoded_bytes,unsigned short short position; // printf("offset %d, TB_ptr %p\n",offset,TB_ptr); +#if defined(__x86_64__) || defined(__i386__) if (offset == 0) { // set initial metrics @@ -225,129 +231,64 @@ void phy_viterbi_dot11_sse2(char *y,unsigned char *decoded_bytes,unsigned short metrics48_63 = _mm_setzero_si128(); } - rescale = _mm_cvtsi32_si128(RESCALE); +#elif defined(__arm__) + if (offset == 0) { + // set initial metrics - /* - print_bytes(metrics0_15,"metrics0_15"); - print_bytes(metrics16_31,"metrics16_31"); - print_bytes(metrics32_47,"metrics32_47"); - print_bytes(metrics48_63,"metrics48_63"); - */ + metrics0_31.val[0] = vdupq_n_u8(0); metrics0_31.val[0] = vsetq_lane_u8(INIT0,metrics0_31.val[0],0); + metrics0_31.val[1] = vdupq_n_u8(0); + metrics32_63.val[0] = vdupq_n_u8(0); + metrics32_63.val[1] = vdupq_n_u8(0); + } - for (position=offset; position<(offset+n); position++) { +#endif + for (position=offset; position<(offset+n); position++) { //printf("%d : (%d,%d)\n",position,in[0],in[1]); // get branch metric offsets for 
the 64 states table_offset = (in[0]+8 + ((in[1]+8)<<4))<<6; - // printf("Table_offset = %u (in[0]=%d,in[1]=%d)\n",table_offset,in[0],in[1]); - +#if defined(__x86_64__) || defined(__i386__) m0_ptr = (__m128i *)&m0_table[table_offset]; m1_ptr = (__m128i *)&m1_table[table_offset]; - // printf("\n"); // even states even0_30a = _mm_adds_epu8(metrics0_15,m0_ptr[0]); - // print_bytes(even0_30a,"even0_30a"); - even32_62a = _mm_adds_epu8(metrics16_31,m0_ptr[1]); - // print_bytes(even32_62a,"even32_62a"); - even0_30b = _mm_adds_epu8(metrics32_47,m0_ptr[2]); - // print_bytes(even0_30b,"even0_30b"); - even32_62b = _mm_adds_epu8(metrics48_63,m0_ptr[3]); - // print_bytes(even32_62b,"even32_62b"); - - // printf("\n"); // odd states odd1_31a = _mm_adds_epu8(metrics0_15,m1_ptr[0]); - - // print_bytes(odd1_31a,"odd1_31a"); - odd33_63a = _mm_adds_epu8(metrics16_31,m1_ptr[1]); - - // print_bytes(odd33_63a,"odd33_63a"); - odd1_31b = _mm_adds_epu8(metrics32_47,m1_ptr[2]); - - // print_bytes(odd1_31b,"odd1_31b"); - odd33_63b = _mm_adds_epu8(metrics48_63,m1_ptr[3]); - - // print_bytes(odd33_63b,"odd33_63b"); - - - - // select maxima - // printf("\n"); - even0_30a = _mm_max_epu8(even0_30a,even0_30b); - - // print_bytes(even0_30a,"even0_30a"); - even32_62a = _mm_max_epu8(even32_62a,even32_62b); - - // print_bytes(even32_62a,"even32_62a"); - odd1_31a = _mm_max_epu8(odd1_31a,odd1_31b); - - // print_bytes(odd1_31a,"odd1_31a"); - odd33_63a = _mm_max_epu8(odd33_63a,odd33_63b); - // print_bytes(odd33_63a,"odd33_63a"); - - // printf("\n"); // Traceback information - TBeven0_30 = _mm_cmpeq_epi8(even0_30a,even0_30b); - - TBeven32_62 = _mm_cmpeq_epi8(even32_62a,even32_62b); - - TBodd1_31 = _mm_cmpeq_epi8(odd1_31a,odd1_31b); - - TBodd33_63 = _mm_cmpeq_epi8(odd33_63a,odd33_63b); - metrics0_15 = _mm_unpacklo_epi8(even0_30a ,odd1_31a); metrics16_31 = _mm_unpackhi_epi8(even0_30a ,odd1_31a); metrics32_47 = _mm_unpacklo_epi8(even32_62a,odd33_63a); metrics48_63 = _mm_unpackhi_epi8(even32_62a,odd33_63a); - - //print_bytes(metrics0_15,"metrics0_15"); - //print_bytes(metrics16_31,"metrics16_31"); - //print_bytes(metrics32_47,"metrics32_47"); - //print_bytes(metrics48_63,"metrics48_63"); - - - - TB_ptr[0] = _mm_unpacklo_epi8(TBeven0_30,TBodd1_31); - - // print_bytes(TB_ptr[0],"TB0_15"); - + TB_ptr[0] = _mm_unpacklo_epi8(TBeven0_30,TBodd1_31); TB_ptr[1] = _mm_unpackhi_epi8(TBeven0_30,TBodd1_31); - - // print_bytes(TB_ptr[1],"TB16_31"); - TB_ptr[2] = _mm_unpacklo_epi8(TBeven32_62,TBodd33_63); - - // print_bytes(TB_ptr[2],"TB32_47"); - TB_ptr[3] = _mm_unpackhi_epi8(TBeven32_62,TBodd33_63); - // print_bytes(TB_ptr[3],"TB48_63"); - in+=2; TB_ptr += 4; @@ -359,50 +300,92 @@ void phy_viterbi_dot11_sse2(char *y,unsigned char *decoded_bytes,unsigned short min_state =_mm_min_epu8(min_state,metrics32_47); min_state =_mm_min_epu8(min_state,metrics48_63); - // print_bytes(min_state,"min_state"); min_state2 = min_state; min_state = _mm_unpacklo_epi8(min_state,min_state); min_state2 = _mm_unpackhi_epi8(min_state2,min_state2); min_state = _mm_min_epu8(min_state,min_state2); - // print_bytes(min_state,"min_state"); - min_state2 = min_state; min_state = _mm_unpacklo_epi8(min_state,min_state); min_state2 = _mm_unpackhi_epi8(min_state2,min_state2); min_state = _mm_min_epu8(min_state,min_state2); - // print_bytes(min_state,"min_state"); - min_state2 = min_state; min_state = _mm_unpacklo_epi8(min_state,min_state); min_state2 = _mm_unpackhi_epi8(min_state2,min_state2); min_state = _mm_min_epu8(min_state,min_state2); - // print_bytes(min_state,"min_state"); - 
min_state2 = min_state; min_state = _mm_unpacklo_epi8(min_state,min_state); min_state2 = _mm_unpackhi_epi8(min_state2,min_state2); min_state = _mm_min_epu8(min_state,min_state2); - // print_bytes(min_state,"min_state"); - metrics0_15 = _mm_subs_epu8(metrics0_15,min_state); metrics16_31 = _mm_subs_epu8(metrics16_31,min_state); metrics32_47 = _mm_subs_epu8(metrics32_47,min_state); metrics48_63 = _mm_subs_epu8(metrics48_63,min_state); +#elif defined(__arm__) + m0_ptr = (uint8x16_t *)&m0_table[table_offset]; + m1_ptr = (uint8x16_t *)&m1_table[table_offset]; + + + // even states + even0_30a = vqaddq_u8(metrics0_31.val[0],m0_ptr[0]); + even32_62a = vqaddq_u8(metrics0_31.val[1],m0_ptr[1]); + even0_30b = vqaddq_u8(metrics32_63.val[0],m0_ptr[2]); + even32_62b = vqaddq_u8(metrics32_63.val[1],m0_ptr[3]); + + // odd states + odd1_31a = vqaddq_u8(metrics0_31.val[0],m1_ptr[0]); + odd33_63a = vqaddq_u8(metrics0_31.val[1],m1_ptr[1]); + odd1_31b = vqaddq_u8(metrics32_63.val[0],m1_ptr[2]); + odd33_63b = vqaddq_u8(metrics32_63.val[1],m1_ptr[3]); + // select maxima + even0_30a = vmaxq_u8(even0_30a,even0_30b); + even32_62a = vmaxq_u8(even32_62a,even32_62b); + odd1_31a = vmaxq_u8(odd1_31a,odd1_31b); + odd33_63a = vmaxq_u8(odd33_63a,odd33_63b); + + // Traceback information + TBeven0_30 = vceqq_u8(even0_30a,even0_30b); + TBeven32_62 = vceqq_u8(even32_62a,even32_62b); + TBodd1_31 = vceqq_u8(odd1_31a,odd1_31b); + TBodd33_63 = vceqq_u8(odd33_63a,odd33_63b); - /* - print_bytes(metrics0_15,"metrics0_15"); - print_bytes(metrics16_31,"metrics16_31"); - print_bytes(metrics32_47,"metrics32_47"); - print_bytes(metrics48_63,"metrics48_63"); - */ + metrics0_31 = vzipq_u8(even0_30a,odd1_31a); + metrics32_63 = vzipq_u8(even32_62a,odd33_63a); + TB_ptr[0] = vzipq_u8(TBeven0_30,TBodd1_31); + TB_ptr[1] = vzipq_u8(TBeven32_62,TBodd33_63); + in+=2; + TB_ptr += 2; + + // rescale by subtracting minimum + /**************************************************** + USE SSSE instruction phminpos!!!!!!! 
+ ****************************************************/ + min_state =vminq_u8(metrics0_31.val[0],metrics0_31.val[1]); + min_state =vminq_u8(min_state,metrics32_63.val[0]); + min_state =vminq_u8(min_state,metrics32_63.val[1]); + // here we have 16 maximum metrics from the 64 states + uint8x8_t min_state2 = vpmin_u8(((uint8x8_t*)&min_state)[0],((uint8x8_t*)&min_state)[0]); + // now the 8 maximum in min_state2 + min_state2 = vpmin_u8(min_state2,min_state2); + // now the 4 maximum in min_state2, repeated twice + min_state2 = vpmin_u8(min_state2,min_state2); + // now the 2 maximum in min_state2, repeated 4 times + min_state2 = vpmin_u8(min_state2,min_state2); + // now the 1 maximum in min_state2, repeated 8 times + min_state = vcombine_u8(min_state2,min_state2); + // now the 1 maximum in min_state, repeated 16 times + metrics0_31.val[0] = vqsubq_u8(metrics0_31.val[0],min_state); + metrics0_31.val[1] = vqsubq_u8(metrics0_31.val[1],min_state); + metrics32_63.val[0] = vqsubq_u8(metrics32_63.val[0],min_state); + metrics32_63.val[1] = vqsubq_u8(metrics32_63.val[1],min_state); +#endif } // Traceback @@ -429,29 +412,10 @@ void phy_viterbi_dot11_sse2(char *y,unsigned char *decoded_bytes,unsigned short } } +#if defined(__x86_64) || defined(__i386__) _mm_empty(); - -} - -#else //EXPRESSMIMO_TARGET - -void phy_viterbi_dot11(char *y,unsigned char *decoded_bytes,unsigned short n) -{ -} - -#endif //EXPRESSMIMO_TARGET - -/* -void print_bytes(__m128i x,char *s) { - - unsigned char *tempb = (unsigned char *)&x; - - printf("%s : %u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u\n",s, - tempb[0],tempb[1],tempb[2],tempb[3],tempb[4],tempb[5],tempb[6],tempb[7], - tempb[8],tempb[9],tempb[10],tempb[11],tempb[12],tempb[13],tempb[14],tempb[15]); - +#endif } -*/ #ifdef TEST_DEBUG #include <stdio.h> diff --git a/openair1/PHY/CODING/viterbi_lte.c b/openair1/PHY/CODING/viterbi_lte.c index 019cf0fc5c..230b933f79 100644 --- a/openair1/PHY/CODING/viterbi_lte.c +++ b/openair1/PHY/CODING/viterbi_lte.c @@ -49,21 +49,14 @@ #define msg printf #endif -#ifndef EXPRESSMIMO_TARGET + #include "PHY/sse_intrin.h" -#endif //EXPRESSMIMO_TARGET extern uint8_t ccodelte_table[128],ccodelte_table_rev[128]; -#ifdef __KERNEL__ -#define printf rt_printk -#endif - -#ifndef EXPRESSMIMO_TARGET - static int8_t m0_table[64*16*16*16] __attribute__ ((aligned(16))); static int8_t m1_table[64*16*16*16] __attribute__ ((aligned(16))); @@ -143,20 +136,33 @@ void print_shorts(__m128i x,char *s) { #endif // USER_MODE -static __m128i TB[4*8192]; - -static __m128i metrics0_15,metrics16_31,metrics32_47,metrics48_63,even0_30a,even0_30b,even32_62a,even32_62b,odd1_31a,odd1_31b,odd33_63a,odd33_63b,TBeven0_30,TBeven32_62,TBodd1_31, - TBodd33_63;// __attribute__((aligned(16))); - -static __m128i min_state,min_state2;// __attribute__((aligned(16))); void phy_viterbi_lte_sse2(int8_t *y,uint8_t *decoded_bytes,uint16_t n) { - static __m128i *m0_ptr,*m1_ptr,*TB_ptr = &TB[0]; +#if defined(__x86_64__) || defined(__i386__) + __m128i TB[4*8192]; + __m128i *m0_ptr,*m1_ptr,*TB_ptr = &TB[0]; + + __m128i metrics0_15,metrics16_31,metrics32_47,metrics48_63,even0_30a,even0_30b,even32_62a,even32_62b,odd1_31a,odd1_31b,odd33_63a,odd33_63b,TBeven0_30,TBeven32_62,TBodd1_31, + TBodd33_63; + + __m128i min_state,min_state2; +#elif defined(__arm__) + uint8x16x2_t TB[2*8192]; // 2 int8x16_t per input bit, 8 bits / byte, 8192 is largest packet size in bits + uint8x16_t even0_30a,even0_30b,even32_62a,even32_62b,odd1_31a,odd1_31b,odd33_63a,odd33_63b,TBeven0_30,TBeven32_62,TBodd1_31,TBodd33_63; + 
uint8x16x2_t metrics0_31,metrics32_63; + + uint8x16_t min_state; + + uint8x16_t *m0_ptr,*m1_ptr; + uint8x16x2_t *TB_ptr = &TB[0]; + + +#endif int8_t *in = y; uint8_t prev_state0,maxm,s; static uint8_t *TB_ptr2; @@ -167,140 +173,70 @@ void phy_viterbi_lte_sse2(int8_t *y,uint8_t *decoded_bytes,uint16_t n) // set initial metrics //debug_msg("Doing viterbi\n"); - metrics0_15 = _mm_setzero_si128(); +#if defined(__x86_64__) || defined(__i386__) + + metrics0_15 = _mm_setzero_si128(); metrics16_31 = _mm_setzero_si128(); metrics32_47 = _mm_setzero_si128(); metrics48_63 = _mm_setzero_si128(); -#ifndef USER_MODE - //debug_msg("Doing viterbi 2\n"); +#elif defined(__arm__) + metrics0_31.val[0] = vdupq_n_u8(0); + metrics0_31.val[1] = vdupq_n_u8(0); + metrics32_63.val[0] = vdupq_n_u8(0); + metrics32_63.val[1] = vdupq_n_u8(0); #endif - /* - print_bytes(metrics0_15,"metrics0_15"); - print_bytes(metrics16_31,"metrics16_31"); - print_bytes(metrics32_47,"metrics32_47"); - print_bytes(metrics48_63,"metrics48_63"); - */ for (iter=0; iter<2; iter++) { in = y; TB_ptr=&TB[0]; - // printf("Iteration %d\n",iter); for (position=0; position<n; position++) { - // printf("%d/%d : (%d,%d,%d)\n",position,n-1,in[0],in[1],in[2]); - - // get branch metric offsets for the 64 states table_offset = (in[0]+8 + ((in[1]+8)<<4) + ((in[2]+8)<<8))<<6; - /* - printf("Table_offset = %u (in[0]=%d,in[1]=%d,in[2]=%d)\n",table_offset,in[0],in[1],in[2]); - print_bytes("m0",&m0_table[table_offset]); - print_bytes("m1",&m1_table[table_offset]); - */ + +#if defined(__x86_64__) || defined(__i386__) m0_ptr = (__m128i *)&m0_table[table_offset]; m1_ptr = (__m128i *)&m1_table[table_offset]; - // printf("\n"); - // even states even0_30a = _mm_adds_epu8(metrics0_15,m0_ptr[0]); - // print_bytes(even0_30a,"even0_30a"); - even32_62a = _mm_adds_epu8(metrics16_31,m0_ptr[1]); - // print_bytes(even32_62a,"even32_62a"); - even0_30b = _mm_adds_epu8(metrics32_47,m0_ptr[2]); - // print_bytes(even0_30b,"even0_30b"); - even32_62b = _mm_adds_epu8(metrics48_63,m0_ptr[3]); - // print_bytes(even32_62b,"even32_62b"); - // printf("\n"); // odd states odd1_31a = _mm_adds_epu8(metrics0_15,m1_ptr[0]); - - // print_bytes(odd1_31a,"odd1_31a"); - odd33_63a = _mm_adds_epu8(metrics16_31,m1_ptr[1]); - - // print_bytes(odd33_63a,"odd33_63a"); - odd1_31b = _mm_adds_epu8(metrics32_47,m1_ptr[2]); - - // print_bytes(odd1_31b,"odd1_31b"); - odd33_63b = _mm_adds_epu8(metrics48_63,m1_ptr[3]); - // print_bytes(odd33_63b,"odd33_63b"); - - - - // select maxima - // printf("\n"); even0_30a = _mm_max_epu8(even0_30a,even0_30b); - - // print_bytes(even0_30a,"even0_30a"); - even32_62a = _mm_max_epu8(even32_62a,even32_62b); - - // print_bytes(even32_62a,"even32_62a"); - odd1_31a = _mm_max_epu8(odd1_31a,odd1_31b); - - // print_bytes(odd1_31a,"odd1_31a"); - odd33_63a = _mm_max_epu8(odd33_63a,odd33_63b); - // print_bytes(odd33_63a,"odd33_63a"); - - // printf("\n"); // Traceback information TBeven0_30 = _mm_cmpeq_epi8(even0_30a,even0_30b); - - TBeven32_62 = _mm_cmpeq_epi8(even32_62a,even32_62b); - - TBodd1_31 = _mm_cmpeq_epi8(odd1_31a,odd1_31b); - - TBodd33_63 = _mm_cmpeq_epi8(odd33_63a,odd33_63b); - metrics0_15 = _mm_unpacklo_epi8(even0_30a ,odd1_31a); metrics16_31 = _mm_unpackhi_epi8(even0_30a ,odd1_31a); metrics32_47 = _mm_unpacklo_epi8(even32_62a,odd33_63a); metrics48_63 = _mm_unpackhi_epi8(even32_62a,odd33_63a); - /* - print_bytes(metrics0_15,"metrics0_15"); - print_bytes(metrics16_31,"metrics16_31"); - print_bytes(metrics32_47,"metrics32_47"); - print_bytes(metrics48_63,"metrics48_63"); - */ 
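Both the SSE code here and the NEON block added below implement the same add-compare-select butterfly per trellis stage. A scalar sketch of what each vector lane computes, with a saturating add standing in for _mm_adds_epu8/vqaddq_u8 (illustrative helper, not part of the patch):

    #include <stdint.h>

    static uint8_t sat_add_u8(uint8_t a, uint8_t b)
    {
      unsigned s = (unsigned)a + b;          /* _mm_adds_epu8 / vqaddq_u8 */
      return (uint8_t)(s > 255 ? 255 : s);
    }

    /* One add-compare-select: the new state metric is the better of two
       candidate path metrics; the traceback byte records whether the "b"
       predecessor won or tied (0xff), matching the _mm_cmpeq_epi8/vceqq_u8
       applied to the selected maximum. */
    static uint8_t acs(uint8_t m_a, uint8_t bm_a,
                       uint8_t m_b, uint8_t bm_b, uint8_t *tb)
    {
      uint8_t cand_a = sat_add_u8(m_a, bm_a);
      uint8_t cand_b = sat_add_u8(m_b, bm_b);
      uint8_t best   = cand_a > cand_b ? cand_a : cand_b;  /* _mm_max_epu8 */
      *tb = (best == cand_b) ? 0xff : 0x00;
      return best;
    }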
- - TB_ptr[0] = _mm_unpacklo_epi8(TBeven0_30,TBodd1_31); - - // print_bytes(TB_ptr[0],"TB0_15"); - TB_ptr[1] = _mm_unpackhi_epi8(TBeven0_30,TBodd1_31); - - // print_bytes(TB_ptr[1],"TB16_31"); - TB_ptr[2] = _mm_unpacklo_epi8(TBeven32_62,TBodd33_63); - - // print_bytes(TB_ptr[2],"TB32_47"); - TB_ptr[3] = _mm_unpackhi_epi8(TBeven32_62,TBodd33_63); - // print_bytes(TB_ptr[3],"TB48_63"); in+=3; TB_ptr += 4; @@ -313,50 +249,90 @@ void phy_viterbi_lte_sse2(int8_t *y,uint8_t *decoded_bytes,uint16_t n) min_state =_mm_min_epu8(min_state,metrics32_47); min_state =_mm_min_epu8(min_state,metrics48_63); - // print_bytes(min_state,"min_state"); - min_state2 = min_state; min_state = _mm_unpacklo_epi8(min_state,min_state); min_state2 = _mm_unpackhi_epi8(min_state2,min_state2); min_state = _mm_min_epu8(min_state,min_state2); - // print_bytes(min_state,"min_state"); - min_state2 = min_state; min_state = _mm_unpacklo_epi8(min_state,min_state); min_state2 = _mm_unpackhi_epi8(min_state2,min_state2); min_state = _mm_min_epu8(min_state,min_state2); - // print_bytes(min_state,"min_state"); - min_state2 = min_state; min_state = _mm_unpacklo_epi8(min_state,min_state); min_state2 = _mm_unpackhi_epi8(min_state2,min_state2); min_state = _mm_min_epu8(min_state,min_state2); - // print_bytes(min_state,"min_state"); - min_state2 = min_state; min_state = _mm_unpacklo_epi8(min_state,min_state); min_state2 = _mm_unpackhi_epi8(min_state2,min_state2); min_state = _mm_min_epu8(min_state,min_state2); - // print_bytes(min_state,"min_state"); - metrics0_15 = _mm_subs_epu8(metrics0_15,min_state); metrics16_31 = _mm_subs_epu8(metrics16_31,min_state); metrics32_47 = _mm_subs_epu8(metrics32_47,min_state); metrics48_63 = _mm_subs_epu8(metrics48_63,min_state); - - /* - print_bytes("metrics0_15",&metrics0_15); - print_bytes("metrics16_31",&metrics16_31); - print_bytes("metrics32_47",&metrics32_47); - print_bytes("metrics48_63",&metrics48_63); - - printf("\n"); - */ - +#elif defined(__arm__) + m0_ptr = (uint8x16_t *)&m0_table[table_offset]; + m1_ptr = (uint8x16_t *)&m1_table[table_offset]; + + + // even states + even0_30a = vqaddq_u8(metrics0_31.val[0],m0_ptr[0]); + even32_62a = vqaddq_u8(metrics0_31.val[1],m0_ptr[1]); + even0_30b = vqaddq_u8(metrics32_63.val[0],m0_ptr[2]); + even32_62b = vqaddq_u8(metrics32_63.val[1],m0_ptr[3]); + + // odd states + odd1_31a = vqaddq_u8(metrics0_31.val[0],m1_ptr[0]); + odd33_63a = vqaddq_u8(metrics0_31.val[1],m1_ptr[1]); + odd1_31b = vqaddq_u8(metrics32_63.val[0],m1_ptr[2]); + odd33_63b = vqaddq_u8(metrics32_63.val[1],m1_ptr[3]); + // select maxima + even0_30a = vmaxq_u8(even0_30a,even0_30b); + even32_62a = vmaxq_u8(even32_62a,even32_62b); + odd1_31a = vmaxq_u8(odd1_31a,odd1_31b); + odd33_63a = vmaxq_u8(odd33_63a,odd33_63b); + + // Traceback information + TBeven0_30 = vceqq_u8(even0_30a,even0_30b); + TBeven32_62 = vceqq_u8(even32_62a,even32_62b); + TBodd1_31 = vceqq_u8(odd1_31a,odd1_31b); + TBodd33_63 = vceqq_u8(odd33_63a,odd33_63b); + + metrics0_31 = vzipq_u8(even0_30a,odd1_31a); + metrics32_63 = vzipq_u8(even32_62a,odd33_63a); + + TB_ptr[0] = vzipq_u8(TBeven0_30,TBodd1_31); + TB_ptr[1] = vzipq_u8(TBeven32_62,TBodd33_63); + + in+=3; // rate-1/3 code: 3 soft inputs consumed per trellis step, as in the x86 path + TB_ptr += 2; + + // rescale by subtracting minimum + /**************************************************** + TODO: the x86 path above could use the SSE4.1 phminposuw instruction here
+ ****************/ + min_state =vminq_u8(metrics0_31.val[0],metrics0_31.val[1]); + min_state =vminq_u8(min_state,metrics32_63.val[0]); + min_state =vminq_u8(min_state,metrics32_63.val[1]); + // here we have 16 minima of the 64 state metrics + uint8x8_t min_state2 = vpmin_u8(((uint8x8_t*)&min_state)[0],((uint8x8_t*)&min_state)[0]); + // now the 8 minima in min_state2 + min_state2 = vpmin_u8(min_state2,min_state2); + // now the 4 minima in min_state2, repeated twice + min_state2 = vpmin_u8(min_state2,min_state2); + // now the 2 minima in min_state2, repeated 4 times + min_state2 = vpmin_u8(min_state2,min_state2); + // now the single minimum in min_state2, repeated 8 times + min_state = vcombine_u8(min_state2,min_state2); + // now the single minimum in min_state, repeated 16 times + metrics0_31.val[0] = vqsubq_u8(metrics0_31.val[0],min_state); + metrics0_31.val[1] = vqsubq_u8(metrics0_31.val[1],min_state); + metrics32_63.val[0] = vqsubq_u8(metrics32_63.val[0],min_state); + metrics32_63.val[1] = vqsubq_u8(metrics32_63.val[1],min_state); +#endif } } // iteration @@ -365,6 +341,7 @@ void phy_viterbi_lte_sse2(int8_t *y,uint8_t *decoded_bytes,uint16_t n) prev_state0 = 0; maxm = 0; +#if defined(__x86_64__) || defined(__i386__) for (s=0; s<16; s++) if (((uint8_t *)&metrics0_15)[s] > maxm) { maxm = ((uint8_t *)&metrics0_15)[s]; @@ -389,17 +366,39 @@ void phy_viterbi_lte_sse2(int8_t *y,uint8_t *decoded_bytes,uint16_t n) prev_state0 = s+48; } - // printf("Max state %d\n",prev_state0); + +#elif defined(__arm__) + for (s=0; s<16; s++) + if (((uint8_t *)&metrics0_31.val[0])[s] > maxm) { + maxm = ((uint8_t *)&metrics0_31.val[0])[s]; + prev_state0 = s; + } + + for (s=0; s<16; s++) + if (((uint8_t *)&metrics0_31.val[1])[s] > maxm) { + maxm = ((uint8_t *)&metrics0_31.val[1])[s]; + prev_state0 = s+16; + } + + for (s=0; s<16; s++) + if (((uint8_t *)&metrics32_63.val[0])[s] > maxm) { + maxm = ((uint8_t *)&metrics32_63.val[0])[s]; + prev_state0 = s+32; + } + + for (s=0; s<16; s++) + if (((uint8_t *)&metrics32_63.val[1])[s] > maxm) { + maxm = ((uint8_t *)&metrics32_63.val[1])[s]; + prev_state0 = s+48; + } +#endif + TB_ptr2 = (uint8_t *)&TB[(n-1)*4]; for (position = n-1 ; position>-1; position--) { - // if ((position%8) == 0) - // printf("%d: %x\n",1+(position>>3),decoded_bytes[1+(position>>3)]); - decoded_bytes[(position)>>3] += (prev_state0 & 0x1)<<(7-(position & 0x7)); - // printf("pos %d : ps = %d -> %d\n",position,prev_state0,TB_ptr2[prev_state0]); if (TB_ptr2[prev_state0] == 0) prev_state0 = (prev_state0 >> 1); @@ -409,31 +408,12 @@ void phy_viterbi_lte_sse2(int8_t *y,uint8_t *decoded_bytes,uint16_t n) TB_ptr2-=64; } - // printf("Max state %d\n",prev_state0); + +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); - -} - -#else //EXPRESSMIMO_TARGET - -void phy_viterbi_lte(int8_t *y,uint8_t *decoded_bytes,uint16_t n) -{ -} - -#endif //EXPRESSMIMO_TARGET - -/* -void print_bytes(__m128i x,int8_t *s) { - - uint8_t *tempb = (uint8_t *)&x; - - printf("%s : %u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u\n",s, - tempb[0],tempb[1],tempb[2],tempb[3],tempb[4],tempb[5],tempb[6],tempb[7], - tempb[8],tempb[9],tempb[10],tempb[11],tempb[12],tempb[13],tempb[14],tempb[15]); - +#endif } -*/ #ifdef TEST_DEBUG int test_viterbi(uint8_t dabflag) diff --git a/openair1/PHY/LTE_ESTIMATION/filt96_32.h b/openair1/PHY/LTE_ESTIMATION/filt96_32.h index 36eac7ea30..c92be225bd 100644 --- a/openair1/PHY/LTE_ESTIMATION/filt96_32.h +++ b/openair1/PHY/LTE_ESTIMATION/filt96_32.h @@ -26,187
+26,187 @@ Address : Eurecom, Campus SophiaTech, 450 Route des Chappes, CS 50193 - 06904 Biot Sophia Antipolis cedex, FRANCE *******************************************************************************/ -short filt24_0[24] = { +short filt24_0[24] __attribute__((aligned(16))) ={ 2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0,0,0 }; -short filt24_0_dcl[24] = { +short filt24_0_dcl[24] __attribute__((aligned(16))) ={ 2341,4681,7022,9362,11703,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0,0,0 }; -short filt24_0_dcr[24] = { +short filt24_0_dcr[24] __attribute__((aligned(16))) ={ 2730,5461,8192,10922,13653,16384,14043,11703,9362,7022,4681,0,0,0,0,0,0,0,0,0,0,0,0 }; -short filt24_1[24] = { +short filt24_1[24] __attribute__((aligned(16))) ={ 0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0,0 }; -short filt24_1_dcl[24] = { +short filt24_1_dcl[24] __attribute__((aligned(16))) ={ 0,4681,7022,9362,11703,14043,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0,0 }; -short filt24_1_dcr[24] = { +short filt24_1_dcr[24] __attribute__((aligned(16))) ={ 0,2730,5461,8192,10922,13653,16384,14043,11703,9362,7022,4681,0,0,0,0,0,0,0,0,0,0,0,0 }; -short filt24_2[24] = { +short filt24_2[24] __attribute__((aligned(16))) ={ 0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0 }; -short filt24_2_dcl[24] = { +short filt24_2_dcl[24] __attribute__((aligned(16))) ={ 0,0,2341,4681,7022,9362, 11703,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0 }; -short filt24_2_dcr[24] = { +short filt24_2_dcr[24] __attribute__((aligned(16))) ={ 0,0,2730,5461,8192,10922,13653,16384,14043,11703,9362,4681,2341,0,0,0,0,0,0,0,0,0,0,0 }; // X X X Y | X X X X | X Y X X -short filt24_3[24] = { +short filt24_3[24] __attribute__((aligned(16))) ={ 0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0 }; -short filt24_3_dcl[24] = { +short filt24_3_dcl[24] __attribute__((aligned(16))) ={ 0,0,0,2341,4681,7022,9362,14043,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0 }; // X X X Y | X X DC X X | X Y X X -short filt24_3_dcr[24] = { +short filt24_3_dcr[24] __attribute__((aligned(16))) ={ 0,0,0,2730,5461,8192,10922,13653,16384,14043,11703,7022,4681,2341,0,0,0,0,0,0,0,0,0,0 }; -short filt24_4[24] = { +short filt24_4[24] __attribute__((aligned(16))) ={ 0,0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0 }; -short filt24_4_dcl[24] = { +short filt24_4_dcl[24] __attribute__((aligned(16))) ={ 0,0,0,0,2341,7022,9362,11703,14043,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0 }; -short filt24_4_dcr[24] = { +short filt24_4_dcr[24] __attribute__((aligned(16))) ={ 0,0,0,0,2730,5461,8192,10922,13653,16384,14043,11703,7022,4681,2341,0,0,0,0,0,0,0,0,0 }; -short filt24_5[24] = { +short filt24_5[24] __attribute__((aligned(16))) ={ 0,0,0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0 }; // X X X Y | X X DC X X | X Y X X -short filt24_5_dcl[24] = { +short filt24_5_dcl[24] __attribute__((aligned(16))) ={ 0,0,0,0,0,2341,4681,9362,11703,14043,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0 }; -short filt24_5_dcr[24] = { +short filt24_5_dcr[24] __attribute__((aligned(16))) ={ 0,0,0,0,0,2730,5461,8192,10922,13653,16384,11703,9362,7022,4681,2730,0,0,0,0,0,0,0,0 }; -short filt24_6[24] = { +short filt24_6[24] __attribute__((aligned(16))) ={ 0,0,0,0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0 }; -short filt24_6_dcl[24] 
= { +short filt24_6_dcl[24] __attribute__((aligned(16))) ={ 0,0,0,0,0,0,4681,7022,9362,11703,14043,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0 }; -short filt24_6_dcr[24] = { +short filt24_6_dcr[24] __attribute__((aligned(16))) ={ 0,0,0,0,0,0,2730,5461,8192,10922,13653,16384,14043,11703,9362,7022,4681,0,0,0,0,0,0,0 }; -short filt24_7[24] = { +short filt24_7[24] __attribute__((aligned(16))) ={ 0,0,0,0,0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0 }; -short filt24_7_dcl[24] = { +short filt24_7_dcl[24] __attribute__((aligned(16))) ={ 0,0,0,0,0,0,0,4681,7022,9362,11703,14043,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0 }; -short filt24_7_dcr[24] = { +short filt24_7_dcr[24] __attribute__((aligned(16))) ={ 0,0,0,0,0,0,0,2730,5461,8192,10922,13653,16384,14043,11703,9362,7022,4681,0,0,0,0,0,0 }; -short filt24_0l[24] = { +short filt24_0l[24] __attribute__((aligned(16))) ={ 30037,27306,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0,0,0 }; -short filt24_1l[24] = { +short filt24_1l[24] __attribute__((aligned(16))) ={ 0,30037,27306,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0,0 }; -short filt24_2l[24] = { +short filt24_2l[24] __attribute__((aligned(16))) ={ 0,0,30037,27306,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0 }; -short filt24_3l[24] = { +short filt24_3l[24] __attribute__((aligned(16))) ={ //0,0,0,30037,27306,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0}; 0,0,0,0,0,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0 }; -short filt24_4l[24] = { +short filt24_4l[24] __attribute__((aligned(16))) ={ 0,0,0,0,30037,27306,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0 }; -short filt24_5l[24] = { +short filt24_5l[24] __attribute__((aligned(16))) ={ 0,0,0,0,0,30037,27306,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0 }; -short filt24_6l[24] = { +short filt24_6l[24] __attribute__((aligned(16))) ={ 0,0,0,0,0,0,30037,27306,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0 }; -short filt24_7l[24] = { +short filt24_7l[24] __attribute__((aligned(16))) ={ 0,0,0,0,0,0,0,30037,27306,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0 }; -short filt24_0l2[24] = { +short filt24_0l2[24] __attribute__((aligned(16))) ={ 2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0,0,0 }; -short filt24_1l2[24] = { +short filt24_1l2[24] __attribute__((aligned(16))) ={ 0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0,0 }; -short filt24_2l2[24] = { +short filt24_2l2[24] __attribute__((aligned(16))) ={ -2730,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0 }; -short filt24_3l2[24] = { +short filt24_3l2[24] __attribute__((aligned(16))) ={ -5461,-2730,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0 }; -short filt24_4l2[24] = { +short filt24_4l2[24] __attribute__((aligned(16))) ={ -8192,-5461,-2730,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0 }; -short filt24_5l2[24] = { +short filt24_5l2[24] __attribute__((aligned(16))) ={ 0,-8192,-5461,-2730,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0 }; -short filt24_6l2[24] = { +short filt24_6l2[24] __attribute__((aligned(16))) ={ -13653,-10922,-8192,-5461,-2730,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0 }; -short filt24_7l2[24] = { +short 
filt24_7l2[24] __attribute__((aligned(16))) ={ 0,-13653,-10922,-8192,-5461,-2730,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0 }; -short filt24_0r[24] = { +short filt24_0r[24] __attribute__((aligned(16))) ={ 2730,5461,8192,10922,13653,16384,19114,21845,24576,27306,30037,0,0,0,0,0,0,0,0,0,0,0,0,0 }; -short filt24_1r[24] = { +short filt24_1r[24] __attribute__((aligned(16))) ={ 0,2730,5461,8192,10922,13653,16384,19114,21845,24576,27306,30037,0,0,0,0,0,0,0,0,0,0,0,0 }; -short filt24_2r[24] = { +short filt24_2r[24] __attribute__((aligned(16))) ={ 0,0,2730,5461,8192,10922,13653,16384,19114,21845,24576,27306,30037,0,0,0,0,0,0,0,0,0,0,0 }; -short filt24_3r[24] = { +short filt24_3r[24] __attribute__((aligned(16))) ={ 0,0,0,2730,5461,8192,10922,13653,16384,19114,21845,24576,27306,30037,0,0,0,0,0,0,0,0,0,0 }; -short filt24_4r[24] = { +short filt24_4r[24] __attribute__((aligned(16))) ={ 0,0,0,0,2730,5461,8192,10922,13653,16384,19114,21845,24576,27306,30037,0,0,0,0,0,0,0,0,0 }; -short filt24_5r[24] = { +short filt24_5r[24] __attribute__((aligned(16))) ={ 0,0,0,0,0,2730,5461,8192,10922,13653,16384,19114,21845,24576,27306,30037,0,0,0,0,0,0,0,0 }; -short filt24_6r[24] = { +short filt24_6r[24] __attribute__((aligned(16))) ={ 0,0,0,0,0,0,2730,5461,8192,10922,13653,16384,19114,21845,24576,27306,30037,0,0,0,0,0,0,0 }; -short filt24_7r[24] = { +short filt24_7r[24] __attribute__((aligned(16))) ={ 0,0,0,0,0,0,0,2730,5461,8192,10922,13653,16384,19114,21845,24576,27306,30037,0,0,0,0,0,0 }; -short filt24_0r2[24] = { /****/ +short filt24_0r2[24] __attribute__((aligned(16))) ={ /****/ 2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,-2730,-5461,-8192,-10922,-13653,0,0,0,0,0,0,0 }; -short filt24_1r2[24] = { +short filt24_1r2[24] __attribute__((aligned(16))) ={ 0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,-2730,-5461,-8192,-10922,-13653,0,0,0,0,0,0 }; -short filt24_2r2[24] = { +short filt24_2r2[24] __attribute__((aligned(16))) ={ 0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,-2730,-5461,-8192,-10922,-13653,0,0,0,0,0 }; -short filt24_3r2[24] = { +short filt24_3r2[24] __attribute__((aligned(16))) ={ 0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,-2730,-5461,-8192,-10922,-13653,0,0,0,0 }; -short filt24_4r2[24] = { +short filt24_4r2[24] __attribute__((aligned(16))) ={ 0,0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,-2730,-5461,-8192,-10922,-13653,0,0,0 }; -short filt24_5r2[24] = { +short filt24_5r2[24] __attribute__((aligned(16))) ={ 0,0,0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,-2730,-5461,-8192,-10922,-13653,0,0 }; -short filt24_6r2[24] = { +short filt24_6r2[24] __attribute__((aligned(16))) ={ 0,0,0,0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,-2730,-5461,-8192,-10922,-13653,0 }; -short filt24_7r2[24] = { +short filt24_7r2[24] __attribute__((aligned(16))) ={ 0,0,0,0,0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,-2730,-5461,-8192,-10922,-13653 }; diff --git a/openair1/PHY/LTE_ESTIMATION/freq_equalization.c b/openair1/PHY/LTE_ESTIMATION/freq_equalization.c index 6e71e23ec9..595d1e7a48 100755 --- a/openair1/PHY/LTE_ESTIMATION/freq_equalization.c +++ b/openair1/PHY/LTE_ESTIMATION/freq_equalization.c @@ -299,11 +299,17 @@ void freq_equalization(LTE_DL_FRAME_PARMS *frame_parms, { uint16_t re; int16_t amp; +#if defined(__x86_64__) || defined(__i386__) __m128i *ul_ch_mag128,*ul_ch_magb128,*rxdataF_comp128; - rxdataF_comp128 = (__m128i 
*)&rxdataF_comp[0][symbol*frame_parms->N_RB_DL*12]; ul_ch_mag128 = (__m128i *)&ul_ch_mag[0][symbol*frame_parms->N_RB_DL*12]; ul_ch_magb128 = (__m128i *)&ul_ch_magb[0][symbol*frame_parms->N_RB_DL*12]; +#elif defined(__arm__) + int16x8_t *ul_ch_mag128,*ul_ch_magb128,*rxdataF_comp128; + rxdataF_comp128 = (int16x8_t*)&rxdataF_comp[0][symbol*frame_parms->N_RB_DL*12]; + ul_ch_mag128 = (int16x8_t*)&ul_ch_mag[0][symbol*frame_parms->N_RB_DL*12]; + ul_ch_magb128 = (int16x8_t*)&ul_ch_magb[0][symbol*frame_parms->N_RB_DL*12]; +#endif for (re=0; re<(Msc_RS>>2); re++) { @@ -313,15 +319,25 @@ void freq_equalization(LTE_DL_FRAME_PARMS *frame_parms, amp=255; // printf("freq_eq: symbol %d re %d => %d,%d,%d, (%d) (%d,%d) => ",symbol,re,*((int16_t*)(&ul_ch_mag128[re])),amp,inv_ch[8*amp],*((int16_t*)(&ul_ch_mag128[re]))*inv_ch[8*amp],*(int16_t*)&(rxdataF_comp128[re]),*(1+(int16_t*)&(rxdataF_comp128[re]))); +#if defined(__x86_64__) || defined(__i386__) rxdataF_comp128[re] = _mm_mullo_epi16(rxdataF_comp128[re],*((__m128i *)&inv_ch[8*amp])); if (Qm==4) - ul_ch_mag128[re] = _mm_set1_epi16(324); // this is 512*2/sqrt(10) + ul_ch_mag128[re] = _mm_set1_epi16(324); // this is 512*2/sqrt(10) else { - ul_ch_mag128[re] = _mm_set1_epi16(316); // this is 512*4/sqrt(42) + ul_ch_mag128[re] = _mm_set1_epi16(316); // this is 512*4/sqrt(42) ul_ch_magb128[re] = _mm_set1_epi16(158); // this is 512*2/sqrt(42) } +#elif defined(__arm__) + rxdataF_comp128[re] = vmulq_s16(rxdataF_comp128[re],*((int16x8_t *)&inv_ch[8*amp])); + if (Qm==4) + ul_ch_mag128[re] = vdupq_n_s16(324); // this is 512*2/sqrt(10) + else { + ul_ch_mag128[re] = vdupq_n_s16(316); // this is 512*4/sqrt(42) + ul_ch_magb128[re] = vdupq_n_s16(158); // this is 512*2/sqrt(42) + } +#endif // printf("(%d,%d)\n",*(int16_t*)&(rxdataF_comp128[re]),*(1+(int16_t*)&(rxdataF_comp128[re]))); } diff --git a/openair1/PHY/LTE_ESTIMATION/lte_dl_channel_estimation.c b/openair1/PHY/LTE_ESTIMATION/lte_dl_channel_estimation.c index 27e9018477..6e853e6c58 100644 --- a/openair1/PHY/LTE_ESTIMATION/lte_dl_channel_estimation.c +++ b/openair1/PHY/LTE_ESTIMATION/lte_dl_channel_estimation.c @@ -49,7 +49,7 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, unsigned char nu,aarx; unsigned short k; unsigned int rb,pilot_cnt; - short ch[2],*pil,*rxF,*dl_ch,*dl_ch_prev,*f,*f2,*fl,*f2l2,*fr,*f2r2,*f2_dc,*f_dc; + int16_t ch[2],*pil,*rxF,*dl_ch,*dl_ch_prev,*f,*f2,*fl,*f2l2,*fr,*f2r2,*f2_dc,*f_dc; int ch_offset,symbol_offset; // unsigned int n; // int i; @@ -192,14 +192,13 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, for (aarx=0; aarx<phy_vars_ue->lte_frame_parms.nb_antennas_rx; aarx++) { - pil = (short *)&pilot[p][0]; - rxF = (short *)&rxdataF[aarx][((symbol_offset+k+phy_vars_ue->lte_frame_parms.first_carrier_offset))]; - dl_ch = (short *)&dl_ch_estimates[(p<<1)+aarx][ch_offset]; + pil = (int16_t *)&pilot[p][0]; + rxF = (int16_t *)&rxdataF[aarx][((symbol_offset+k+phy_vars_ue->lte_frame_parms.first_carrier_offset))]; + dl_ch = (int16_t *)&dl_ch_estimates[(p<<1)+aarx][ch_offset]; // if (eNb_id==0) memset(dl_ch,0,4*(phy_vars_ue->lte_frame_parms.ofdm_symbol_size)); - if (phy_vars_ue->high_speed_flag==0) // multiply previous channel estimate by ch_est_alpha multadd_complex_vector_real_scalar(dl_ch-(phy_vars_ue->lte_frame_parms.ofdm_symbol_size<<1), phy_vars_ue->ch_est_alpha,dl_ch-(phy_vars_ue->lte_frame_parms.ofdm_symbol_size<<1), @@ -212,8 +211,8 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, //First half of pilots // Treat first 2 pilots specially (left edge) - ch[0] = 
(short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); // printf("pilot 0 : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]); multadd_real_vector_complex_scalar(fl, ch, @@ -223,8 +222,8 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, rxF+=12; dl_ch+=8; - ch[0] = (short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); // printf("pilot 1 : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]); multadd_real_vector_complex_scalar(f2l2, ch, @@ -236,15 +235,15 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, for (pilot_cnt=2; pilot_cnt<((phy_vars_ue->lte_frame_parms.N_RB_DL)-1); pilot_cnt+=2) { - // printf("%d\n",dl_ch-(short *)&dl_ch_estimates[(p<<1)+aarx][ch_offset]); + // printf("%d\n",dl_ch-(int16_t *)&dl_ch_estimates[(p<<1)+aarx][ch_offset]); // printf("pilot[%d][%d] (%d,%d)\n",p,pilot_cnt,pil[0],pil[1]); // printf("rx[%d] -> (%d,%d)\n", k, rxF[0], rxF[1]); - ch[0] = (short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); //Re - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); //Im - // printf("**rb %d %d\n",rb,dl_ch-(short *)&dl_ch_estimates[(p<<1)+aarx][ch_offset]); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); //Re + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); //Im + // printf("**rb %d %d\n",rb,dl_ch-(int16_t *)&dl_ch_estimates[(p<<1)+aarx][ch_offset]); multadd_real_vector_complex_scalar(f, ch, dl_ch, @@ -259,9 +258,9 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, // printf("rx[%d] -> (%d,%d)\n", k+6, rxF[0], rxF[1]); - ch[0] = (short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); - // printf("**rb %d %d\n",rb,dl_ch-(short *)&dl_ch_estimates[(p<<1)+aarx][ch_offset]); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); + // printf("**rb %d %d\n",rb,dl_ch-(int16_t *)&dl_ch_estimates[(p<<1)+aarx][ch_offset]); multadd_real_vector_complex_scalar(f2, ch, dl_ch, @@ -280,17 +279,17 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, if (k > 6) k -=6; - rxF = (short *)&rxdataF[aarx][((symbol_offset+1+k))]; + rxF = (int16_t *)&rxdataF[aarx][((symbol_offset+1+k))]; for (pilot_cnt=0; pilot_cnt<((phy_vars_ue->lte_frame_parms.N_RB_DL)-3); pilot_cnt+=2) { // printf("pilot[%d][%d] (%d,%d)\n",p,pilot_cnt,pil[0],pil[1]); // printf("rx[%d] -> (%d,%d)\n", k+6, rxF[0], rxF[1]); - ch[0] = (short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); - // printf("**rb %d %d\n",rb,dl_ch-(short *)&dl_ch_estimates[(p<<1)+aarx][ch_offset]); + // printf("**rb %d %d\n",rb,dl_ch-(int16_t *)&dl_ch_estimates[(p<<1)+aarx][ch_offset]); multadd_real_vector_complex_scalar(f, ch, dl_ch, @@ -299,10 +298,10 @@ int 
lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, rxF+=12; dl_ch+=8; - ch[0] = (short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); - // printf("**rb %d %d\n",rb,dl_ch-(short *)&dl_ch_estimates[(p<<1)+aarx][ch_offset]); + // printf("**rb %d %d\n",rb,dl_ch-(int16_t *)&dl_ch_estimates[(p<<1)+aarx][ch_offset]); multadd_real_vector_complex_scalar(f2, ch, dl_ch, @@ -313,8 +312,8 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, } - ch[0] = (short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); // printf("pilot 49: rxF -> (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]); multadd_real_vector_complex_scalar(fr, @@ -325,8 +324,8 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, rxF+=12; dl_ch+=8; - ch[0] = (short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); // printf("pilot 50: rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]); multadd_real_vector_complex_scalar(f2r2, ch, @@ -340,8 +339,8 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, //printf("Channel estimation\n"); // Treat first 2 pilots specially (left edge) - ch[0] = (short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); #ifdef DEBUG_CH printf("pilot 0 : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]); @@ -358,8 +357,8 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, rxF+=12; dl_ch+=8; - ch[0] = (short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); #ifdef DEBUG_CH printf("pilot 1 : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]); @@ -381,8 +380,8 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, // printf("pilot[%d][%d] (%d,%d)\n",p,rb,pil[0],pil[1]); // printf("rx[%d][%d] -> (%d,%d)\n",p,phy_vars_ue->lte_frame_parms.first_carrier_offset + phy_vars_ue->lte_frame_parms.nushift + 6*rb+(3*p),rxF[0],rxF[1]); - ch[0] = (short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); #ifdef DEBUG_CH printf("pilot %d : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",pilot_cnt,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]); @@ -400,8 +399,8 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, rxF+=12; dl_ch+=8; - ch[0] = (short)(((int)pil[0]*rxF[0] -
(int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); #ifdef DEBUG_CH printf("pilot %d : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",pilot_cnt+1,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]); @@ -419,8 +418,8 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, } - ch[0] = (short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); #ifdef DEBUG_CH printf("pilot 24: rxF -> (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]); @@ -438,10 +437,10 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, // printf("Second half\n"); // Second half of RBs - rxF = (short *)&rxdataF[aarx][((symbol_offset+1+k))]; + rxF = (int16_t *)&rxdataF[aarx][((symbol_offset+1+k))]; - ch[0] = (short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); #ifdef DEBUG_CH printf("pilot 25: rxF -> (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]); @@ -462,8 +461,8 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, // printf("* pilot[%d][%d] (%d,%d)\n",p,rb,pil[0],pil[1]); // printf("rx[%d][%d] -> (%d,%d)\n",p,phy_vars_ue->lte_frame_parms.first_carrier_offset + phy_vars_ue->lte_frame_parms.nushift + 6*rb+(3*p),rxF[0],rxF[1]); - ch[0] = (short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); #ifdef DEBUG_CH printf("pilot %d rxF -> (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",26+pilot_cnt,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]); @@ -479,8 +478,8 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, rxF+=12; dl_ch+=8; - ch[0] = (short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); #ifdef DEBUG_CH printf("pilot %d : rxF -> (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",27+pilot_cnt,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]); @@ -498,8 +497,8 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, } - ch[0] = (short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); #ifdef DEBUG_CH printf("pilot 49: rxF -> (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]); @@ -517,8 +516,8 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, rxF+=12; dl_ch+=8; - ch[0] = (short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = 
(int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); #ifdef DEBUG_CH @@ -544,8 +543,8 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, // phy_vars_ue->lte_frame_parms.first_carrier_offset + phy_vars_ue->lte_frame_parms.nushift + 6*rb+(3*p), // rxF[0], // rxF[1]); - ch[0] = (short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); //printf("ch -> (%d,%d)\n",ch[0],ch[1]); multadd_real_vector_complex_scalar(f, ch, @@ -555,8 +554,8 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, rxF+=12; dl_ch+=8; - ch[0] = (short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); //printf("ch -> (%d,%d)\n",ch[0],ch[1]); multadd_real_vector_complex_scalar(f2, ch, @@ -568,8 +567,8 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, } - ch[0] = (short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); // printf("ch -> (%d,%d)\n",ch[0],ch[1]); multadd_real_vector_complex_scalar(f, ch, @@ -580,9 +579,9 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, //printf("Second half\n"); //Second half of RBs - rxF = (short *)&rxdataF[aarx][((symbol_offset+1+nushift + (3*p)))]; - ch[0] = (short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); + rxF = (int16_t *)&rxdataF[aarx][((symbol_offset+1+nushift + (3*p)))]; + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); multadd_real_vector_complex_scalar(f2, ch, @@ -599,8 +598,8 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, // phy_vars_ue->lte_frame_parms.first_carrier_offset + phy_vars_ue->lte_frame_parms.nushift + 6*rb+(3*p), // rxF[0], // rxF[1]); - ch[0] = (short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); multadd_real_vector_complex_scalar(f, ch, @@ -610,8 +609,8 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, rxF+=12; dl_ch+=8; - ch[0] = (short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); multadd_real_vector_complex_scalar(f2, ch, @@ -631,7 +630,7 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, // Temporal Interpolation // printf("ch_offset %d\n",ch_offset); - dl_ch = (short *)&dl_ch_estimates[(p<<1)+aarx][ch_offset]; + dl_ch = (int16_t *)&dl_ch_estimates[(p<<1)+aarx][ch_offset]; if (phy_vars_ue->high_speed_flag == 0) { multadd_complex_vector_real_scalar(dl_ch, @@ -640,8 +639,8 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, } else { // 
high_speed_flag == 1 if (symbol == 0) { // printf("Interpolating %d->0\n",4-phy_vars_ue->lte_frame_parms.Ncp); - // dl_ch_prev = (short *)&dl_ch_estimates[(p<<1)+aarx][(4-phy_vars_ue->lte_frame_parms.Ncp)*(phy_vars_ue->lte_frame_parms.ofdm_symbol_size)]; - dl_ch_prev = (short *)&dl_ch_estimates[(p<<1)+aarx][pilot3*(phy_vars_ue->lte_frame_parms.ofdm_symbol_size)]; + // dl_ch_prev = (int16_t *)&dl_ch_estimates[(p<<1)+aarx][(4-phy_vars_ue->lte_frame_parms.Ncp)*(phy_vars_ue->lte_frame_parms.ofdm_symbol_size)]; + dl_ch_prev = (int16_t *)&dl_ch_estimates[(p<<1)+aarx][pilot3*(phy_vars_ue->lte_frame_parms.ofdm_symbol_size)]; multadd_complex_vector_real_scalar(dl_ch_prev,21845,dl_ch_prev+(2*(phy_vars_ue->lte_frame_parms.ofdm_symbol_size)),1,phy_vars_ue->lte_frame_parms.ofdm_symbol_size); multadd_complex_vector_real_scalar(dl_ch,10923,dl_ch_prev+(2*(phy_vars_ue->lte_frame_parms.ofdm_symbol_size)),0,phy_vars_ue->lte_frame_parms.ofdm_symbol_size); @@ -650,7 +649,7 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, multadd_complex_vector_real_scalar(dl_ch,21845,dl_ch_prev+(2*((phy_vars_ue->lte_frame_parms.ofdm_symbol_size)<<1)),0,phy_vars_ue->lte_frame_parms.ofdm_symbol_size); } // this is 1/3,2/3 combination for pilots spaced by 3 symbols else if (symbol == pilot1) { - dl_ch_prev = (short *)&dl_ch_estimates[(p<<1)+aarx][0]; + dl_ch_prev = (int16_t *)&dl_ch_estimates[(p<<1)+aarx][0]; if (phy_vars_ue->lte_frame_parms.Ncp==0) {// pilot spacing 4 symbols (1/4,1/2,3/4 combination) multadd_complex_vector_real_scalar(dl_ch_prev,24576,dl_ch_prev+(2*(phy_vars_ue->lte_frame_parms.ofdm_symbol_size)),1,phy_vars_ue->lte_frame_parms.ofdm_symbol_size); @@ -669,7 +668,7 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, multadd_complex_vector_real_scalar(dl_ch,10923,dl_ch_prev+(2*((phy_vars_ue->lte_frame_parms.ofdm_symbol_size)<<1)),0,phy_vars_ue->lte_frame_parms.ofdm_symbol_size); } // pilot spacing 3 symbols (1/3,2/3 combination) } else if (symbol == pilot2) { - dl_ch_prev = (short *)&dl_ch_estimates[(p<<1)+aarx][pilot1*(phy_vars_ue->lte_frame_parms.ofdm_symbol_size)]; + dl_ch_prev = (int16_t *)&dl_ch_estimates[(p<<1)+aarx][pilot1*(phy_vars_ue->lte_frame_parms.ofdm_symbol_size)]; multadd_complex_vector_real_scalar(dl_ch_prev,21845,dl_ch_prev+(2*(phy_vars_ue->lte_frame_parms.ofdm_symbol_size)),1,phy_vars_ue->lte_frame_parms.ofdm_symbol_size); multadd_complex_vector_real_scalar(dl_ch,10923,dl_ch_prev+(2*(phy_vars_ue->lte_frame_parms.ofdm_symbol_size)),0,phy_vars_ue->lte_frame_parms.ofdm_symbol_size); @@ -678,7 +677,7 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, multadd_complex_vector_real_scalar(dl_ch,21845,dl_ch_prev+(2*((phy_vars_ue->lte_frame_parms.ofdm_symbol_size)<<1)),0,phy_vars_ue->lte_frame_parms.ofdm_symbol_size); } else { // symbol == pilot3 // printf("Interpolating 0->%d\n",4-phy_vars_ue->lte_frame_parms.Ncp); - dl_ch_prev = (short *)&dl_ch_estimates[(p<<1)+aarx][pilot2*(phy_vars_ue->lte_frame_parms.ofdm_symbol_size)]; + dl_ch_prev = (int16_t *)&dl_ch_estimates[(p<<1)+aarx][pilot2*(phy_vars_ue->lte_frame_parms.ofdm_symbol_size)]; if (phy_vars_ue->lte_frame_parms.Ncp==0) {// pilot spacing 4 symbols (1/4,1/2,3/4 combination) multadd_complex_vector_real_scalar(dl_ch_prev,24576,dl_ch_prev+(2*(phy_vars_ue->lte_frame_parms.ofdm_symbol_size)),1,phy_vars_ue->lte_frame_parms.ofdm_symbol_size); diff --git a/openair1/PHY/LTE_ESTIMATION/lte_est_freq_offset.c b/openair1/PHY/LTE_ESTIMATION/lte_est_freq_offset.c index 77e5be8500..b11ba532f5 100644 --- 
a/openair1/PHY/LTE_ESTIMATION/lte_est_freq_offset.c +++ b/openair1/PHY/LTE_ESTIMATION/lte_est_freq_offset.c @@ -35,7 +35,11 @@ #include "PHY/defs.h" //#define DEBUG_PHY +#if defined(__x86_64__) || defined(__i386__) __m128i avg128F; +#elif defined(__arm__) +int32x4_t avg128F; +#endif //compute average channel_level on each (TX,RX) antenna pair int dl_channel_level(int16_t *dl_ch, @@ -43,10 +47,15 @@ int dl_channel_level(int16_t *dl_ch, { int16_t rb; +#if defined(__x86_64__) || defined(__i386__) __m128i *dl_ch128; +#elif defined(__arm__) + int16x4_t *dl_ch128; +#endif int avg; //clear average level +#if defined(__x86_64__) || defined(__i386__) avg128F = _mm_setzero_si128(); dl_ch128=(__m128i *)dl_ch; @@ -59,7 +68,25 @@ int dl_channel_level(int16_t *dl_ch, dl_ch128+=3; } +#elif defined(__arm__) + avg128F = vdupq_n_s32(0); + dl_ch128=(int16x4_t *)dl_ch; + for (rb=0; rb<frame_parms->N_RB_DL; rb++) { + + avg128F = vqaddq_s32(avg128F,vmull_s16(dl_ch128[0],dl_ch128[0])); + avg128F = vqaddq_s32(avg128F,vmull_s16(dl_ch128[1],dl_ch128[1])); + avg128F = vqaddq_s32(avg128F,vmull_s16(dl_ch128[2],dl_ch128[2])); + avg128F = vqaddq_s32(avg128F,vmull_s16(dl_ch128[3],dl_ch128[3])); + avg128F = vqaddq_s32(avg128F,vmull_s16(dl_ch128[4],dl_ch128[4])); + avg128F = vqaddq_s32(avg128F,vmull_s16(dl_ch128[5],dl_ch128[5])); + dl_ch128+=6; + + + } + + +#endif DevAssert( frame_parms->N_RB_DL ); avg = (((int*)&avg128F)[0] + ((int*)&avg128F)[1] + @@ -67,10 +94,10 @@ int dl_channel_level(int16_t *dl_ch, ((int*)&avg128F)[3])/(frame_parms->N_RB_DL*12); - +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); - +#endif return(avg); } diff --git a/openair1/PHY/LTE_ESTIMATION/lte_sync_timefreq.c b/openair1/PHY/LTE_ESTIMATION/lte_sync_timefreq.c index 6a5a6eb3c1..37f5d26315 100644 --- a/openair1/PHY/LTE_ESTIMATION/lte_sync_timefreq.c +++ b/openair1/PHY/LTE_ESTIMATION/lte_sync_timefreq.c @@ -46,14 +46,15 @@ //#include "defs.h" #include "PHY/defs.h" #include "PHY/extern.h" -#include "pss6144.h" - +#if defined(__x86_64__) || defined(__i386__) +#include "pss6144.h" extern void print_shorts(char*,__m128i*); +#endif void lte_sync_timefreq(PHY_VARS_UE *ue,int band,unsigned int DL_freq) { - +#if defined(__x86_64__) || defined(__i386__) UE_SCAN_INFO_t *scan_info = &ue->scan_info[band]; int16_t spectrum[12288] __attribute__((aligned(16))); int16_t spectrum_p5ms[12288] __attribute__((aligned(16))); @@ -358,5 +359,6 @@ void lte_sync_timefreq(PHY_VARS_UE *ue,int band,unsigned int DL_freq) for (band_idx=0; band_idx<10; band_idx++) printf("pss 2: level %d dB, freq %u\n", dB_fixed(scan_info->amp[2][band_idx]),scan_info->freq_offset_Hz[2][band_idx]); +#endif } diff --git a/openair1/PHY/LTE_ESTIMATION/lte_ue_measurements.c b/openair1/PHY/LTE_ESTIMATION/lte_ue_measurements.c index dbd385dbdc..dcef22dc99 100644 --- a/openair1/PHY/LTE_ESTIMATION/lte_ue_measurements.c +++ b/openair1/PHY/LTE_ESTIMATION/lte_ue_measurements.c @@ -42,34 +42,26 @@ //#define DEBUG_MEAS #ifdef USER_MODE -void print_shorts(char *s,__m128i *x) +void print_shorts(char *s,short *x) { - short *tempb = (short *)x; printf("%s : %d,%d,%d,%d,%d,%d,%d,%d\n",s, - tempb[0],tempb[1],tempb[2],tempb[3],tempb[4],tempb[5],tempb[6],tempb[7] + x[0],x[1],x[2],x[3],x[4],x[5],x[6],x[7] ); } -void print_ints(char *s,__m128i *x) +void print_ints(char *s,int *x) { - int *tempb = (int *)x; printf("%s : %d,%d,%d,%d\n",s, - tempb[0],tempb[1],tempb[2],tempb[3] + x[0],x[1],x[2],x[3] ); } #endif -__m128i pmi128_re __attribute__ ((aligned(16))); -__m128i pmi128_im __attribute__ 
((aligned(16))); -__m128i mmtmpPMI0 __attribute__ ((aligned(16))); -__m128i mmtmpPMI1 __attribute__ ((aligned(16))); -__m128i mmtmpPMI2 __attribute__ ((aligned(16))); -__m128i mmtmpPMI3 __attribute__ ((aligned(16))); int16_t get_PL(uint8_t Mod_id,uint8_t CC_id,uint8_t eNB_index) { @@ -421,7 +413,11 @@ void lte_ue_measurements(PHY_VARS_UE *phy_vars_ue, //int rx_power[NUMBER_OF_CONNECTED_eNB_MAX]; int i; unsigned int limit,subband; +#if defined(__x86_64__) || defined(__i386__) __m128i *dl_ch0_128,*dl_ch1_128; +#elif defined(__arm__) + int16x8_t *dl_ch0_128, *dl_ch1_128; +#endif int *dl_ch0,*dl_ch1; LTE_DL_FRAME_PARMS *frame_parms = &phy_vars_ue->lte_frame_parms; int nb_subbands,subband_size,last_subband_size; @@ -605,26 +601,30 @@ void lte_ue_measurements(PHY_VARS_UE *phy_vars_ue, for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) { // skip the first 4 RE due to interpolation filter length of 5 (not possible to skip 5 due to 128i alignment, must be multiple of 128bit) + +#if defined(__x86_64__) || defined(__i386__) + __m128i pmi128_re,pmi128_im,mmtmpPMI0,mmtmpPMI1,mmtmpPMI2,mmtmpPMI3; + dl_ch0_128 = (__m128i *)&phy_vars_ue->lte_ue_common_vars.dl_ch_estimates[eNB_id][aarx][4]; dl_ch1_128 = (__m128i *)&phy_vars_ue->lte_ue_common_vars.dl_ch_estimates[eNB_id][2+aarx][4]; +#elif defined(__arm__) + int32x4_t pmi128_re,pmi128_im,mmtmpPMI0,mmtmpPMI1,mmtmpPMI0b,mmtmpPMI1b; - /* - #ifdef DEBUG_PHY - if(eNB_id==0){ - print_shorts("Ch0",dl_ch0_128); - print_shorts("Ch1",dl_ch1_128); - printf("eNB_ID = %d\n",eNB_id); - } - #endif - */ + dl_ch0_128 = (int16x8_t *)&phy_vars_ue->lte_ue_common_vars.dl_ch_estimates[eNB_id][aarx][4]; + dl_ch1_128 = (int16x8_t *)&phy_vars_ue->lte_ue_common_vars.dl_ch_estimates[eNB_id][2+aarx][4]; + +#endif for (subband=0; subband<nb_subbands; subband++) { // pmi - +#if defined(__x86_64__) || defined(__i386__) pmi128_re = _mm_setzero_si128(); pmi128_im = _mm_setzero_si128(); - +#elif defined(__arm__) + pmi128_re = vdupq_n_s32(0); + pmi128_im = vdupq_n_s32(0); +#endif // limit is the number of groups of 4 REs in a subband (12 = 4 RBs, 3 = 1 RB) // for 5 MHz channelization, there are 7 subbands, 6 of size 4 RBs and 1 of size 1 RB if ((N_RB_DL==6) || (subband<(nb_subbands-1))) @@ -636,52 +636,33 @@ void lte_ue_measurements(PHY_VARS_UE *phy_vars_ue, // For each RE in subband perform ch0 * conj(ch1) // multiply by conjugated channel - // if(eNB_id==0){ - //print_shorts("ch0",dl_ch0_128); - //print_shorts("ch1",dl_ch1_128); - // } - // if(i==0){ - mmtmpPMI0 = _mm_setzero_si128(); - mmtmpPMI1 = _mm_setzero_si128(); - // } - // if(eNB_id==0) - // print_ints("Pre_re",&mmtmpPMI0); - - mmtmpPMI0 = _mm_madd_epi16(dl_ch0_128[0],dl_ch1_128[0]); - // if(eNB_id==0) - // print_ints("re",&mmtmpPMI0); - - // mmtmpPMI0 contains real part of 4 consecutive outputs (32-bit) - // print_shorts("Ch1",dl_ch1_128); - +#if defined(__x86_64__) || defined(__i386__) mmtmpPMI1 = _mm_shufflelo_epi16(dl_ch1_128[0],_MM_SHUFFLE(2,3,0,1));//_MM_SHUFFLE(2,3,0,1) - // print_shorts("mmtmpPMI1:",&mmtmpPMI1); mmtmpPMI1 = _mm_shufflehi_epi16(mmtmpPMI1,_MM_SHUFFLE(2,3,0,1)); - // print_shorts("mmtmpPMI1:",&mmtmpPMI1); - mmtmpPMI1 = _mm_sign_epi16(mmtmpPMI1,*(__m128i*)&conjugate[0]); - // print_shorts("mmtmpPMI1:",&mmtmpPMI1); mmtmpPMI1 = _mm_madd_epi16(mmtmpPMI1,dl_ch0_128[0]); - // if(eNB_id==0) - // print_ints("im",&mmtmpPMI1); // mmtmpPMI1 contains imag part of 4 consecutive outputs (32-bit) pmi128_re = _mm_add_epi32(pmi128_re,mmtmpPMI0); pmi128_im = _mm_add_epi32(pmi128_im,mmtmpPMI1); +#elif defined(__arm__) + 
mmtmpPMI0 = vmull_s16(((int16x4_t*)dl_ch0_128)[0], ((int16x4_t*)dl_ch1_128)[0]); + mmtmpPMI1 = vmull_s16(((int16x4_t*)dl_ch0_128)[1], ((int16x4_t*)dl_ch1_128)[1]); + pmi128_re = vqaddq_s32(pmi128_re,vcombine_s32(vpadd_s32(vget_low_s32(mmtmpPMI0),vget_high_s32(mmtmpPMI0)),vpadd_s32(vget_low_s32(mmtmpPMI1),vget_high_s32(mmtmpPMI1)))); + + mmtmpPMI0b = vmull_s16(vrev32_s16(vmul_s16(((int16x4_t*)dl_ch0_128)[0],*(int16x4_t*)conjugate)), ((int16x4_t*)dl_ch1_128)[0]); + mmtmpPMI1b = vmull_s16(vrev32_s16(vmul_s16(((int16x4_t*)dl_ch0_128)[1],*(int16x4_t*)conjugate)), ((int16x4_t*)dl_ch1_128)[1]); + pmi128_im = vqaddq_s32(pmi128_im,vcombine_s32(vpadd_s32(vget_low_s32(mmtmpPMI0b),vget_high_s32(mmtmpPMI0b)),vpadd_s32(vget_low_s32(mmtmpPMI1b),vget_high_s32(mmtmpPMI1b)))); + +#endif dl_ch0_128++; dl_ch1_128++; } phy_vars_ue->PHY_measurements.subband_pmi_re[eNB_id][subband][aarx] = (((int *)&pmi128_re)[0] + ((int *)&pmi128_re)[1] + ((int *)&pmi128_re)[2] + ((int *)&pmi128_re)[3])>>2; - // if(eNB_id==0) - // printf("in lte_ue_measurements.c: pmi_re %d\n",phy_vars_ue->PHY_measurements.subband_pmi_re[eNB_id][subband][aarx]); phy_vars_ue->PHY_measurements.subband_pmi_im[eNB_id][subband][aarx] = (((int *)&pmi128_im)[0] + ((int *)&pmi128_im)[1] + ((int *)&pmi128_im)[2] + ((int *)&pmi128_im)[3])>>2; - // if(eNB_id==0) - // printf("in lte_ue_measurements.c: pmi_im %d\n",phy_vars_ue->PHY_measurements.subband_pmi_im[eNB_id][subband][aarx]); phy_vars_ue->PHY_measurements.wideband_pmi_re[eNB_id][aarx] += phy_vars_ue->PHY_measurements.subband_pmi_re[eNB_id][subband][aarx]; phy_vars_ue->PHY_measurements.wideband_pmi_im[eNB_id][aarx] += phy_vars_ue->PHY_measurements.subband_pmi_im[eNB_id][subband][aarx]; - // msg("subband_pmi[%d][%d][%d] => (%d,%d)\n",eNB_id,subband,aarx,phy_vars_ue->PHY_measurements.subband_pmi_re[eNB_id][subband][aarx],phy_vars_ue->PHY_measurements.subband_pmi_im[eNB_id][subband][aarx]); - } // subband loop } // rx antenna loop } // if frame_parms->mode1_flag == 0 @@ -742,9 +723,10 @@ void lte_ue_measurements(PHY_VARS_UE *phy_vars_ue, // printf("in lte_ue_measurements: selected rx_antenna[eNB_id==0]:%u\n", phy_vars_ue->PHY_measurements.selected_rx_antennas[eNB_id][i]); } // eNB_id loop +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); - +#endif } diff --git a/openair1/PHY/LTE_ESTIMATION/lte_ul_channel_estimation.c b/openair1/PHY/LTE_ESTIMATION/lte_ul_channel_estimation.c index a569ddfc71..9a47a6b355 100644 --- a/openair1/PHY/LTE_ESTIMATION/lte_ul_channel_estimation.c +++ b/openair1/PHY/LTE_ESTIMATION/lte_ul_channel_estimation.c @@ -106,9 +106,13 @@ int32_t lte_ul_channel_estimation(PHY_VARS_eNB *phy_vars_eNB, *temp_out_fft_1_ptr = (int32_t*)0,*out_fft_ptr_1 = (int32_t*)0, *temp_in_ifft_ptr = (int32_t*)0; +#if defined(__x86_64__) || defined(__i386__) __m128i *rxdataF128,*ul_ref128,*ul_ch128; __m128i mmtmpU0,mmtmpU1,mmtmpU2,mmtmpU3; - +#elif defined(__arm__) + int16x8_t *rxdataF128,*ul_ref128,*ul_ch128; + int32x4_t mmtmp0,mmtmp1,mmtmp_re,mmtmp_im; +#endif Msc_RS = N_rb_alloc*12; cyclic_shift = (frame_parms->pusch_config_common.ul_ReferenceSignalsPUSCH.cyclicShift + @@ -156,11 +160,18 @@ int32_t lte_ul_channel_estimation(PHY_VARS_eNB *phy_vars_eNB, for (aa=0; aa<nb_antennas_rx; aa++) { // msg("Componentwise prod aa %d, symbol_offset %d,ul_ch_estimates %p,ul_ch_estimates[aa] %p,ul_ref_sigs_rx[0][0][Msc_RS_idx] %p\n",aa,symbol_offset,ul_ch_estimates,ul_ch_estimates[aa],ul_ref_sigs_rx[0][0][Msc_RS_idx]); +#if defined(__x86_64__) || defined(__i386__) rxdataF128 = (__m128i 
*)&rxdataF_ext[aa][symbol_offset]; ul_ch128 = (__m128i *)&ul_ch_estimates[aa][symbol_offset]; ul_ref128 = (__m128i *)ul_ref_sigs_rx[u][v][Msc_RS_idx]; +#elif defined(__arm__) + rxdataF128 = (int16x8_t *)&rxdataF_ext[aa][symbol_offset]; + ul_ch128 = (int16x8_t *)&ul_ch_estimates[aa][symbol_offset]; + ul_ref128 = (int16x8_t *)ul_ref_sigs_rx[u][v][Msc_RS_idx]; +#endif for (i=0; i<Msc_RS/12; i++) { +#if defined(__x86_64__) || defined(__i386__) // multiply by conjugated channel mmtmpU0 = _mm_madd_epi16(ul_ref128[0],rxdataF128[0]); // mmtmpU0 contains real part of 4 consecutive outputs (32-bit) @@ -204,7 +215,50 @@ int32_t lte_ul_channel_estimation(PHY_VARS_eNB *phy_vars_eNB, mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1); ul_ch128[2] = _mm_packs_epi32(mmtmpU2,mmtmpU3); +#elif defined(__arm__) + mmtmp0 = vmull_s16(((int16x4_t*)ul_ref128)[0],((int16x4_t*)rxdataF128)[0]); + mmtmp1 = vmull_s16(((int16x4_t*)ul_ref128)[1],((int16x4_t*)rxdataF128)[1]); + mmtmp_re = vcombine_s32(vpadd_s32(vget_low_s32(mmtmp0),vget_high_s32(mmtmp0)), + vpadd_s32(vget_low_s32(mmtmp1),vget_high_s32(mmtmp1))); + mmtmp0 = vmull_s16(vrev32_s16(vmul_s16(((int16x4_t*)ul_ref128)[0],*(int16x4_t*)conjugate)), ((int16x4_t*)rxdataF128)[0]); + mmtmp1 = vmull_s16(vrev32_s16(vmul_s16(((int16x4_t*)ul_ref128)[1],*(int16x4_t*)conjugate)), ((int16x4_t*)rxdataF128)[1]); + mmtmp_im = vcombine_s32(vpadd_s32(vget_low_s32(mmtmp0),vget_high_s32(mmtmp0)), + vpadd_s32(vget_low_s32(mmtmp1),vget_high_s32(mmtmp1))); + + ul_ch128[0] = vcombine_s16(vmovn_s32(mmtmp_re),vmovn_s32(mmtmp_im)); + ul_ch128++; + ul_ref128++; + rxdataF128++; + mmtmp0 = vmull_s16(((int16x4_t*)ul_ref128)[0],((int16x4_t*)rxdataF128)[0]); + mmtmp1 = vmull_s16(((int16x4_t*)ul_ref128)[1],((int16x4_t*)rxdataF128)[1]); + mmtmp_re = vcombine_s32(vpadd_s32(vget_low_s32(mmtmp0),vget_high_s32(mmtmp0)), + vpadd_s32(vget_low_s32(mmtmp1),vget_high_s32(mmtmp1))); + mmtmp0 = vmull_s16(vrev32_s16(vmul_s16(((int16x4_t*)ul_ref128)[0],*(int16x4_t*)conjugate)), ((int16x4_t*)rxdataF128)[0]); + mmtmp1 = vmull_s16(vrev32_s16(vmul_s16(((int16x4_t*)ul_ref128)[1],*(int16x4_t*)conjugate)), ((int16x4_t*)rxdataF128)[1]); + mmtmp_im = vcombine_s32(vpadd_s32(vget_low_s32(mmtmp0),vget_high_s32(mmtmp0)), + vpadd_s32(vget_low_s32(mmtmp1),vget_high_s32(mmtmp1))); + + ul_ch128[0] = vcombine_s16(vmovn_s32(mmtmp_re),vmovn_s32(mmtmp_im)); + ul_ch128++; + ul_ref128++; + rxdataF128++; + + mmtmp0 = vmull_s16(((int16x4_t*)ul_ref128)[0],((int16x4_t*)rxdataF128)[0]); + mmtmp1 = vmull_s16(((int16x4_t*)ul_ref128)[1],((int16x4_t*)rxdataF128)[1]); + mmtmp_re = vcombine_s32(vpadd_s32(vget_low_s32(mmtmp0),vget_high_s32(mmtmp0)), + vpadd_s32(vget_low_s32(mmtmp1),vget_high_s32(mmtmp1))); + mmtmp0 = vmull_s16(vrev32_s16(vmul_s16(((int16x4_t*)ul_ref128)[0],*(int16x4_t*)conjugate)), ((int16x4_t*)rxdataF128)[0]); + mmtmp1 = vmull_s16(vrev32_s16(vmul_s16(((int16x4_t*)ul_ref128)[1],*(int16x4_t*)conjugate)), ((int16x4_t*)rxdataF128)[1]); + mmtmp_im = vcombine_s32(vpadd_s32(vget_low_s32(mmtmp0),vget_high_s32(mmtmp0)), + vpadd_s32(vget_low_s32(mmtmp1),vget_high_s32(mmtmp1))); + + ul_ch128[0] = vcombine_s16(vmovn_s32(mmtmp_re),vmovn_s32(mmtmp_im)); + ul_ch128++; + ul_ref128++; + rxdataF128++; + +#endif +#if defined(__x86_64__) || defined(__i386__) // NEON path above has already advanced these pointers by 3 ul_ch128+=3; ul_ref128+=3; rxdataF128+=3; +#endif @@ -538,17 +592,17 @@ int32_t lte_ul_channel_estimation(PHY_VARS_eNB *phy_vars_eNB, // msg("sym: %d, current_phase1: %d, ru: %d + j%d, current_phase2: %d, ru: %d + j%d\n",k,current_phase1,ru1[2*current_phase1],ru1[2*current_phase1+1],current_phase2,ru2[2*current_phase2],ru2[2*current_phase2+1]); //
rotate channel estimates by estimated phase - rotate_cpx_vector_norep((int16_t*) ul_ch1, - &ru1[2*current_phase1], - (int16_t*) &ul_ch_estimates[aa][frame_parms->N_RB_UL*12*k], - Msc_RS, - 15); - - rotate_cpx_vector_norep((int16_t*) ul_ch2, - &ru2[2*current_phase2], - (int16_t*) &tmp_estimates[0], - Msc_RS, - 15); + rotate_cpx_vector((int16_t*) ul_ch1, + &ru1[2*current_phase1], + (int16_t*) &ul_ch_estimates[aa][frame_parms->N_RB_UL*12*k], + Msc_RS, + 15); + + rotate_cpx_vector((int16_t*) ul_ch2, + &ru2[2*current_phase2], + (int16_t*) &tmp_estimates[0], + Msc_RS, + 15); // Combine the two rotated estimates multadd_complex_vector_real_scalar((int16_t*) &ul_ch_estimates[aa][frame_parms->N_RB_UL*12*k],SCALE,(int16_t*) &ul_ch_estimates[aa][frame_parms->N_RB_UL*12*k],1,Msc_RS); @@ -664,11 +718,11 @@ int32_t lte_srs_channel_estimation(LTE_DL_FRAME_PARMS *frame_parms, //write_output("eNb_rxF.m","rxF",&eNb_common_vars->rxdataF[0][aa][2*frame_parms->ofdm_symbol_size*symbol],2*(frame_parms->ofdm_symbol_size),2,1); //write_output("eNb_srs.m","srs_eNb",eNb_common_vars->srs,(frame_parms->ofdm_symbol_size),1,1); - mult_cpx_vector_norep((int16_t*) &eNb_common_vars->rxdataF[eNb_id][aa][2*frame_parms->ofdm_symbol_size*symbol], - (int16_t*) eNb_srs_vars->srs, - (int16_t*) eNb_srs_vars->srs_ch_estimates[eNb_id][aa], - frame_parms->ofdm_symbol_size, - 15); + mult_cpx_conj_vector((int16_t*) &eNb_common_vars->rxdataF[eNb_id][aa][2*frame_parms->ofdm_symbol_size*symbol], + (int16_t*) eNb_srs_vars->srs, + (int16_t*) eNb_srs_vars->srs_ch_estimates[eNb_id][aa], + frame_parms->ofdm_symbol_size, + 15); //msg("SRS channel estimation cmult out\n"); #ifdef USER_MODE @@ -695,6 +749,7 @@ int16_t lte_ul_freq_offset_estimation(LTE_DL_FRAME_PARMS *frame_parms, uint16_t nb_rb) { +#if defined(__x86_64__) || defined(__i386__) int k, rb; int a_idx = 64; uint8_t conj_flag = 0; @@ -830,4 +885,7 @@ int16_t lte_ul_freq_offset_estimation(LTE_DL_FRAME_PARMS *frame_parms, phase_idx = -phase_idx; return(phase_idx); +#elif defined(__arm__) + return(0); +#endif } diff --git a/openair1/PHY/LTE_REFSIG/lte_dl_cell_spec.c b/openair1/PHY/LTE_REFSIG/lte_dl_cell_spec.c index 4704e76b23..1a09c83372 100644 --- a/openair1/PHY/LTE_REFSIG/lte_dl_cell_spec.c +++ b/openair1/PHY/LTE_REFSIG/lte_dl_cell_spec.c @@ -94,9 +94,9 @@ int lte_dl_cell_spec_SS(PHY_VARS_eNB *phy_vars_eNB, output[k] = qpsk[(phy_vars_eNB->lte_gold_table[Ns][l][mprime_dword]>>(2*mprime_qpsk_symb))&3]; //output[k] = (lte_gold_table[eNB_offset][Ns][l][mprime_dword]>>(2*mprime_qpsk_symb))&3; #ifdef DEBUG_DL_CELL_SPEC - debug_msg("Ns %d, l %d, m %d,mprime_dword %d, mprime_qpsk_symbol %d\n", + msg("Ns %d, l %d, m %d,mprime_dword %d, mprime_qpsk_symbol %d\n", Ns,l,m,mprime_dword,mprime_qpsk_symb); - debug_msg("index = %d (k %d)\n",(phy_vars_eNB->lte_gold_table[Ns][l][mprime_dword]>>(2*mprime_qpsk_symb))&3,k); + msg("index = %d (k %d)\n",(phy_vars_eNB->lte_gold_table[Ns][l][mprime_dword]>>(2*mprime_qpsk_symb))&3,k); #endif mprime++; diff --git a/openair1/PHY/LTE_TRANSPORT/dci.c b/openair1/PHY/LTE_TRANSPORT/dci.c index f46e43a58a..2e485e66fa 100644 --- a/openair1/PHY/LTE_TRANSPORT/dci.c +++ b/openair1/PHY/LTE_TRANSPORT/dci.c @@ -560,10 +560,10 @@ int32_t pdcch_qpsk_qpsk_llr(LTE_DL_FRAME_PARMS *frame_parms, uint8_t symbol) { - __m128i *rxF=(__m128i*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; - __m128i *rxF_i=(__m128i*)&rxdataF_comp_i[0][(symbol*frame_parms->N_RB_DL*12)]; - __m128i *rho=(__m128i*)&rho_i[0][(symbol*frame_parms->N_RB_DL*12)]; - __m128i *llr128; + int16_t 
*rxF=(int16_t*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; + int16_t *rxF_i=(int16_t*)&rxdataF_comp_i[0][(symbol*frame_parms->N_RB_DL*12)]; + int16_t *rho=(int16_t*)&rho_i[0][(symbol*frame_parms->N_RB_DL*12)]; + int16_t *llr128; int32_t i; char *pdcch_llr8; int16_t *pdcch_llr; @@ -572,17 +572,17 @@ int32_t pdcch_qpsk_qpsk_llr(LTE_DL_FRAME_PARMS *frame_parms, // printf("dlsch_qpsk_qpsk: symbol %d\n",symbol); - llr128 = (__m128i*)pdcch_llr; + llr128 = (int16_t*)pdcch_llr; if (!llr128) { msg("dlsch_qpsk_qpsk_llr: llr is null, symbol %d\n",symbol); return -1; } - qpsk_qpsk((int16_t *)rxF, - (int16_t *)rxF_i, - (int16_t *)llr128, - (int16_t *)rho, + qpsk_qpsk(rxF, + rxF_i, + llr128, + rho, frame_parms->N_RB_DL*12); //prepare for Viterbi which accepts 8 bit, but prefers 4 bit, soft input. @@ -639,7 +639,7 @@ int32_t pdcch_llr(LTE_DL_FRAME_PARMS *frame_parms, } -__m128i avg128P; +//__m128i avg128P; //compute average channel_level on each (TX,RX) antenna pair void pdcch_channel_level(int32_t **dl_ch_estimates_ext, @@ -650,21 +650,31 @@ void pdcch_channel_level(int32_t **dl_ch_estimates_ext, int16_t rb; uint8_t aatx,aarx; +#if defined(__x86_64__) || defined(__i386__) __m128i *dl_ch128; - - + __m128i avg128P; +#elif defined(__arm__) + int16x8_t *dl_ch128; + int32x4_t avg128P; +#endif for (aatx=0; aatx<frame_parms->nb_antennas_tx_eNB; aatx++) for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) { //clear average level +#if defined(__x86_64__) || defined(__i386__) avg128P = _mm_setzero_si128(); dl_ch128=(__m128i *)&dl_ch_estimates_ext[(aatx<<1)+aarx][frame_parms->N_RB_DL*12]; +#elif defined(__arm__) +#endif for (rb=0; rb<nb_rb; rb++) { +#if defined(__x86_64__) || defined(__i386__) avg128P = _mm_add_epi32(avg128P,_mm_madd_epi16(dl_ch128[0],dl_ch128[0])); avg128P = _mm_add_epi32(avg128P,_mm_madd_epi16(dl_ch128[1],dl_ch128[1])); avg128P = _mm_add_epi32(avg128P,_mm_madd_epi16(dl_ch128[2],dl_ch128[2])); +#elif defined(__arm__) +#endif dl_ch128+=3; /* if (rb==0) { @@ -684,13 +694,18 @@ void pdcch_channel_level(int32_t **dl_ch_estimates_ext, // msg("Channel level : %d\n",avg[(aatx<<1)+aarx]); } +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); +#endif } +#if defined(__x86_64__) || defined(__i386__) __m128i mmtmpPD0,mmtmpPD1,mmtmpPD2,mmtmpPD3; +#elif defined(__arm__) +#endif void pdcch_dual_stream_correlation(LTE_DL_FRAME_PARMS *frame_parms, uint8_t symbol, int32_t **dl_ch_estimates_ext, @@ -700,7 +715,11 @@ void pdcch_dual_stream_correlation(LTE_DL_FRAME_PARMS *frame_parms, { uint16_t rb; +#if defined(__x86_64__) || defined(__i386__) __m128i *dl_ch128,*dl_ch128i,*dl_ch_rho128; +#elif defined(__arm__) + +#endif uint8_t aarx; // printf("dlsch_dual_stream_correlation: symbol %d\n",symbol); @@ -708,13 +727,18 @@ void pdcch_dual_stream_correlation(LTE_DL_FRAME_PARMS *frame_parms, for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) { +#if defined(__x86_64__) || defined(__i386__) dl_ch128 = (__m128i *)&dl_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*12]; dl_ch128i = (__m128i *)&dl_ch_estimates_ext_i[aarx][symbol*frame_parms->N_RB_DL*12]; dl_ch_rho128 = (__m128i *)&dl_ch_rho_ext[aarx][symbol*frame_parms->N_RB_DL*12]; +#elif defined(__arm__) + +#endif for (rb=0; rb<frame_parms->N_RB_DL; rb++) { // multiply by conjugated channel +#if defined(__x86_64__) || defined(__i386__) mmtmpPD0 = _mm_madd_epi16(dl_ch128[0],dl_ch128i[0]); // print_ints("re",&mmtmpPD0); @@ -779,13 +803,16 @@ void pdcch_dual_stream_correlation(LTE_DL_FRAME_PARMS *frame_parms, dl_ch128i+=3; dl_ch_rho128+=3; - }
- } +#elif defined(__arm__) +#endif + } + } +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); - +#endif } @@ -800,44 +827,78 @@ void pdcch_detection_mrc_i(LTE_DL_FRAME_PARMS *frame_parms, uint8_t aatx; +#if defined(__x86_64__) || defined(__i386__) __m128i *rxdataF_comp128_0,*rxdataF_comp128_1,*rxdataF_comp128_i0,*rxdataF_comp128_i1,*rho128_0,*rho128_1,*rho128_i0,*rho128_i1; +#elif defined(__arm__) + int16x8_t *rxdataF_comp128_0,*rxdataF_comp128_1,*rxdataF_comp128_i0,*rxdataF_comp128_i1,*rho128_0,*rho128_1,*rho128_i0,*rho128_i1; +#endif int32_t i; if (frame_parms->nb_antennas_rx>1) { for (aatx=0; aatx<frame_parms->nb_antennas_tx_eNB; aatx++) { //if (frame_parms->mode1_flag && (aatx>0)) break; +#if defined(__x86_64__) || defined(__i386__) rxdataF_comp128_0 = (__m128i *)&rxdataF_comp[(aatx<<1)][symbol*frame_parms->N_RB_DL*12]; rxdataF_comp128_1 = (__m128i *)&rxdataF_comp[(aatx<<1)+1][symbol*frame_parms->N_RB_DL*12]; - +#elif defined(__arm__) + rxdataF_comp128_0 = (int16x8_t *)&rxdataF_comp[(aatx<<1)][symbol*frame_parms->N_RB_DL*12]; + rxdataF_comp128_1 = (int16x8_t *)&rxdataF_comp[(aatx<<1)+1][symbol*frame_parms->N_RB_DL*12]; +#endif // MRC on each re of rb on MF output for (i=0; i<frame_parms->N_RB_DL*3; i++) { +#if defined(__x86_64__) || defined(__i386__) rxdataF_comp128_0[i] = _mm_adds_epi16(_mm_srai_epi16(rxdataF_comp128_0[i],1),_mm_srai_epi16(rxdataF_comp128_1[i],1)); +#elif defined(__arm__) + rxdataF_comp128_0[i] = vhaddq_s16(rxdataF_comp128_0[i],rxdataF_comp128_1[i]); +#endif } } +#if defined(__x86_64__) || defined(__i386__) rho128_0 = (__m128i *) &rho[0][symbol*frame_parms->N_RB_DL*12]; rho128_1 = (__m128i *) &rho[1][symbol*frame_parms->N_RB_DL*12]; - +#elif defined(__arm__) + rho128_0 = (int16x8_t *) &rho[0][symbol*frame_parms->N_RB_DL*12]; + rho128_1 = (int16x8_t *) &rho[1][symbol*frame_parms->N_RB_DL*12]; +#endif for (i=0; i<frame_parms->N_RB_DL*3; i++) { +#if defined(__x86_64__) || defined(__i386__) rho128_0[i] = _mm_adds_epi16(_mm_srai_epi16(rho128_0[i],1),_mm_srai_epi16(rho128_1[i],1)); +#elif defined(__arm__) + rho128_0[i] = vhaddq_s16(rho128_0[i],rho128_1[i]); +#endif } +#if defined(__x86_64__) || defined(__i386__) rho128_i0 = (__m128i *) &rho_i[0][symbol*frame_parms->N_RB_DL*12]; rho128_i1 = (__m128i *) &rho_i[1][symbol*frame_parms->N_RB_DL*12]; rxdataF_comp128_i0 = (__m128i *)&rxdataF_comp_i[0][symbol*frame_parms->N_RB_DL*12]; rxdataF_comp128_i1 = (__m128i *)&rxdataF_comp_i[1][symbol*frame_parms->N_RB_DL*12]; +#elif defined(__arm__) + rho128_i0 = (int16x8_t*) &rho_i[0][symbol*frame_parms->N_RB_DL*12]; + rho128_i1 = (int16x8_t*) &rho_i[1][symbol*frame_parms->N_RB_DL*12]; + rxdataF_comp128_i0 = (int16x8_t *)&rxdataF_comp_i[0][symbol*frame_parms->N_RB_DL*12]; + rxdataF_comp128_i1 = (int16x8_t *)&rxdataF_comp_i[1][symbol*frame_parms->N_RB_DL*12]; +#endif // MRC on each re of rb on MF and rho for (i=0; i<frame_parms->N_RB_DL*3; i++) { +#if defined(__x86_64__) || defined(__i386__) rxdataF_comp128_i0[i] = _mm_adds_epi16(_mm_srai_epi16(rxdataF_comp128_i0[i],1),_mm_srai_epi16(rxdataF_comp128_i1[i],1)); rho128_i0[i] = _mm_adds_epi16(_mm_srai_epi16(rho128_i0[i],1),_mm_srai_epi16(rho128_i1[i],1)); +#elif defined(__arm__) + rxdataF_comp128_i0[i] = vhaddq_s16(rxdataF_comp128_i0[i],rxdataF_comp128_i1[i]); + rho128_i0[i] = vhaddq_s16(rho128_i0[i],rho128_i1[i]); + +#endif } } +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); - +#endif } @@ -1056,10 +1117,6 @@ void pdcch_extract_rbs_single(int32_t **rxdataF, } } } - - _mm_empty(); - _m_empty(); - } 
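
(Sketch, not part of the patch: the MRC hunks above collapse the SSE halve-then-saturating-add pair into NEON's single halving add. A scalar view of what both branches compute per 16-bit lane, assuming 16-bit I/Q samples; the two forms can differ by at most one LSB.)

#include <stdint.h>

/* NEON vhaddq_s16 semantics: (a+b)>>1 per lane, computed on a widened sum. */
static inline int16_t mrc_combine_neon(int16_t a, int16_t b)
{
  return (int16_t)(((int32_t)a + (int32_t)b) >> 1);
}

/* SSE path: halve each input first, then add with saturation. */
static inline int16_t mrc_combine_sse(int16_t a, int16_t b)
{
  return (int16_t)((a >> 1) + (b >> 1));
}
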
void pdcch_extract_rbs_dual(int32_t **rxdataF,
@@ -1310,11 +1367,6 @@ void pdcch_extract_rbs_dual(int32_t **rxdataF,
       }
     }
   }
-
-  _mm_empty();
-  _m_empty();
-
-
 }
@@ -1328,8 +1380,12 @@ void pdcch_channel_compensation(int32_t **rxdataF_ext,
 {
 
   uint16_t rb;
+#if defined(__x86_64__) || defined(__i386__)
   __m128i *dl_ch128,*rxdataF128,*rxdataF_comp128;
   __m128i *dl_ch128_2, *rho128;
+#elif defined(__arm__)
+
+#endif
   uint8_t aatx,aarx,pilots=0;
@@ -1347,13 +1403,18 @@ void pdcch_channel_compensation(int32_t **rxdataF_ext,
 
     for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) {
 
+#if defined(__x86_64__) || defined(__i386__)
       dl_ch128          = (__m128i *)&dl_ch_estimates_ext[(aatx<<1)+aarx][symbol*frame_parms->N_RB_DL*12];
       rxdataF128        = (__m128i *)&rxdataF_ext[aarx][symbol*frame_parms->N_RB_DL*12];
       rxdataF_comp128   = (__m128i *)&rxdataF_comp[(aatx<<1)+aarx][symbol*frame_parms->N_RB_DL*12];
 
+#elif defined(__arm__)
+
+#endif
       for (rb=0; rb<frame_parms->N_RB_DL; rb++) {
 
+#if defined(__x86_64__) || defined(__i386__)
         // multiply by conjugated channel
         mmtmpPD0 = _mm_madd_epi16(dl_ch128[0],rxdataF128[0]);
         //  print_ints("re",&mmtmpPD0);
@@ -1426,6 +1487,9 @@ void pdcch_channel_compensation(int32_t **rxdataF_ext,
         rxdataF128+=2;
         rxdataF_comp128+=2;
       }
+#elif defined(__arm__)
+
+#endif
     }
   }
 }
@@ -1434,11 +1498,18 @@ void pdcch_channel_compensation(int32_t **rxdataF_ext,
   if (rho) {
 
     for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) {
+
+#if defined(__x86_64__) || defined(__i386__)
       rho128 = (__m128i *)&rho[aarx][symbol*frame_parms->N_RB_DL*12];
       dl_ch128 = (__m128i *)&dl_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*12];
       dl_ch128_2 = (__m128i *)&dl_ch_estimates_ext[2+aarx][symbol*frame_parms->N_RB_DL*12];
+#elif defined(__arm__)
+
+#endif
       for (rb=0; rb<frame_parms->N_RB_DL; rb++) {
+#if defined(__x86_64__) || defined(__i386__)
+
         // multiply by conjugated channel
         mmtmpPD0 = _mm_madd_epi16(dl_ch128[0],dl_ch128_2[0]);
         // print_ints("re",&mmtmpD0);
@@ -1504,14 +1575,19 @@ void pdcch_channel_compensation(int32_t **rxdataF_ext,
         dl_ch128_2+=3;
         rho128+=3;
+#elif defined(__arm__)
+
+
+#endif
       }
     }
   }
 
+#if defined(__x86_64__) || defined(__i386__)
   _mm_empty();
   _m_empty();
-
+#endif
 }
 
 
void pdcch_detection_mrc(LTE_DL_FRAME_PARMS *frame_parms,
@@ -1521,23 +1597,37 @@
 
   uint8_t aatx;
 
+#if defined(__x86_64__) || defined(__i386__)
   __m128i *rxdataF_comp128_0,*rxdataF_comp128_1;
+#elif defined(__arm__)
+  int16x8_t *rxdataF_comp128_0,*rxdataF_comp128_1;
+#endif
   int32_t i;
 
   if (frame_parms->nb_antennas_rx>1) {
     for (aatx=0; aatx<frame_parms->nb_antennas_tx_eNB; aatx++) {
+#if defined(__x86_64__) || defined(__i386__)
      rxdataF_comp128_0   = (__m128i *)&rxdataF_comp[(aatx<<1)][symbol*frame_parms->N_RB_DL*12];
      rxdataF_comp128_1   = (__m128i *)&rxdataF_comp[(aatx<<1)+1][symbol*frame_parms->N_RB_DL*12];
-
+#elif defined(__arm__)
+      rxdataF_comp128_0   = (int16x8_t *)&rxdataF_comp[(aatx<<1)][symbol*frame_parms->N_RB_DL*12];
+      rxdataF_comp128_1   = (int16x8_t *)&rxdataF_comp[(aatx<<1)+1][symbol*frame_parms->N_RB_DL*12];
+#endif
      // MRC on each re of rb
      for (i=0; i<frame_parms->N_RB_DL*3; i++) {
+#if defined(__x86_64__) || defined(__i386__)
        rxdataF_comp128_0[i] = _mm_adds_epi16(_mm_srai_epi16(rxdataF_comp128_0[i],1),_mm_srai_epi16(rxdataF_comp128_1[i],1));
+#elif defined(__arm__)
+       rxdataF_comp128_0[i] = vhaddq_s16(rxdataF_comp128_0[i],rxdataF_comp128_1[i]);
+#endif
      }
    }
  }
 
+#if defined(__x86_64__) || defined(__i386__)
  _mm_empty();
  _m_empty();
+#endif
 
}
 
@@ -1593,8 +1683,6 @@ void
pdcch_alamouti(LTE_DL_FRAME_PARMS *frame_parms, } } - _mm_empty(); - _m_empty(); } @@ -2008,7 +2096,7 @@ uint8_t generate_dci_top(uint8_t num_ue_spec_dci, //memset(e, 2, DCI_BITS_MAX); // here we interpret NIL as a random QPSK sequence. That makes power estimation easier. for (i=0; i<DCI_BITS_MAX; i++) - e[i]=taus()&1; + e[i]=0;//taus()&1; e_ptr = e; diff --git a/openair1/PHY/LTE_TRANSPORT/defs.h b/openair1/PHY/LTE_TRANSPORT/defs.h index 847beb5b09..444a2b1262 100644 --- a/openair1/PHY/LTE_TRANSPORT/defs.h +++ b/openair1/PHY/LTE_TRANSPORT/defs.h @@ -139,7 +139,7 @@ typedef struct { /// Concatenated "e"-sequences (for definition see 36-212 V8.6 2009-03, p.17-18) uint8_t e[MAX_NUM_CHANNEL_BITS]; /// Turbo-code outputs (36-212 V8.6 2009-03, p.12 - uint8_t d[MAX_NUM_DLSCH_SEGMENTS][(96+3+(3*6144))]; + uint8_t *d[MAX_NUM_DLSCH_SEGMENTS];//[(96+3+(3*6144))]; /// Sub-block interleaver outputs (36-212 V8.6 2009-03, p.16-17) uint8_t w[MAX_NUM_DLSCH_SEGMENTS][3*6144]; /// Number of code segments (for definition see 36-212 V8.6 2009-03, p.9) diff --git a/openair1/PHY/LTE_TRANSPORT/dlsch_coding.c b/openair1/PHY/LTE_TRANSPORT/dlsch_coding.c index 5dd2f9b5f9..1a93020447 100644 --- a/openair1/PHY/LTE_TRANSPORT/dlsch_coding.c +++ b/openair1/PHY/LTE_TRANSPORT/dlsch_coding.c @@ -100,6 +100,10 @@ void free_eNB_dlsch(LTE_eNB_DLSCH_t *dlsch) free16(dlsch->harq_processes[i]->c[r],((r==0)?8:0) + 3+768); dlsch->harq_processes[i]->c[r] = NULL; } + if (dlsch->harq_processes[i]->d[r]) { + free16(dlsch->harq_processes[i]->d[r],(96+3+(3*6144))); + dlsch->harq_processes[i]->d[r] = NULL; + } } free16(dlsch->harq_processes[i],sizeof(LTE_DL_eNB_HARQ_t)); @@ -168,14 +172,20 @@ LTE_eNB_DLSCH_t *new_eNB_dlsch(unsigned char Kmimo,unsigned char Mdlharq,unsigne if (abstraction_flag==0) { for (r=0; r<MAX_NUM_DLSCH_SEGMENTS/bw_scaling; r++) { // account for filler in first segment and CRCs for multiple segment case - dlsch->harq_processes[i]->c[r] = (unsigned char*)malloc16(((r==0)?8:0) + 3+ 768); - + dlsch->harq_processes[i]->c[r] = (uint8_t*)malloc16(((r==0)?8:0) + 3+ 768); + dlsch->harq_processes[i]->d[r] = (uint8_t*)malloc16((96+3+(3*6144))); if (dlsch->harq_processes[i]->c[r]) { bzero(dlsch->harq_processes[i]->c[r],((r==0)?8:0) + 3+ 768); } else { msg("Can't get c\n"); exit_flag=2; } + if (dlsch->harq_processes[i]->d[r]) { + bzero(dlsch->harq_processes[i]->d[r],(96+3+(3*6144))); + } else { + msg("Can't get d\n"); + exit_flag=2; + } } } } else { @@ -190,8 +200,10 @@ LTE_eNB_DLSCH_t *new_eNB_dlsch(unsigned char Kmimo,unsigned char Mdlharq,unsigne if (abstraction_flag==0) { for (j=0; j<96; j++) - for (r=0; r<MAX_NUM_DLSCH_SEGMENTS; r++) + for (r=0; r<MAX_NUM_DLSCH_SEGMENTS/bw_scaling; r++) { + // printf("dlsch->harq_processes[%d]->d[%d] %p\n",i,r,dlsch->harq_processes[i]->d[r]); dlsch->harq_processes[i]->d[r][j] = LTE_NULL; + } } } diff --git a/openair1/PHY/LTE_TRANSPORT/dlsch_demodulation.c b/openair1/PHY/LTE_TRANSPORT/dlsch_demodulation.c index 61d91cc217..cce09acd41 100644 --- a/openair1/PHY/LTE_TRANSPORT/dlsch_demodulation.c +++ b/openair1/PHY/LTE_TRANSPORT/dlsch_demodulation.c @@ -64,18 +64,18 @@ unsigned char offset_mumimo_llr_drange[29][3]={{8,8,8},{7,7,7},{7,7,7},{7,7,7},{ {5,5,4},{5,5,5},{5,5,5},{3,3,3},{2,2,2},{2,2,2},{2,2,2}, // 16-QAM {2,2,1},{3,3,3},{3,3,3},{3,3,1},{2,2,2},{2,2,2},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}}; //64-QAM */ -/* -//first optimization try -unsigned char offset_mumimo_llr_drange[29][3]={{7, 8, 7},{6, 6, 7},{6, 6, 7},{6, 6, 6},{5, 6, 6},{5, 5, 6},{5, 5, 6},{4, 5, 4},{4, 3, 4},{3, 2, 
2},{6, 5, 5},{5, 4, 4},{5, 5, 4},{3, 3, 2},{2, 2, 1},{2, 1, 1},{2, 2, 2},{3, 3, 3},{3, 3, 2},{3, 3, 2},{3, 2, 1},{2, 2, 2},{2, 2, 2},{0, 0, 0},{0, 0, 0},{0, 0, 0},{0, 0, 0},{0, 0, 0}}; -*/ -//second optimization try -/* - unsigned char offset_mumimo_llr_drange[29][3]={{5, 8, 7},{4, 6, 8},{3, 6, 7},{7, 7, 6},{4, 7, 8},{4, 7, 4},{6, 6, 6},{3, 6, 6},{3, 6, 6},{1, 3, 4},{1, 1, 0},{3, 3, 2},{3, 4, 1},{4, 0, 1},{4, 2, 2},{3, 1, 2},{2, 1, 0},{2, 1, 1},{1, 0, 1},{1, 0, 1},{0, 0, 0},{1, 0, 0},{0, 0, 0},{0, 1, 0},{1, 0, 0},{0, 0, 0},{0, 0, 0},{0, 0, 0},{0, 0, 0}}; w -*/ + /* + //first optimization try + unsigned char offset_mumimo_llr_drange[29][3]={{7, 8, 7},{6, 6, 7},{6, 6, 7},{6, 6, 6},{5, 6, 6},{5, 5, 6},{5, 5, 6},{4, 5, 4},{4, 3, 4},{3, 2, 2},{6, 5, 5},{5, 4, 4},{5, 5, 4},{3, 3, 2},{2, 2, 1},{2, 1, 1},{2, 2, 2},{3, 3, 3},{3, 3, 2},{3, 3, 2},{3, 2, 1},{2, 2, 2},{2, 2, 2},{0, 0, 0},{0, 0, 0},{0, 0, 0},{0, 0, 0},{0, 0, 0}}; + */ + //second optimization try + /* + unsigned char offset_mumimo_llr_drange[29][3]={{5, 8, 7},{4, 6, 8},{3, 6, 7},{7, 7, 6},{4, 7, 8},{4, 7, 4},{6, 6, 6},{3, 6, 6},{3, 6, 6},{1, 3, 4},{1, 1, 0},{3, 3, 2},{3, 4, 1},{4, 0, 1},{4, 2, 2},{3, 1, 2},{2, 1, 0},{2, 1, 1},{1, 0, 1},{1, 0, 1},{0, 0, 0},{1, 0, 0},{0, 0, 0},{0, 1, 0},{1, 0, 0},{0, 0, 0},{0, 0, 0},{0, 0, 0},{0, 0, 0}}; w + */ unsigned char offset_mumimo_llr_drange[29][3]= {{0, 6, 5},{0, 4, 5},{0, 4, 5},{0, 5, 4},{0, 5, 6},{0, 5, 3},{0, 4, 4},{0, 4, 4},{0, 3, 3},{0, 1, 2},{1, 1, 0},{1, 3, 2},{3, 4, 1},{2, 0, 0},{2, 2, 2},{1, 1, 1},{2, 1, 0},{2, 1, 1},{1, 0, 1},{1, 0, 1},{0, 0, 0},{1, 0, 0},{0, 0, 0},{0, 1, 0},{1, 0, 0},{0, 0, 0},{0, 0, 0},{0, 0, 0},{0, 0, 0}}; -extern void print_shorts(char *s,__m128i *x); +extern void print_shorts(char *s,int16_t *x); int rx_pdsch(PHY_VARS_UE *phy_vars_ue, PDSCH_t type, @@ -645,11 +645,11 @@ int rx_pdsch(PHY_VARS_UE *phy_vars_ue, if (get_Qm(dlsch1_harq->mcs) == 2) { /* dlsch_qpsk_llr(frame_parms, - lte_ue_pdsch_vars[eNB_id]->rxdataF_comp0, - lte_ue_pdsch_vars[eNB_id]->llr[0], - symbol,first_symbol_flag,nb_rb, - adjust_G2(frame_parms,dlsch0_harq->rb_alloc,2,subframe,symbol), - lte_ue_pdsch_vars[eNB_id]->llr128); + lte_ue_pdsch_vars[eNB_id]->rxdataF_comp0, + lte_ue_pdsch_vars[eNB_id]->llr[0], + symbol,first_symbol_flag,nb_rb, + adjust_G2(frame_parms,dlsch0_harq->rb_alloc,2,subframe,symbol), + lte_ue_pdsch_vars[eNB_id]->llr128); */ dlsch_qpsk_qpsk_llr(frame_parms, lte_ue_pdsch_vars[eNB_id]->rxdataF_comp0, @@ -868,7 +868,9 @@ void dlsch_channel_compensation(int **rxdataF_ext, dl_ch_mag128b[0] = dl_ch_mag128[0]; dl_ch_mag128[0] = _mm_mulhi_epi16(dl_ch_mag128[0],QAM_amp128); dl_ch_mag128[0] = _mm_slli_epi16(dl_ch_mag128[0],1); - + //print_ints("Re(ch):",(int16_t*)&mmtmpD0); + //print_shorts("QAM_amp:",(int16_t*)&QAM_amp128); + //print_shorts("mag:",(int16_t*)&dl_ch_mag128[0]); dl_ch_mag128[1] = _mm_unpackhi_epi16(mmtmpD0,mmtmpD0); dl_ch_mag128b[1] = dl_ch_mag128[1]; dl_ch_mag128[1] = _mm_mulhi_epi16(dl_ch_mag128[1],QAM_amp128); @@ -1068,12 +1070,14 @@ void dlsch_channel_compensation(int **rxdataF_ext, unsigned short rb; unsigned char aatx,aarx,symbol_mod,pilots=0; - int16x4_t *dl_ch128,*dl_ch128_2,*rxdataF128,*rho128; - int32x4_t mmtmpD0,mmtmpD1; - int16x8_t *dl_ch_mag128,*dl_ch_mag128b,mmtmpD2,mmtmpD3,*rxdataF_comp128; - int16x4_t QAM_amp128,QAM_amp128b; + int16x4_t *dl_ch128,*dl_ch128_2,*rxdataF128; + int32x4_t mmtmpD0,mmtmpD1,mmtmpD0b,mmtmpD1b; + int16x8_t *dl_ch_mag128,*dl_ch_mag128b,mmtmpD2,mmtmpD3,mmtmpD4; + int16x8_t QAM_amp128,QAM_amp128b; + int16x4x2_t *rxdataF_comp128,*rho128; 
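+  // Note: rxdataF_comp128 and rho128 are int16x4x2_t* because this path
+  // stores vzip_s16() results: the zip re-interleaves the narrowed real and
+  // imaginary parts, so the in-memory layout matches what the SSE path
+  // produces with _mm_packs_epi32().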
int16_t conj[4]__attribute__((aligned(16))) = {1,-1,1,-1}; + int32x4_t output_shift128 = vmovq_n_s32(-(int32_t)output_shift); symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? symbol-(7-frame_parms->Ncp) : symbol; @@ -1081,7 +1085,6 @@ void dlsch_channel_compensation(int **rxdataF_ext, if (frame_parms->mode1_flag==1) { // 10 out of 12 so don't reduce size nb_rb=1+(5*nb_rb/6); } - else { pilots=1; } @@ -1089,177 +1092,177 @@ void dlsch_channel_compensation(int **rxdataF_ext, for (aatx=0; aatx<frame_parms->nb_antennas_tx_eNB; aatx++) { if (mod_order == 4) { - QAM_amp128 = vmov_n_s16(QAM16_n1); // 2/sqrt(10) - QAM_amp128b = vmov_n_s16(0); - + QAM_amp128 = vmovq_n_s16(QAM16_n1); // 2/sqrt(10) + QAM_amp128b = vmovq_n_s16(0); } else if (mod_order == 6) { - QAM_amp128 = vmov_n_s16(QAM64_n1); // - QAM_amp128b = vmov_n_s16(QAM64_n2); + QAM_amp128 = vmovq_n_s16(QAM64_n1); // + QAM_amp128b = vmovq_n_s16(QAM64_n2); } - // printf("comp: rxdataF_comp %p, symbol %d\n",rxdataF_comp[0],symbol); for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) { - - - dl_ch128 = (int16x4_t*)&dl_ch_estimates_ext[(aatx<<1)+aarx][symbol*frame_parms->N_RB_DL*12]; dl_ch_mag128 = (int16x8_t*)&dl_ch_mag[(aatx<<1)+aarx][symbol*frame_parms->N_RB_DL*12]; dl_ch_mag128b = (int16x8_t*)&dl_ch_magb[(aatx<<1)+aarx][symbol*frame_parms->N_RB_DL*12]; rxdataF128 = (int16x4_t*)&rxdataF_ext[aarx][symbol*frame_parms->N_RB_DL*12]; - rxdataF_comp128 = (int16x8_t*)&rxdataF_comp[(aatx<<1)+aarx][symbol*frame_parms->N_RB_DL*12]; - + rxdataF_comp128 = (int16x4x2_t*)&rxdataF_comp[(aatx<<1)+aarx][symbol*frame_parms->N_RB_DL*12]; + for (rb=0; rb<nb_rb; rb++) { - if (mod_order>2) { - // get channel amplitude if not QPSK - mmtmpD0 = vmull_s16(dl_ch128[0], dl_ch128[0]); - // mmtmpD0 = [ch0*ch0,ch1*ch1,ch2*ch2,ch3*ch3]; - mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),-output_shift); - // mmtmpD0 = [ch0*ch0 + ch1*ch1,ch0*ch0 + ch1*ch1,ch2*ch2 + ch3*ch3,ch2*ch2 + ch3*ch3]>>output_shift on 32-bits - mmtmpD1 = vmull_s16(dl_ch128[1], dl_ch128[1]); - mmtmpD1 = vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),-output_shift); - mmtmpD2 = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - // mmtmpD2 = [ch0*ch0 + ch1*ch1,ch0*ch0 + ch1*ch1,ch2*ch2 + ch3*ch3,ch2*ch2 + ch3*ch3,ch4*ch4 + ch5*ch5,ch4*ch4 + ch5*ch5,ch6*ch6 + ch7*ch7,ch6*ch6 + ch7*ch7]>>output_shift on 16-bits - mmtmpD0 = vmull_s16(dl_ch128[2], dl_ch128[2]); - mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),-output_shift); - mmtmpD1 = vmull_s16(dl_ch128[3], dl_ch128[3]); - mmtmpD1 = vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),-output_shift); - mmtmpD3 = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - - if (pilots==0) { - mmtmpD0 = vmull_s16(dl_ch128[4], dl_ch128[4]); - mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),-output_shift); - mmtmpD1 = vmull_s16(dl_ch128[5], dl_ch128[5]); - mmtmpD1 = vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),-output_shift); - mmtmpD4 = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - - - } - - dl_ch_mag128b[0] = vqdmulhq_s16(mmtmpD2,QAM_amp128b); - dl_ch_mag128b[1] = vqdmulhq_s16(mmtmpD3,QAM_amp128b); - dl_ch_mag128[0] = vqdmulhq_s16(mmtmpD2,QAM_amp128); - dl_ch_mag128[1] = vqdmulhq_s16(mmtmpD3,QAM_amp128); - - - if (pilots==0) { - dl_ch_mag128b[2] = vqdmulhq_s16(mmtmpD4,QAM_amp128b); - dl_ch_mag128[2] = vqdmulhq_s16(mmtmpD4,QAM_amp128); - } - } - - mmtmpD0 = vmull_s16(dl_ch128[0], rx_dataF128[0]); - //mmtmpD0 = [Re(ch[0])Re(rx[0]) Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1]) Im(ch[1])Im(ch[1])] - 
mmtmpD1 = vmull_s16(dl_ch128[1], rx_dataF128[1]); - //mmtmpD1 = [Re(ch[2])Re(rx[2]) Im(ch[2])Im(ch[2]) Re(ch[3])Re(rx[3]) Im(ch[3])Im(ch[3])] - mmtmpD0 = vpadd_s32(mmtmpD0,mmtmpD1); - //mmtmpD0 = [Re(ch[0])Re(rx[0])+Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1])+Im(ch[1])Im(ch[1]) Re(ch[2])Re(rx[2])+Im(ch[2])Im(ch[2]) Re(ch[3])Re(rx[3])+Im(ch[3])Im(ch[3])] - - mmtmpD0 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch128[0],*(int16x4_t*)conj)), rx_dataF128[0]); - //mmtmpD0 = [-Im(ch[0])Re(rx[0]) Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1]) Re(ch[1])Im(rx[1])] - mmtmpD1 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch128[1],*(int16x4_t*)conj)), rx_dataF128[1]); - //mmtmpD0 = [-Im(ch[2])Re(rx[2]) Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3]) Re(ch[3])Im(rx[3])] - mmtmpD1 = vpadd_s32(mmtmpD0,mmtmpD1); - //mmtmpD1 = [-Im(ch[0])Re(rx[0])+Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1])+Re(ch[1])Im(rx[1]) -Im(ch[2])Re(rx[2])+Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3])+Re(ch[3])Im(rx[3])] - - mmtmpD0 = vqshlq_s32(mmtmpD0,-output_shift); - mmtmpD1 = vqshlq_s32(mmtmpD1,-output_shift); - rxdataF_comp128[0] = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - - mmtmpD0 = vmull_s16(dl_ch128[2], rx_dataF128[2]); - mmtmpD1 = vmull_s16(dl_ch128[3], rx_dataF128[3]); - mmtmpD0 = vpadd_s32(mmtmpD0,mmtmpD1); - mmtmpD0 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch128[2],*(int16x4_t*)conj)), rx_dataF128[2]); - mmtmpD1 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch128[3],*(int16x4_t*)conj)), rx_dataF128[3]); - mmtmpD1 = vpadd_s32(mmtmpD0,mmtmpD1); - - mmtmpD0 = vqshlq_s32(mmtmpD0,-output_shift); - mmtmpD1 = vqshlq_s32(mmtmpD1,-output_shift); - rxdataF_comp128[1] = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - - if (pilots==0) { - mmtmpD0 = vmull_s16(dl_ch128[4], rx_dataF128[4]); - mmtmpD1 = vmull_s16(dl_ch128[5], rx_dataF128[5]); - mmtmpD0 = vpadd_s32(mmtmpD0,mmtmpD1); - mmtmpD0 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch128[4],*(int16x4_t*)conj)), rx_dataF128[4]); - mmtmpD1 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch128[5],*(int16x4_t*)conj)), rx_dataF128[5]); - mmtmpD1 = vpadd_s32(mmtmpD0,mmtmpD1); - - mmtmpD0 = vqshlq_s32(mmtmpD0,-output_shift); - mmtmpD1 = vqshlq_s32(mmtmpD1,-output_shift); - rxdataF_comp128[2] = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - - - dl_ch128+=6; - dl_ch_mag128+=3; - dl_ch_mag128b+=3; - rxdataF128+=6; - rxdataF_comp128+=3; - - } else { // we have a smaller PDSCH in symbols with pilots so skip last group of 4 REs and increment less - dl_ch128+=4; - dl_ch_mag128+=2; - dl_ch_mag128b+=2; - rxdataF128+=4; - rxdataF_comp128+=2; - } + if (mod_order>2) { + // get channel amplitude if not QPSK + mmtmpD0 = vmull_s16(dl_ch128[0], dl_ch128[0]); + // mmtmpD0 = [ch0*ch0,ch1*ch1,ch2*ch2,ch3*ch3]; + mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),output_shift128); + // mmtmpD0 = [ch0*ch0 + ch1*ch1,ch0*ch0 + ch1*ch1,ch2*ch2 + ch3*ch3,ch2*ch2 + ch3*ch3]>>output_shift128 on 32-bits + mmtmpD1 = vmull_s16(dl_ch128[1], dl_ch128[1]); + mmtmpD1 = vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),output_shift128); + mmtmpD2 = vcombine_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1)); + // mmtmpD2 = [ch0*ch0 + ch1*ch1,ch0*ch0 + ch1*ch1,ch2*ch2 + ch3*ch3,ch2*ch2 + ch3*ch3,ch4*ch4 + ch5*ch5,ch4*ch4 + ch5*ch5,ch6*ch6 + ch7*ch7,ch6*ch6 + ch7*ch7]>>output_shift128 on 16-bits + mmtmpD0 = vmull_s16(dl_ch128[2], dl_ch128[2]); + mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),output_shift128); + mmtmpD1 = vmull_s16(dl_ch128[3], dl_ch128[3]); + mmtmpD1 = vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),output_shift128); + mmtmpD3 = 
vcombine_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1)); + if (pilots==0) { + mmtmpD0 = vmull_s16(dl_ch128[4], dl_ch128[4]); + mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),output_shift128); + mmtmpD1 = vmull_s16(dl_ch128[5], dl_ch128[5]); + mmtmpD1 = vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),output_shift128); + mmtmpD4 = vcombine_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1)); + } + + dl_ch_mag128b[0] = vqdmulhq_s16(mmtmpD2,QAM_amp128b); + dl_ch_mag128b[1] = vqdmulhq_s16(mmtmpD3,QAM_amp128b); + dl_ch_mag128[0] = vqdmulhq_s16(mmtmpD2,QAM_amp128); + dl_ch_mag128[1] = vqdmulhq_s16(mmtmpD3,QAM_amp128); + + if (pilots==0) { + dl_ch_mag128b[2] = vqdmulhq_s16(mmtmpD4,QAM_amp128b); + dl_ch_mag128[2] = vqdmulhq_s16(mmtmpD4,QAM_amp128); + } + } + + mmtmpD0 = vmull_s16(dl_ch128[0], rxdataF128[0]); + //mmtmpD0 = [Re(ch[0])Re(rx[0]) Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1]) Im(ch[1])Im(ch[1])] + mmtmpD1 = vmull_s16(dl_ch128[1], rxdataF128[1]); + //mmtmpD1 = [Re(ch[2])Re(rx[2]) Im(ch[2])Im(ch[2]) Re(ch[3])Re(rx[3]) Im(ch[3])Im(ch[3])] + mmtmpD0 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0),vget_high_s32(mmtmpD0)), + vpadd_s32(vget_low_s32(mmtmpD1),vget_high_s32(mmtmpD1))); + //mmtmpD0 = [Re(ch[0])Re(rx[0])+Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1])+Im(ch[1])Im(ch[1]) Re(ch[2])Re(rx[2])+Im(ch[2])Im(ch[2]) Re(ch[3])Re(rx[3])+Im(ch[3])Im(ch[3])] + + mmtmpD0b = vmull_s16(vrev32_s16(vmul_s16(dl_ch128[0],*(int16x4_t*)conj)), rxdataF128[0]); + //mmtmpD0 = [-Im(ch[0])Re(rx[0]) Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1]) Re(ch[1])Im(rx[1])] + mmtmpD1b = vmull_s16(vrev32_s16(vmul_s16(dl_ch128[1],*(int16x4_t*)conj)), rxdataF128[1]); + //mmtmpD0 = [-Im(ch[2])Re(rx[2]) Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3]) Re(ch[3])Im(rx[3])] + mmtmpD1 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0b),vget_high_s32(mmtmpD0b)), + vpadd_s32(vget_low_s32(mmtmpD1b),vget_high_s32(mmtmpD1b))); + //mmtmpD1 = [-Im(ch[0])Re(rx[0])+Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1])+Re(ch[1])Im(rx[1]) -Im(ch[2])Re(rx[2])+Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3])+Re(ch[3])Im(rx[3])] + + mmtmpD0 = vqshlq_s32(mmtmpD0,output_shift128); + mmtmpD1 = vqshlq_s32(mmtmpD1,output_shift128); + rxdataF_comp128[0] = vzip_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1)); + mmtmpD0 = vmull_s16(dl_ch128[2], rxdataF128[2]); + mmtmpD1 = vmull_s16(dl_ch128[3], rxdataF128[3]); + mmtmpD0 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0),vget_high_s32(mmtmpD0)), + vpadd_s32(vget_low_s32(mmtmpD1),vget_high_s32(mmtmpD1))); + mmtmpD0b = vmull_s16(vrev32_s16(vmul_s16(dl_ch128[2],*(int16x4_t*)conj)), rxdataF128[2]); + mmtmpD1b = vmull_s16(vrev32_s16(vmul_s16(dl_ch128[3],*(int16x4_t*)conj)), rxdataF128[3]); + mmtmpD1 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0b),vget_high_s32(mmtmpD0b)), + vpadd_s32(vget_low_s32(mmtmpD1b),vget_high_s32(mmtmpD1b))); + mmtmpD0 = vqshlq_s32(mmtmpD0,output_shift128); + mmtmpD1 = vqshlq_s32(mmtmpD1,output_shift128); + rxdataF_comp128[1] = vzip_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1)); + + if (pilots==0) { + mmtmpD0 = vmull_s16(dl_ch128[4], rxdataF128[4]); + mmtmpD1 = vmull_s16(dl_ch128[5], rxdataF128[5]); + mmtmpD0 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0),vget_high_s32(mmtmpD0)), + vpadd_s32(vget_low_s32(mmtmpD1),vget_high_s32(mmtmpD1))); + + mmtmpD0b = vmull_s16(vrev32_s16(vmul_s16(dl_ch128[4],*(int16x4_t*)conj)), rxdataF128[4]); + mmtmpD1b = vmull_s16(vrev32_s16(vmul_s16(dl_ch128[5],*(int16x4_t*)conj)), rxdataF128[5]); + mmtmpD1 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0b),vget_high_s32(mmtmpD0b)), + 
vpadd_s32(vget_low_s32(mmtmpD1b),vget_high_s32(mmtmpD1b))); + + + mmtmpD0 = vqshlq_s32(mmtmpD0,output_shift128); + mmtmpD1 = vqshlq_s32(mmtmpD1,output_shift128); + rxdataF_comp128[2] = vzip_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1)); + + + dl_ch128+=6; + dl_ch_mag128+=3; + dl_ch_mag128b+=3; + rxdataF128+=6; + rxdataF_comp128+=3; + + } else { // we have a smaller PDSCH in symbols with pilots so skip last group of 4 REs and increment less + dl_ch128+=4; + dl_ch_mag128+=2; + dl_ch_mag128b+=2; + rxdataF128+=4; + rxdataF_comp128+=2; + } } } } - + if (rho) { for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) { - rho128 = (int16x8_t*)&rho[aarx][symbol*frame_parms->N_RB_DL*12]; + rho128 = (int16x4x2_t*)&rho[aarx][symbol*frame_parms->N_RB_DL*12]; dl_ch128 = (int16x4_t*)&dl_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*12]; dl_ch128_2 = (int16x4_t*)&dl_ch_estimates_ext[2+aarx][symbol*frame_parms->N_RB_DL*12]; - for (rb=0; rb<nb_rb; rb++) { - - mmtmpD0 = vmull_s16(dl_ch128[0], dl_ch128_2[0]); - mmtmpD1 = vmull_s16(dl_ch128[1], dl_ch128_2[1]); - mmtmpD0 = vpadd_s32(mmtmpD0,mmtmpD1); - mmtmpD0 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch128[0],*(int16x4_t*)conj)), dl_ch128_2[0]); - mmtmpD1 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch128[1],*(int16x4_t*)conj)), dl_ch128_2[1]); - mmtmpD1 = vpadd_s32(mmtmpD0,mmtmpD1); - - mmtmpD0 = vqshlq_s32(mmtmpD0,-output_shift); - mmtmpD1 = vqshlq_s32(mmtmpD1,-output_shift); - rho128[0] = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - - mmtmpD0 = vmull_s16(dl_ch128[2], dl_ch128_2[2]); - mmtmpD1 = vmull_s16(dl_ch128[3], dl_ch128_2[3]); - mmtmpD0 = vpadd_s32(mmtmpD0,mmtmpD1); - mmtmpD0 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch128[2],*(int16x4_t*)conj)), dl_ch128_2[2]); - mmtmpD1 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch128[3],*(int16x4_t*)conj)), dl_ch128_2[3]); - mmtmpD1 = vpadd_s32(mmtmpD0,mmtmpD1); - - mmtmpD0 = vqshlq_s32(mmtmpD0,-output_shift); - mmtmpD1 = vqshlq_s32(mmtmpD1,-output_shift); - rho128[1] = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - - mmtmpD0 = vmull_s16(dl_ch128[0], dl_ch128_2[0]); - mmtmpD1 = vmull_s16(dl_ch128[1], dl_ch128_2[1]); - mmtmpD0 = vpadd_s32(mmtmpD0,mmtmpD1); - mmtmpD0 = vrev32q_s16(vmul_s16(dl_ch128[4],*(int16x4_t*)conj), dl_ch128_2[4]); - mmtmpD1 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch128[5],*(int16x4_t*)conj)), dl_ch128_2[5]); - mmtmpD1 = vpadd_s32(mmtmpD0,mmtmpD1); - - mmtmpD0 = vqshlq_s32(mmtmpD0,-output_shift); - mmtmpD1 = vqshlq_s32(mmtmpD1,-output_shift); - rho128[2] = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - - - dl_ch128+=6; - dl_ch128_2+=6; - rho128+=3; + mmtmpD0 = vmull_s16(dl_ch128[0], dl_ch128_2[0]); + mmtmpD1 = vmull_s16(dl_ch128[1], dl_ch128_2[1]); + mmtmpD0 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0),vget_high_s32(mmtmpD0)), + vpadd_s32(vget_low_s32(mmtmpD1),vget_high_s32(mmtmpD1))); + mmtmpD0b = vmull_s16(vrev32_s16(vmul_s16(dl_ch128[0],*(int16x4_t*)conj)), dl_ch128_2[0]); + mmtmpD1b = vmull_s16(vrev32_s16(vmul_s16(dl_ch128[1],*(int16x4_t*)conj)), dl_ch128_2[1]); + mmtmpD1 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0b),vget_high_s32(mmtmpD0b)), + vpadd_s32(vget_low_s32(mmtmpD1b),vget_high_s32(mmtmpD1b))); + + mmtmpD0 = vqshlq_s32(mmtmpD0,output_shift128); + mmtmpD1 = vqshlq_s32(mmtmpD1,output_shift128); + rho128[0] = vzip_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1)); + + mmtmpD0 = vmull_s16(dl_ch128[2], dl_ch128_2[2]); + mmtmpD1 = vmull_s16(dl_ch128[3], dl_ch128_2[3]); + mmtmpD0 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0),vget_high_s32(mmtmpD0)), + 
vpadd_s32(vget_low_s32(mmtmpD1),vget_high_s32(mmtmpD1)));
+          mmtmpD0b = vmull_s16(vrev32_s16(vmul_s16(dl_ch128[2],*(int16x4_t*)conj)), dl_ch128_2[2]);
+          mmtmpD1b = vmull_s16(vrev32_s16(vmul_s16(dl_ch128[3],*(int16x4_t*)conj)), dl_ch128_2[3]);
+          mmtmpD1 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0b),vget_high_s32(mmtmpD0b)),
+                                 vpadd_s32(vget_low_s32(mmtmpD1b),vget_high_s32(mmtmpD1b)));
+
+          mmtmpD0 = vqshlq_s32(mmtmpD0,output_shift128);
+          mmtmpD1 = vqshlq_s32(mmtmpD1,output_shift128);
+          rho128[1] = vzip_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1));
+
+          mmtmpD0 = vmull_s16(dl_ch128[4], dl_ch128_2[4]);
+          mmtmpD1 = vmull_s16(dl_ch128[5], dl_ch128_2[5]);
+          mmtmpD0 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0),vget_high_s32(mmtmpD0)),
+                                 vpadd_s32(vget_low_s32(mmtmpD1),vget_high_s32(mmtmpD1)));
+          mmtmpD0b = vmull_s16(vrev32_s16(vmul_s16(dl_ch128[4],*(int16x4_t*)conj)), dl_ch128_2[4]);
+          mmtmpD1b = vmull_s16(vrev32_s16(vmul_s16(dl_ch128[5],*(int16x4_t*)conj)), dl_ch128_2[5]);
+          mmtmpD1 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0b),vget_high_s32(mmtmpD0b)),
+                                 vpadd_s32(vget_low_s32(mmtmpD1b),vget_high_s32(mmtmpD1b)));
+
+          mmtmpD0 = vqshlq_s32(mmtmpD0,output_shift128);
+          mmtmpD1 = vqshlq_s32(mmtmpD1,output_shift128);
+          rho128[2] = vzip_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1));
+
+
+          dl_ch128+=6;
+          dl_ch128_2+=6;
+          rho128+=3;
       }
-
+
       if (first_symbol_flag==1) {
-        phy_measurements->rx_correlation[0][aarx] = signal_energy(&rho[aarx][symbol*frame_parms->N_RB_DL*12],rb*12);
+        phy_measurements->rx_correlation[0][aarx] = signal_energy(&rho[aarx][symbol*frame_parms->N_RB_DL*12],rb*12);
       }
     }
   }
-
 #endif
 }
@@ -1312,37 +1315,39 @@ void prec2A_TM56_128(unsigned char pmi,__m128i *ch0,__m128i *ch1)
 }
 
 #elif defined(__arm__)
-void prec2A_TM56_128(unsigned char pmi,int16x8_t* ch0,int16x8_t* ch1)
-{
+void prec2A_TM56_128(unsigned char pmi,int16x8_t* ch0,int16x8_t* ch1) {
+  int16x8_t amp;
+  int16_t conj[8]__attribute__((aligned(16))) = {1,-1,1,-1,1,-1,1,-1};
+  amp = vmovq_n_s16(ONE_OVER_SQRT2_Q15);
 
   switch (pmi) {
 
   case 0 :   // +1 +1
//    print_shorts("phase 0 :ch0",ch0);
//    print_shorts("phase 0 :ch1",ch1);
-    ch0[0] = vqadd_s16(ch0[0],ch1[0]);
+    ch0[0] = vqaddq_s16(ch0[0],ch1[0]);
     break;
-
+
   case 1 :   // +1 -1
//    print_shorts("phase 1 :ch0",ch0);
//    print_shorts("phase 1 :ch1",ch1);
-    ch0[0] = vqsub_s16(ch0[0],ch1[0]);
+    ch0[0] = vqsubq_s16(ch0[0],ch1[0]);
//    print_shorts("phase 1 :ch0-ch1",ch0);
     break;
-
+
   case 2 :   // +1 +j
-    ch1[0] = vrev32q_s16(vmul_s16(ch1[0],*(int16x4_t*)conj));
-    ch0[0] = vqsub_s16(ch0[0],ch1[0]);
+    ch1[0] = vrev32q_s16(vmulq_s16(ch1[0],*(int16x8_t*)conj));
+    ch0[0] = vqsubq_s16(ch0[0],ch1[0]);
     break;   // +1 -j
-
+
   case 3 :
-    ch1[0] = vrev32q_s16(vmul_s16(ch1[0],*(int16x4_t*)conj));
-    ch0[0] = vqadd_s16(ch0[0],ch1[0]);
+    ch1[0] = vrev32q_s16(vmulq_s16(ch1[0],*(int16x8_t*)conj));
+    ch0[0] = vqaddq_s16(ch0[0],ch1[0]);
     break;
   }
-
-  ch0[0] = vmulhq_s16(ch0[0],amp);
+
+  ch0[0] = vqdmulhq_s16(ch0[0],amp);
 }
 
 #endif
@@ -1435,25 +1440,26 @@ void prec2A_TM4_128(int pmi,__m128i *ch0,__m128i *ch1)
 
 #elif defined(__arm__)
 
-void prec2A_TM4_128(int pmi,__m128i *ch0,__m128i *ch1)
+void prec2A_TM4_128(int pmi,int16x8_t *ch0,int16x8_t *ch1)
 {
-  int16x6_t amp;
+  int16x8_t amp;
   int16x8_t tmp0,tmp1;
+  int16_t conj[8]__attribute__((aligned(16))) = {1,-1,1,-1,1,-1,1,-1};
 
-  amp = = vmovq_n_s16(ONE_OVER_SQRT2_Q15);
+  amp = vmovq_n_s16(ONE_OVER_SQRT2_Q15);
 
   if (pmi == 0) {
-    ch0[0] = vqadd_s16(ch0[0],ch1[0]);
-    ch1[0] = vqsub_s16(ch0[0],ch1[0]);
+    ch0[0] = vqaddq_s16(ch0[0],ch1[0]);
+    ch1[0] = vqsubq_s16(ch0[0],ch1[0]);
   } else {
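+    // pmi!=0: form tmp1 = j*ch1 below. The {1,-1,...} mask negates the
+    // imaginary parts and vrev32q_s16 then swaps each re/im pair within its
+    // 32-bit lane, which multiplies each complex sample by j; the following
+    // sum and difference apply the +j/-j precoding weights.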
tmp0 = ch0[0]; - tmp1 = vrev32q_s16(vmul_s16(ch1[0],*(int16x4_t*)conj)); - ch0[0] = vqadd_s16(tmp0,tmp1); - ch1[0] = vqsub_s16(tmp0,tmp1); + tmp1 = vrev32q_s16(vmulq_s16(ch1[0],*(int16x8_t*)conj)); + ch0[0] = vqaddq_s16(tmp0,tmp1); + ch1[0] = vqsubq_s16(tmp0,tmp1); } - ch0[0] = vmulhq_s16(ch0[0],amp); - ch1[0] = vmulhq_s16(ch1[0],amp); + ch0[0] = vqdmulhq_s16(ch0[0],amp); + ch1[0] = vqdmulhq_s16(ch1[0],amp); } #endif @@ -1478,7 +1484,7 @@ void dlsch_channel_compensation_TM56(int **rxdataF_ext, unsigned short rb,Nre; __m128i *dl_ch0_128,*dl_ch1_128,*dl_ch_mag128,*dl_ch_mag128b,*rxdataF128,*rxdataF_comp128; unsigned char aarx=0,symbol_mod,pilots=0; - int precoded_signal_strength=0,rx_power_correction; + int precoded_signal_strength=0; __m128i mmtmpD0,mmtmpD1,mmtmpD2,mmtmpD3,QAM_amp128,QAM_amp128b; symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? symbol-(7-frame_parms->Ncp) : symbol; @@ -1486,7 +1492,6 @@ void dlsch_channel_compensation_TM56(int **rxdataF_ext, if ((symbol_mod == 0) || (symbol_mod == (4-frame_parms->Ncp))) pilots=1; - rx_power_correction = 1; //printf("comp prec: symbol %d, pilots %d\n",symbol, pilots); @@ -1652,7 +1657,7 @@ void dlsch_channel_compensation_TM56(int **rxdataF_ext, Nre = (pilots==0) ? 12 : 8; precoded_signal_strength += ((signal_energy_nodc(&dl_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*Nre], - (nb_rb*Nre))*rx_power_correction) - (phy_measurements->n0_power[aarx])); + (nb_rb*Nre))) - (phy_measurements->n0_power[aarx])); } // rx_antennas phy_measurements->precoded_cqi_dB[eNB_id][0] = dB_fixed2(precoded_signal_strength,phy_measurements->n0_power_tot); @@ -1665,168 +1670,176 @@ void dlsch_channel_compensation_TM56(int **rxdataF_ext, #elif defined(__arm__) - unsigned short rb; - unsigned char aatx,aarx,symbol_mod,pilots=0; - - int16x4_t *dl_ch128,*dl_ch128_2,*rxdataF128,*rho128; - int32x4_t mmtmpD0,mmtmpD1; - int16x8_t *dl_ch_mag128,*dl_ch_mag128b,mmtmpD2,mmtmpD3,*rxdataF_comp128; - int16x4_t QAM_amp128,QAM_amp128b; - + uint32_t rb,Nre; + uint32_t aarx,symbol_mod,pilots=0; + + int16x4_t *dl_ch0_128,*dl_ch1_128,*rxdataF128; + int16x8_t *dl_ch0_128b,*dl_ch1_128b; + int32x4_t mmtmpD0,mmtmpD1,mmtmpD0b,mmtmpD1b; + int16x8_t *dl_ch_mag128,*dl_ch_mag128b,mmtmpD2,mmtmpD3,mmtmpD4,*rxdataF_comp128; + int16x8_t QAM_amp128,QAM_amp128b; + int16_t conj[4]__attribute__((aligned(16))) = {1,-1,1,-1}; - - symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? symbol-(7-frame_parms->Ncp) : symbol; - + int32x4_t output_shift128 = vmovq_n_s32(-(int32_t)output_shift); + int32_t precoded_signal_strength=0; + + symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? 
symbol-(7-frame_parms->Ncp) : symbol; if ((symbol_mod == 0) || (symbol_mod == (4-frame_parms->Ncp))) { - if (frame_parms->mode1_flag==1) { // 10 out of 12 so don't reduce size - nb_rb=1+(5*nb_rb/6); - } - - else { - pilots=1; - } + if (frame_parms->mode1_flag==1) // 10 out of 12 so don't reduce size + { nb_rb=1+(5*nb_rb/6); } + + else + { pilots=1; } } - - + + if (mod_order == 4) { - QAM_amp128 = vmov_n_s16(QAM16_n1); // 2/sqrt(10) - QAM_amp128b = vmov_n_s16(0); - + QAM_amp128 = vmovq_n_s16(QAM16_n1); // 2/sqrt(10) + QAM_amp128b = vmovq_n_s16(0); + } else if (mod_order == 6) { - QAM_amp128 = vmov_n_s16(QAM64_n1); // - QAM_amp128b = vmov_n_s16(QAM64_n2); + QAM_amp128 = vmovq_n_s16(QAM64_n1); // + QAM_amp128b = vmovq_n_s16(QAM64_n2); } - + // printf("comp: rxdataF_comp %p, symbol %d\n",rxdataF_comp[0],symbol); - + for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) { - - - - dl_ch1_128 = (int16x4_t*)&dl_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*12]; - dl_ch2_128 = (int16x4_t*)&dl_ch_estimates_ext[2+aarx][symbol*frame_parms->N_RB_DL*12]; - dl_ch_mag128 = (int16x8_t*)&dl_ch_mag[aarx][symbol*frame_parms->N_RB_DL*12]; - dl_ch_mag128b = (int16x8_t*)&dl_ch_magb[aarx][symbol*frame_parms->N_RB_DL*12]; - rxdataF128 = (int16x4_t*)&rxdataF_ext[aarx][symbol*frame_parms->N_RB_DL*12]; - rxdataF_comp128 = (int16x8_t*)&rxdataF_comp[aarx][symbol*frame_parms->N_RB_DL*12]; - + + + + dl_ch0_128 = (int16x4_t*)&dl_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*12]; + dl_ch1_128 = (int16x4_t*)&dl_ch_estimates_ext[2+aarx][symbol*frame_parms->N_RB_DL*12]; + dl_ch0_128b = (int16x8_t*)&dl_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*12]; + dl_ch1_128b = (int16x8_t*)&dl_ch_estimates_ext[2+aarx][symbol*frame_parms->N_RB_DL*12]; + dl_ch_mag128 = (int16x8_t*)&dl_ch_mag[aarx][symbol*frame_parms->N_RB_DL*12]; + dl_ch_mag128b = (int16x8_t*)&dl_ch_magb[aarx][symbol*frame_parms->N_RB_DL*12]; + rxdataF128 = (int16x4_t*)&rxdataF_ext[aarx][symbol*frame_parms->N_RB_DL*12]; + rxdataF_comp128 = (int16x8_t*)&rxdataF_comp[aarx][symbol*frame_parms->N_RB_DL*12]; + for (rb=0; rb<nb_rb; rb++) { #ifdef DEBUG_DLSCH_DEMOD printf("mode 6 prec: rb %d, pmi->%d\n",rb,pmi_ext[rb]); #endif - prec2A_TM56_128(pmi_ext[rb],&dl_ch0_128[0],&dl_ch1_128[0]); - prec2A_TM56_128(pmi_ext[rb],&dl_ch0_128[1],&dl_ch1_128[1]); - + prec2A_TM56_128(pmi_ext[rb],&dl_ch0_128b[0],&dl_ch1_128b[0]); + prec2A_TM56_128(pmi_ext[rb],&dl_ch0_128b[1],&dl_ch1_128b[1]); + if (pilots==0) { - prec2A_TM56_128(pmi_ext[rb],&dl_ch0_128[2],&dl_ch1_128[2]); + prec2A_TM56_128(pmi_ext[rb],&dl_ch0_128b[2],&dl_ch1_128b[2]); } - + if (mod_order>2) { - // get channel amplitude if not QPSK - mmtmpD0 = vmull_s16(dl_ch128[0], dl_ch128[0]); - // mmtmpD0 = [ch0*ch0,ch1*ch1,ch2*ch2,ch3*ch3]; - mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),-output_shift); - // mmtmpD0 = [ch0*ch0 + ch1*ch1,ch0*ch0 + ch1*ch1,ch2*ch2 + ch3*ch3,ch2*ch2 + ch3*ch3]>>output_shift on 32-bits - mmtmpD1 = vmull_s16(dl_ch128[1], dl_ch128[1]); - mmtmpD1 = vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),-output_shift); - mmtmpD2 = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - // mmtmpD2 = [ch0*ch0 + ch1*ch1,ch0*ch0 + ch1*ch1,ch2*ch2 + ch3*ch3,ch2*ch2 + ch3*ch3,ch4*ch4 + ch5*ch5,ch4*ch4 + ch5*ch5,ch6*ch6 + ch7*ch7,ch6*ch6 + ch7*ch7]>>output_shift on 16-bits - mmtmpD0 = vmull_s16(dl_ch128[2], dl_ch128[2]); - mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),-output_shift); - mmtmpD1 = vmull_s16(dl_ch128[3], dl_ch128[3]); - mmtmpD1 = 
vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),-output_shift); - mmtmpD3 = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - - if (pilots==0) { - mmtmpD0 = vmull_s16(dl_ch128[4], dl_ch128[4]); - mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),-output_shift); - mmtmpD1 = vmull_s16(dl_ch128[5], dl_ch128[5]); - mmtmpD1 = vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),-output_shift); - mmtmpD4 = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - - - } - - dl_ch_mag128b[0] = vqdmulhq_s16(mmtmpD2,QAM_amp128b); - dl_ch_mag128b[1] = vqdmulhq_s16(mmtmpD3,QAM_amp128b); - dl_ch_mag128[0] = vqdmulhq_s16(mmtmpD2,QAM_amp128); - dl_ch_mag128[1] = vqdmulhq_s16(mmtmpD3,QAM_amp128); - - - if (pilots==0) { - dl_ch_mag128b[2] = vqdmulhq_s16(mmtmpD4,QAM_amp128b); - dl_ch_mag128[2] = vqdmulhq_s16(mmtmpD4,QAM_amp128); - } + // get channel amplitude if not QPSK + mmtmpD0 = vmull_s16(dl_ch0_128[0], dl_ch0_128[0]); + // mmtmpD0 = [ch0*ch0,ch1*ch1,ch2*ch2,ch3*ch3]; + mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),output_shift128); + // mmtmpD0 = [ch0*ch0 + ch1*ch1,ch0*ch0 + ch1*ch1,ch2*ch2 + ch3*ch3,ch2*ch2 + ch3*ch3]>>output_shift128 on 32-bits + mmtmpD1 = vmull_s16(dl_ch0_128[1], dl_ch0_128[1]); + mmtmpD1 = vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),output_shift128); + mmtmpD2 = vcombine_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1)); + // mmtmpD2 = [ch0*ch0 + ch1*ch1,ch0*ch0 + ch1*ch1,ch2*ch2 + ch3*ch3,ch2*ch2 + ch3*ch3,ch4*ch4 + ch5*ch5,ch4*ch4 + ch5*ch5,ch6*ch6 + ch7*ch7,ch6*ch6 + ch7*ch7]>>output_shift128 on 16-bits + mmtmpD0 = vmull_s16(dl_ch0_128[2], dl_ch0_128[2]); + mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),output_shift128); + mmtmpD1 = vmull_s16(dl_ch0_128[3], dl_ch0_128[3]); + mmtmpD1 = vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),output_shift128); + mmtmpD3 = vcombine_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1)); + if (pilots==0) { + mmtmpD0 = vmull_s16(dl_ch0_128[4], dl_ch0_128[4]); + mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),output_shift128); + mmtmpD1 = vmull_s16(dl_ch0_128[5], dl_ch0_128[5]); + mmtmpD1 = vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),output_shift128); + mmtmpD4 = vcombine_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1)); + + + } + + dl_ch_mag128b[0] = vqdmulhq_s16(mmtmpD2,QAM_amp128b); + dl_ch_mag128b[1] = vqdmulhq_s16(mmtmpD3,QAM_amp128b); + dl_ch_mag128[0] = vqdmulhq_s16(mmtmpD2,QAM_amp128); + dl_ch_mag128[1] = vqdmulhq_s16(mmtmpD3,QAM_amp128); + + + if (pilots==0) { + dl_ch_mag128b[2] = vqdmulhq_s16(mmtmpD4,QAM_amp128b); + dl_ch_mag128[2] = vqdmulhq_s16(mmtmpD4,QAM_amp128); + } } - - mmtmpD0 = vmull_s16(dl_ch128[0], rx_dataF128[0]); - //mmtmpD0 = [Re(ch[0])Re(rx[0]) Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1]) Im(ch[1])Im(ch[1])] - mmtmpD1 = vmull_s16(dl_ch128[1], rx_dataF128[1]); - //mmtmpD1 = [Re(ch[2])Re(rx[2]) Im(ch[2])Im(ch[2]) Re(ch[3])Re(rx[3]) Im(ch[3])Im(ch[3])] - mmtmpD0 = vpadd_s32(mmtmpD0,mmtmpD1); - //mmtmpD0 = [Re(ch[0])Re(rx[0])+Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1])+Im(ch[1])Im(ch[1]) Re(ch[2])Re(rx[2])+Im(ch[2])Im(ch[2]) Re(ch[3])Re(rx[3])+Im(ch[3])Im(ch[3])] - - mmtmpD0 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch128[0],*(int16x4_t*)conj)), rx_dataF128[0]); + mmtmpD0 = vmull_s16(dl_ch0_128[0], rxdataF128[0]); + //mmtmpD0 = [Re(ch[0])Re(rx[0]) Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1]) Im(ch[1])Im(ch[1])] + mmtmpD1 = vmull_s16(dl_ch0_128[1], rxdataF128[1]); + //mmtmpD1 = [Re(ch[2])Re(rx[2]) Im(ch[2])Im(ch[2]) Re(ch[3])Re(rx[3]) Im(ch[3])Im(ch[3])] + mmtmpD0 = 
vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0),vget_high_s32(mmtmpD0)), + vpadd_s32(vget_low_s32(mmtmpD1),vget_high_s32(mmtmpD1))); + //mmtmpD0 = [Re(ch[0])Re(rx[0])+Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1])+Im(ch[1])Im(ch[1]) Re(ch[2])Re(rx[2])+Im(ch[2])Im(ch[2]) Re(ch[3])Re(rx[3])+Im(ch[3])Im(ch[3])] + + mmtmpD0b = vmull_s16(vrev32_s16(vmul_s16(dl_ch0_128[0],*(int16x4_t*)conj)), rxdataF128[0]); //mmtmpD0 = [-Im(ch[0])Re(rx[0]) Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1]) Re(ch[1])Im(rx[1])] - mmtmpD1 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch128[1],*(int16x4_t*)conj)), rx_dataF128[1]); + mmtmpD1b = vmull_s16(vrev32_s16(vmul_s16(dl_ch0_128[1],*(int16x4_t*)conj)), rxdataF128[1]); //mmtmpD0 = [-Im(ch[2])Re(rx[2]) Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3]) Re(ch[3])Im(rx[3])] - mmtmpD1 = vpadd_s32(mmtmpD0,mmtmpD1); + mmtmpD1 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0b),vget_high_s32(mmtmpD0b)), + vpadd_s32(vget_low_s32(mmtmpD1b),vget_high_s32(mmtmpD1b))); //mmtmpD1 = [-Im(ch[0])Re(rx[0])+Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1])+Re(ch[1])Im(rx[1]) -Im(ch[2])Re(rx[2])+Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3])+Re(ch[3])Im(rx[3])] - - mmtmpD0 = vqshlq_s32(mmtmpD0,-output_shift); - mmtmpD1 = vqshlq_s32(mmtmpD1,-output_shift); - rxdataF_comp128[0] = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - - mmtmpD0 = vmull_s16(dl_ch128[2], rx_dataF128[2]); - mmtmpD1 = vmull_s16(dl_ch128[3], rx_dataF128[3]); - mmtmpD0 = vpadd_s32(mmtmpD0,mmtmpD1); - mmtmpD0 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch128[2],*(int16x4_t*)conj)), rx_dataF128[2]); - mmtmpD1 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch128[3],*(int16x4_t*)conj)), rx_dataF128[3]); - mmtmpD1 = vpadd_s32(mmtmpD0,mmtmpD1); - - mmtmpD0 = vqshlq_s32(mmtmpD0,-output_shift); - mmtmpD1 = vqshlq_s32(mmtmpD1,-output_shift); - rxdataF_comp128[1] = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - + + mmtmpD0 = vqshlq_s32(mmtmpD0,output_shift128); + mmtmpD1 = vqshlq_s32(mmtmpD1,output_shift128); + rxdataF_comp128[0] = vcombine_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1)); + + mmtmpD0 = vmull_s16(dl_ch0_128[2], rxdataF128[2]); + mmtmpD1 = vmull_s16(dl_ch0_128[3], rxdataF128[3]); + mmtmpD0 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0),vget_high_s32(mmtmpD0)), + vpadd_s32(vget_low_s32(mmtmpD1),vget_high_s32(mmtmpD1))); + + mmtmpD0b = vmull_s16(vrev32_s16(vmul_s16(dl_ch0_128[2],*(int16x4_t*)conj)), rxdataF128[2]); + mmtmpD1b = vmull_s16(vrev32_s16(vmul_s16(dl_ch0_128[3],*(int16x4_t*)conj)), rxdataF128[3]); + mmtmpD1 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0b),vget_high_s32(mmtmpD0b)), + vpadd_s32(vget_low_s32(mmtmpD1b),vget_high_s32(mmtmpD1b))); + + mmtmpD0 = vqshlq_s32(mmtmpD0,output_shift128); + mmtmpD1 = vqshlq_s32(mmtmpD1,output_shift128); + rxdataF_comp128[1] = vcombine_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1)); + if (pilots==0) { - mmtmpD0 = vmull_s16(dl_ch128[4], rx_dataF128[4]); - mmtmpD1 = vmull_s16(dl_ch128[5], rx_dataF128[5]); - mmtmpD0 = vpadd_s32(mmtmpD0,mmtmpD1); - mmtmpD0 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch128[4],*(int16x4_t*)conj)), rx_dataF128[4]); - mmtmpD1 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch128[5],*(int16x4_t*)conj)), rx_dataF128[5]); - mmtmpD1 = vpadd_s32(mmtmpD0,mmtmpD1); - - mmtmpD0 = vqshlq_s32(mmtmpD0,-output_shift); - mmtmpD1 = vqshlq_s32(mmtmpD1,-output_shift); - rxdataF_comp128[2] = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - - - dl_ch128+=6; - dl_ch_mag128+=3; - dl_ch_mag128b+=3; - rxdataF128+=6; - rxdataF_comp128+=3; - + mmtmpD0 = vmull_s16(dl_ch0_128[4], rxdataF128[4]); + mmtmpD1 = vmull_s16(dl_ch0_128[5], rxdataF128[5]); 
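+            // vmull_s16 widens each 16x16-bit product to 32 bits; the
+            // vpadd_s32 folding below then forms Re{conj(ch)*rx} per complex
+            // sample at full precision, and vqshlq_s32 with the negative
+            // output_shift128 scales the result back to 16-bit range.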
+ mmtmpD0 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0),vget_high_s32(mmtmpD0)), + vpadd_s32(vget_low_s32(mmtmpD1),vget_high_s32(mmtmpD1))); + + mmtmpD0b = vmull_s16(vrev32_s16(vmul_s16(dl_ch0_128[4],*(int16x4_t*)conj)), rxdataF128[4]); + mmtmpD1b = vmull_s16(vrev32_s16(vmul_s16(dl_ch0_128[5],*(int16x4_t*)conj)), rxdataF128[5]); + mmtmpD1 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0b),vget_high_s32(mmtmpD0b)), + vpadd_s32(vget_low_s32(mmtmpD1b),vget_high_s32(mmtmpD1b))); + + + mmtmpD0 = vqshlq_s32(mmtmpD0,output_shift128); + mmtmpD1 = vqshlq_s32(mmtmpD1,output_shift128); + rxdataF_comp128[2] = vcombine_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1)); + + + dl_ch0_128+=6; + dl_ch1_128+=6; + dl_ch_mag128+=3; + dl_ch_mag128b+=3; + rxdataF128+=6; + rxdataF_comp128+=3; + } else { // we have a smaller PDSCH in symbols with pilots so skip last group of 4 REs and increment less - dl_ch128+=4; - dl_ch_mag128+=2; - dl_ch_mag128b+=2; - rxdataF128+=4; - rxdataF_comp128+=2; + dl_ch0_128+=4; + dl_ch1_128+=4; + dl_ch_mag128+=2; + dl_ch_mag128b+=2; + rxdataF128+=4; + rxdataF_comp128+=2; } } - - - + Nre = (pilots==0) ? 12 : 8; - + precoded_signal_strength += ((signal_energy_nodc(&dl_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*Nre], - (nb_rb*Nre))*rx_power_correction) - (phy_measurements->n0_power[aarx])); + (nb_rb*Nre))) - (phy_measurements->n0_power[aarx])); // rx_antennas } - phy_measurements->precoded_cqi_dB[eNB_id][0] = dB_fixed2(precoded_signal_strength,phy_measurements->n0_power_tot); - + #endif } @@ -1847,7 +1860,7 @@ void dlsch_channel_compensation_TM3(LTE_DL_FRAME_PARMS *frame_parms, unsigned short rb,Nre; __m128i *dl_ch0_128,*dl_ch1_128,*dl_ch_mag0_128,*dl_ch_mag1_128,*dl_ch_mag0_128b,*dl_ch_mag1_128b,*rxdataF128,*rxdataF_comp0_128,*rxdataF_comp1_128; unsigned char aarx=0,symbol_mod,pilots=0; - int precoded_signal_strength0=0,precoded_signal_strength1=0,rx_power_correction; + int precoded_signal_strength0=0,precoded_signal_strength1=0; int **rxdataF_ext = lte_ue_pdsch_vars->rxdataF_ext; int **dl_ch_estimates_ext = lte_ue_pdsch_vars->dl_ch_estimates_ext; @@ -1865,7 +1878,6 @@ void dlsch_channel_compensation_TM3(LTE_DL_FRAME_PARMS *frame_parms, if ((symbol_mod == 0) || (symbol_mod == (4-frame_parms->Ncp))) pilots=1; - rx_power_correction = 1; //printf("comp prec: symbol %d, pilots %d\n",symbol, pilots); @@ -2051,7 +2063,7 @@ void dlsch_channel_compensation_TM3(LTE_DL_FRAME_PARMS *frame_parms, rxdataF_comp0_128[1] = _mm_packs_epi32(mmtmpD2,mmtmpD3); // print_shorts("rx:",rxdataF128+1); - // print_shorts("ch:",dl_ch128+1); + // print_shorts("ch:",dl_ch0_128+1); // print_shorts("pack:",rxdataF_comp128+1); if (pilots==0) { @@ -2070,7 +2082,7 @@ void dlsch_channel_compensation_TM3(LTE_DL_FRAME_PARMS *frame_parms, rxdataF_comp0_128[2] = _mm_packs_epi32(mmtmpD2,mmtmpD3); // print_shorts("rx:",rxdataF128+2); - // print_shorts("ch:",dl_ch128+2); + // print_shorts("ch:",dl_ch0_128+2); // print_shorts("pack:",rxdataF_comp128+2); } else { @@ -2162,15 +2174,15 @@ void dlsch_channel_compensation_TM3(LTE_DL_FRAME_PARMS *frame_parms, } } // rb loop + } + + Nre = (pilots==0) ? 12 : 8; - Nre = (pilots==0) ? 
12 : 8; - - precoded_signal_strength0 += ((signal_energy_nodc(&dl_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*Nre], - (nb_rb*Nre))*rx_power_correction) - (phy_measurements->n0_power[aarx])); + precoded_signal_strength0 += ((signal_energy_nodc(&dl_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*Nre], + (nb_rb*Nre))) - (phy_measurements->n0_power[aarx])); - precoded_signal_strength1 += ((signal_energy_nodc(&dl_ch_estimates_ext[aarx+2][symbol*frame_parms->N_RB_DL*Nre], - (nb_rb*Nre))*rx_power_correction) - (phy_measurements->n0_power[aarx])); - } // rx_antennas + precoded_signal_strength1 += ((signal_energy_nodc(&dl_ch_estimates_ext[aarx+2][symbol*frame_parms->N_RB_DL*Nre], + (nb_rb*Nre))) - (phy_measurements->n0_power[aarx])); phy_measurements->precoded_cqi_dB[eNB_id][0] = dB_fixed2(precoded_signal_strength0,phy_measurements->n0_power_tot); phy_measurements->precoded_cqi_dB[eNB_id][1] = dB_fixed2(precoded_signal_strength1,phy_measurements->n0_power_tot); @@ -2183,14 +2195,18 @@ void dlsch_channel_compensation_TM3(LTE_DL_FRAME_PARMS *frame_parms, #elif defined(__arm__) - unsigned short rb; - unsigned char aatx,aarx,symbol_mod,pilots=0; - - int16x4_t *dl_ch128,*dl_ch128_2,*rxdataF128; - int32x4_t mmtmpD0,mmtmpD1; - int16x8_t *dl_ch_mag0_128,*dl_ch_mag1_128b,mmtmpD2,mmtmpD3,*rxdataF_comp0_128,*rxdataF_comp1_128; - int16x4_t QAM_amp0_128,QAM_amp1_128b; - + unsigned short rb,Nre; + unsigned char aarx,symbol_mod,pilots=0; + int precoded_signal_strength0=0,precoded_signal_strength1=0; + int16x4_t *dl_ch0_128,*rxdataF128; + int16x4_t *dl_ch1_128; + int16x8_t *dl_ch0_128b,*dl_ch1_128b; + + int32x4_t mmtmpD0,mmtmpD1,mmtmpD0b,mmtmpD1b; + int16x8_t *dl_ch_mag0_128,*dl_ch_mag0_128b,*dl_ch_mag1_128,*dl_ch_mag1_128b,mmtmpD2,mmtmpD3,mmtmpD4,*rxdataF_comp0_128,*rxdataF_comp1_128; + int16x8_t QAM_amp0_128,QAM_amp0_128b,QAM_amp1_128,QAM_amp1_128b; + int32x4_t output_shift128 = vmovq_n_s32(-(int32_t)output_shift); + int **rxdataF_ext = lte_ue_pdsch_vars->rxdataF_ext; int **dl_ch_estimates_ext = lte_ue_pdsch_vars->dl_ch_estimates_ext; int **dl_ch_mag0 = lte_ue_pdsch_vars->dl_ch_mag0; @@ -2199,255 +2215,258 @@ void dlsch_channel_compensation_TM3(LTE_DL_FRAME_PARMS *frame_parms, int **dl_ch_magb1 = lte_ue_pdsch_vars->dl_ch_magb1; int **rxdataF_comp0 = lte_ue_pdsch_vars->rxdataF_comp0; int **rxdataF_comp1 = lte_ue_pdsch_vars->rxdataF_comp1[round]; //? - + int16_t conj[4]__attribute__((aligned(16))) = {1,-1,1,-1}; - + symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? 
symbol-(7-frame_parms->Ncp) : symbol; - + if ((symbol_mod == 0) || (symbol_mod == (4-frame_parms->Ncp))) { - if (frame_parms->mode1_flag==1) { // 10 out of 12 so don't reduce size - nb_rb=1+(5*nb_rb/6); - } - - else { - pilots=1; - } + if (frame_parms->mode1_flag==1) // 10 out of 12 so don't reduce size + { nb_rb=1+(5*nb_rb/6); } + + else + { pilots=1; } } - - + + if (mod_order0 == 4) { - QAM_amp0_128 = vmov_n_s16(QAM16_n1); // 2/sqrt(10) - QAM_amp0_128b = vmov_n_s16(0); - + QAM_amp0_128 = vmovq_n_s16(QAM16_n1); // 2/sqrt(10) + QAM_amp0_128b = vmovq_n_s16(0); + } else if (mod_order0 == 6) { - QAM_amp0_128 = vmov_n_s16(QAM64_n1); // - QAM_amp0_128b = vmov_n_s16(QAM64_n2); + QAM_amp0_128 = vmovq_n_s16(QAM64_n1); // + QAM_amp0_128b = vmovq_n_s16(QAM64_n2); } - + if (mod_order1 == 4) { - QAM_amp1_128 = vmov_n_s16(QAM16_n1); // 2/sqrt(10) - QAM_amp1_128b = vmov_n_s16(0); - + QAM_amp1_128 = vmovq_n_s16(QAM16_n1); // 2/sqrt(10) + QAM_amp1_128b = vmovq_n_s16(0); + } else if (mod_order1 == 6) { - QAM_amp1_128 = vmov_n_s16(QAM64_n1); // - QAM_amp1_128b = vmov_n_s16(QAM64_n2); + QAM_amp1_128 = vmovq_n_s16(QAM64_n1); // + QAM_amp1_128b = vmovq_n_s16(QAM64_n2); } - + // printf("comp: rxdataF_comp %p, symbol %d\n",rxdataF_comp[0],symbol); - + for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) { - - - - dl_ch1_128 = (int16x4_t*)&dl_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*12]; - dl_ch2_128 = (int16x4_t*)&dl_ch_estimates_ext[2+aarx][symbol*frame_parms->N_RB_DL*12]; + + + + dl_ch0_128 = (int16x4_t*)&dl_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*12]; + dl_ch1_128 = (int16x4_t*)&dl_ch_estimates_ext[2+aarx][symbol*frame_parms->N_RB_DL*12]; + dl_ch0_128b = (int16x8_t*)&dl_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*12]; + dl_ch1_128b = (int16x8_t*)&dl_ch_estimates_ext[2+aarx][symbol*frame_parms->N_RB_DL*12]; dl_ch_mag0_128 = (int16x8_t*)&dl_ch_mag0[aarx][symbol*frame_parms->N_RB_DL*12]; - dl_ch_mag0_128b = (int16x8_t*)&dl_ch_mag0b[aarx][symbol*frame_parms->N_RB_DL*12]; + dl_ch_mag0_128b = (int16x8_t*)&dl_ch_magb0[aarx][symbol*frame_parms->N_RB_DL*12]; dl_ch_mag1_128 = (int16x8_t*)&dl_ch_mag1[aarx][symbol*frame_parms->N_RB_DL*12]; - dl_ch_mag1_128b = (int16x8_t*)&dl_ch_mag1b[aarx][symbol*frame_parms->N_RB_DL*12]; + dl_ch_mag1_128b = (int16x8_t*)&dl_ch_magb1[aarx][symbol*frame_parms->N_RB_DL*12]; rxdataF128 = (int16x4_t*)&rxdataF_ext[aarx][symbol*frame_parms->N_RB_DL*12]; rxdataF_comp0_128 = (int16x8_t*)&rxdataF_comp0[aarx][symbol*frame_parms->N_RB_DL*12]; rxdataF_comp1_128 = (int16x8_t*)&rxdataF_comp1[aarx][symbol*frame_parms->N_RB_DL*12]; - + for (rb=0; rb<nb_rb; rb++) { #ifdef DEBUG_DLSCH_DEMOD printf("mode 6 prec: rb %d, pmi->%d\n",rb,pmi_ext[rb]); #endif - prec2A_TM3_128(&dl_ch0_128[0],&dl_ch1_128[0]); - prec2A_TM3_128(&dl_ch0_128[1],&dl_ch1_128[1]); - + prec2A_TM3_128(&dl_ch0_128b[0],&dl_ch1_128b[0]); + prec2A_TM3_128(&dl_ch0_128b[1],&dl_ch1_128b[1]); + if (pilots==0) { - prec2A_TM3_128(&dl_ch0_128[2],&dl_ch1_128[2]); + prec2A_TM3_128(&dl_ch0_128b[2],&dl_ch1_128b[2]); } - + if (mod_order0>2) { - // get channel amplitude if not QPSK - mmtmpD0 = vmull_s16(dl_ch0_128[0], dl_ch0_128[0]); - // mmtmpD0 = [ch0*ch0,ch1*ch1,ch2*ch2,ch3*ch3]; - mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),-output_shift); - // mmtmpD0 = [ch0*ch0 + ch1*ch1,ch0*ch0 + ch1*ch1,ch2*ch2 + ch3*ch3,ch2*ch2 + ch3*ch3]>>output_shift on 32-bits - mmtmpD1 = vmull_s16(dl_ch0_128[1], dl_ch0_128[1]); - mmtmpD1 = vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),-output_shift); - mmtmpD2 = 
vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - // mmtmpD2 = [ch0*ch0 + ch1*ch1,ch0*ch0 + ch1*ch1,ch2*ch2 + ch3*ch3,ch2*ch2 + ch3*ch3,ch4*ch4 + ch5*ch5,ch4*ch4 + ch5*ch5,ch6*ch6 + ch7*ch7,ch6*ch6 + ch7*ch7]>>output_shift on 16-bits - mmtmpD0 = vmull_s16(dl_ch0_128[2], dl_ch0_128[2]); - mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),-output_shift); - mmtmpD1 = vmull_s16(dl_ch0_128[3], dl_ch0_128[3]); - mmtmpD1 = vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),-output_shift); - mmtmpD3 = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - - if (pilots==0) { - mmtmpD0 = vmull_s16(dl_ch0_128[4], dl_ch0_128[4]); - mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),-output_shift); - mmtmpD1 = vmull_s16(dl_ch0_128[5], dl_ch0_128[5]); - mmtmpD1 = vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),-output_shift); - mmtmpD4 = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - - - } - - dl_ch_mag0_128b[0] = vqdmulhq_s16(mmtmpD2,QAM_amp0_128b); - dl_ch_mag0_128b[1] = vqdmulhq_s16(mmtmpD3,QAM_amp0_128b); - dl_ch_mag0_128[0] = vqdmulhq_s16(mmtmpD2,QAM_amp0_128); - dl_ch_mag0_128[1] = vqdmulhq_s16(mmtmpD3,QAM_amp0_128); - - - if (pilots==0) { - dl_ch_mag0_128b[2] = vqdmulhq_s16(mmtmpD4,QAM_amp0_128b); - dl_ch_mag0_128[2] = vqdmulhq_s16(mmtmpD4,QAM_amp0_128); - } + // get channel amplitude if not QPSK + mmtmpD0 = vmull_s16(dl_ch0_128[0], dl_ch0_128[0]); + // mmtmpD0 = [ch0*ch0,ch1*ch1,ch2*ch2,ch3*ch3]; + mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),output_shift128); + // mmtmpD0 = [ch0*ch0 + ch1*ch1,ch0*ch0 + ch1*ch1,ch2*ch2 + ch3*ch3,ch2*ch2 + ch3*ch3]>>output_shift128 on 32-bits + mmtmpD1 = vmull_s16(dl_ch0_128[1], dl_ch0_128[1]); + mmtmpD1 = vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),output_shift128); + mmtmpD2 = vcombine_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1)); + // mmtmpD2 = [ch0*ch0 + ch1*ch1,ch0*ch0 + ch1*ch1,ch2*ch2 + ch3*ch3,ch2*ch2 + ch3*ch3,ch4*ch4 + ch5*ch5,ch4*ch4 + ch5*ch5,ch6*ch6 + ch7*ch7,ch6*ch6 + ch7*ch7]>>output_shift128 on 16-bits + mmtmpD0 = vmull_s16(dl_ch0_128[2], dl_ch0_128[2]); + mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),output_shift128); + mmtmpD1 = vmull_s16(dl_ch0_128[3], dl_ch0_128[3]); + mmtmpD1 = vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),output_shift128); + mmtmpD3 = vcombine_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1)); + + if (pilots==0) { + mmtmpD0 = vmull_s16(dl_ch0_128[4], dl_ch0_128[4]); + mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),output_shift128); + mmtmpD1 = vmull_s16(dl_ch0_128[5], dl_ch0_128[5]); + mmtmpD1 = vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),output_shift128); + mmtmpD4 = vcombine_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1)); + + + } + + dl_ch_mag0_128b[0] = vqdmulhq_s16(mmtmpD2,QAM_amp0_128b); + dl_ch_mag0_128b[1] = vqdmulhq_s16(mmtmpD3,QAM_amp0_128b); + dl_ch_mag0_128[0] = vqdmulhq_s16(mmtmpD2,QAM_amp0_128); + dl_ch_mag0_128[1] = vqdmulhq_s16(mmtmpD3,QAM_amp0_128); + + + if (pilots==0) { + dl_ch_mag0_128b[2] = vqdmulhq_s16(mmtmpD4,QAM_amp0_128b); + dl_ch_mag0_128[2] = vqdmulhq_s16(mmtmpD4,QAM_amp0_128); + } } if (mod_order1>2) { - // get channel amplitude if not QPSK - mmtmpD0 = vmull_s16(dl_ch1_128[0], dl_ch1_128[0]); - // mmtmpD0 = [ch0*ch0,ch1*ch1,ch2*ch2,ch3*ch3]; - mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),-output_shift); - // mmtmpD0 = [ch0*ch0 + ch1*ch1,ch0*ch0 + ch1*ch1,ch2*ch2 + ch3*ch3,ch2*ch2 + ch3*ch3]>>output_shift on 32-bits - mmtmpD1 = vmull_s16(dl_ch1_128[1], dl_ch1_128[1]); - mmtmpD1 = 
vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),-output_shift); - mmtmpD2 = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - // mmtmpD2 = [ch0*ch0 + ch1*ch1,ch0*ch0 + ch1*ch1,ch2*ch2 + ch3*ch3,ch2*ch2 + ch3*ch3,ch4*ch4 + ch5*ch5,ch4*ch4 + ch5*ch5,ch6*ch6 + ch7*ch7,ch6*ch6 + ch7*ch7]>>output_shift on 16-bits - mmtmpD0 = vmull_s16(dl_ch1_128[2], dl_ch1_128[2]); - mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),-output_shift); - mmtmpD1 = vmull_s16(dl_ch1_128[3], dl_ch1_128[3]); - mmtmpD1 = vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),-output_shift); - mmtmpD3 = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - - if (pilots==0) { - mmtmpD0 = vmull_s16(dl_ch1_128[4], dl_ch1_128[4]); - mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),-output_shift); - mmtmpD1 = vmull_s16(dl_ch1_128[5], dl_ch1_128[5]); - mmtmpD1 = vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),-output_shift); - mmtmpD4 = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - - - } - - dl_ch_mag1_128b[0] = vqdmulhq_s16(mmtmpD2,QAM_amp1_128b); - dl_ch_mag1_128b[1] = vqdmulhq_s16(mmtmpD3,QAM_amp1_128b); - dl_ch_mag1_128[0] = vqdmulhq_s16(mmtmpD2,QAM_amp1_128); - dl_ch_mag1_128[1] = vqdmulhq_s16(mmtmpD3,QAM_amp1_128); - - - if (pilots==0) { - dl_ch_mag1_128b[2] = vqdmulhq_s16(mmtmpD4,QAM_amp1_128b); - dl_ch_mag1_128[2] = vqdmulhq_s16(mmtmpD4,QAM_amp1_128); - } + // get channel amplitude if not QPSK + mmtmpD0 = vmull_s16(dl_ch1_128[0], dl_ch1_128[0]); + // mmtmpD0 = [ch0*ch0,ch1*ch1,ch2*ch2,ch3*ch3]; + mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),output_shift128); + // mmtmpD0 = [ch0*ch0 + ch1*ch1,ch0*ch0 + ch1*ch1,ch2*ch2 + ch3*ch3,ch2*ch2 + ch3*ch3]>>output_shift128 on 32-bits + mmtmpD1 = vmull_s16(dl_ch1_128[1], dl_ch1_128[1]); + mmtmpD1 = vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),output_shift128); + mmtmpD2 = vcombine_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1)); + // mmtmpD2 = [ch0*ch0 + ch1*ch1,ch0*ch0 + ch1*ch1,ch2*ch2 + ch3*ch3,ch2*ch2 + ch3*ch3,ch4*ch4 + ch5*ch5,ch4*ch4 + ch5*ch5,ch6*ch6 + ch7*ch7,ch6*ch6 + ch7*ch7]>>output_shift128 on 16-bits + mmtmpD0 = vmull_s16(dl_ch1_128[2], dl_ch1_128[2]); + mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),output_shift128); + mmtmpD1 = vmull_s16(dl_ch1_128[3], dl_ch1_128[3]); + mmtmpD1 = vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),output_shift128); + mmtmpD3 = vcombine_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1)); + + if (pilots==0) { + mmtmpD0 = vmull_s16(dl_ch1_128[4], dl_ch1_128[4]); + mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),output_shift128); + mmtmpD1 = vmull_s16(dl_ch1_128[5], dl_ch1_128[5]); + mmtmpD1 = vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),output_shift128); + mmtmpD4 = vcombine_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1)); + + + } + + dl_ch_mag1_128b[0] = vqdmulhq_s16(mmtmpD2,QAM_amp1_128b); + dl_ch_mag1_128b[1] = vqdmulhq_s16(mmtmpD3,QAM_amp1_128b); + dl_ch_mag1_128[0] = vqdmulhq_s16(mmtmpD2,QAM_amp1_128); + dl_ch_mag1_128[1] = vqdmulhq_s16(mmtmpD3,QAM_amp1_128); + + + if (pilots==0) { + dl_ch_mag1_128b[2] = vqdmulhq_s16(mmtmpD4,QAM_amp1_128b); + dl_ch_mag1_128[2] = vqdmulhq_s16(mmtmpD4,QAM_amp1_128); + } } - - mmtmpD0 = vmull_s16(dl_ch0_128[0], rx_dataF128[0]); - //mmtmpD0 = [Re(ch[0])Re(rx[0]) Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1]) Im(ch[1])Im(ch[1])] - mmtmpD1 = vmull_s16(dl_ch0_128[1], rx_dataF128[1]); - //mmtmpD1 = [Re(ch[2])Re(rx[2]) Im(ch[2])Im(ch[2]) Re(ch[3])Re(rx[3]) Im(ch[3])Im(ch[3])] - mmtmpD0 = vpadd_s32(mmtmpD0,mmtmpD1); - 
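/* ---------------------------------------------------------------------------
 * Illustrative sketch (not part of the patch): the NEON idiom the surrounding
 * hunks apply per group of resource elements, shown as two self-contained
 * helpers. All names here are illustrative; neg_shift plays the role of
 * output_shift128, i.e. a vector of negative shift counts so that vqshlq_s32
 * performs a saturating right shift. Each int16x4_t holds two complex int16
 * samples laid out (re,im,re,im).
 */
#include <arm_neon.h>

/* Channel magnitude |h|^2 for four complex samples: vmull_s16 squares re/im
 * into 32 bits, vrev64q_s32 swaps the two words of each 64-bit pair so
 * vqaddq_s32 leaves re^2+im^2 in both lanes, and vmovn_s32 narrows back to
 * 16 bits (the mmtmpD2/mmtmpD3/mmtmpD4 computation in the hunks above). */
static inline int16x8_t ch_mag_sketch(int16x4_t ch01, int16x4_t ch23,
                                      int32x4_t neg_shift)
{
  int32x4_t sq0 = vmull_s16(ch01, ch01);
  sq0 = vqshlq_s32(vqaddq_s32(sq0, vrev64q_s32(sq0)), neg_shift);
  int32x4_t sq1 = vmull_s16(ch23, ch23);
  sq1 = vqshlq_s32(vqaddq_s32(sq1, vrev64q_s32(sq1)), neg_shift);
  return vcombine_s16(vmovn_s32(sq0), vmovn_s32(sq1));
}

/* Matched filter conj(h)*y for four complex samples. Real parts: plain
 * multiply then pairwise add. Imaginary parts: negate Im(h) with the
 * {1,-1,1,-1} 'conj' constant, swap re/im with vrev32_s16, multiply and
 * pairwise-add. The result packs the four real parts in the low half and
 * the four imaginary parts in the high half, matching the layout the hunks
 * below store into rxdataF_comp. */
static inline int16x8_t mf_conj_sketch(int16x4_t h01, int16x4_t h23,
                                       int16x4_t y01, int16x4_t y23,
                                       int16x4_t conj_c, int32x4_t neg_shift)
{
  int32x4_t re0 = vmull_s16(h01, y01);
  int32x4_t re1 = vmull_s16(h23, y23);
  int32x4_t re  = vcombine_s32(vpadd_s32(vget_low_s32(re0), vget_high_s32(re0)),
                               vpadd_s32(vget_low_s32(re1), vget_high_s32(re1)));
  int32x4_t im0 = vmull_s16(vrev32_s16(vmul_s16(h01, conj_c)), y01);
  int32x4_t im1 = vmull_s16(vrev32_s16(vmul_s16(h23, conj_c)), y23);
  int32x4_t im  = vcombine_s32(vpadd_s32(vget_low_s32(im0), vget_high_s32(im0)),
                               vpadd_s32(vget_low_s32(im1), vget_high_s32(im1)));
  return vcombine_s16(vmovn_s32(vqshlq_s32(re, neg_shift)),
                      vmovn_s32(vqshlq_s32(im, neg_shift)));
}
/* ------------------------------------------------------------------------ */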
//mmtmpD0 = [Re(ch[0])Re(rx[0])+Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1])+Im(ch[1])Im(ch[1]) Re(ch[2])Re(rx[2])+Im(ch[2])Im(ch[2]) Re(ch[3])Re(rx[3])+Im(ch[3])Im(ch[3])] - - mmtmpD0 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch0_128[0],*(int16x4_t*)conj)), rx_dataF128[0]); + + mmtmpD0 = vmull_s16(dl_ch0_128[0], rxdataF128[0]); + //mmtmpD0 = [Re(ch[0])Re(rx[0]) Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1]) Im(ch[1])Im(ch[1])] + mmtmpD1 = vmull_s16(dl_ch0_128[1], rxdataF128[1]); + //mmtmpD1 = [Re(ch[2])Re(rx[2]) Im(ch[2])Im(ch[2]) Re(ch[3])Re(rx[3]) Im(ch[3])Im(ch[3])] + mmtmpD0 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0),vget_high_s32(mmtmpD0)), + vpadd_s32(vget_low_s32(mmtmpD1),vget_high_s32(mmtmpD1))); + //mmtmpD0 = [Re(ch[0])Re(rx[0])+Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1])+Im(ch[1])Im(ch[1]) Re(ch[2])Re(rx[2])+Im(ch[2])Im(ch[2]) Re(ch[3])Re(rx[3])+Im(ch[3])Im(ch[3])] + + mmtmpD0b = vmull_s16(vrev32_s16(vmul_s16(dl_ch0_128[0],*(int16x4_t*)conj)), rxdataF128[0]); //mmtmpD0 = [-Im(ch[0])Re(rx[0]) Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1]) Re(ch[1])Im(rx[1])] - mmtmpD1 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch0_128[1],*(int16x4_t*)conj)), rx_dataF128[1]); + mmtmpD1b = vmull_s16(vrev32_s16(vmul_s16(dl_ch0_128[1],*(int16x4_t*)conj)), rxdataF128[1]); //mmtmpD0 = [-Im(ch[2])Re(rx[2]) Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3]) Re(ch[3])Im(rx[3])] - mmtmpD1 = vpadd_s32(mmtmpD0,mmtmpD1); + mmtmpD1 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0b),vget_high_s32(mmtmpD0b)), + vpadd_s32(vget_low_s32(mmtmpD1b),vget_high_s32(mmtmpD1b))); //mmtmpD1 = [-Im(ch[0])Re(rx[0])+Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1])+Re(ch[1])Im(rx[1]) -Im(ch[2])Re(rx[2])+Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3])+Re(ch[3])Im(rx[3])] - - mmtmpD0 = vqshlq_s32(mmtmpD0,-output_shift); - mmtmpD1 = vqshlq_s32(mmtmpD1,-output_shift); - rxdataF_comp0_128[0] = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - - mmtmpD0 = vmull_s16(dl_ch0_128[2], rx_dataF128[2]); - mmtmpD1 = vmull_s16(dl_ch0_128[3], rx_dataF128[3]); - mmtmpD0 = vpadd_s32(mmtmpD0,mmtmpD1); - mmtmpD0 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch0_128[2],*(int16x4_t*)conj)), rx_dataF128[2]); - mmtmpD1 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch0_128[3],*(int16x4_t*)conj)), rx_dataF128[3]); - mmtmpD1 = vpadd_s32(mmtmpD0,mmtmpD1); - - mmtmpD0 = vqshlq_s32(mmtmpD0,-output_shift); - mmtmpD1 = vqshlq_s32(mmtmpD1,-output_shift); - rxdataF_comp0_128[1] = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - - - mmtmpD0 = vmull_s16(dl_ch1_128[0], rx_dataF128[0]); - mmtmpD1 = vmull_s16(dl_ch1_128[1], rx_dataF128[1]); - mmtmpD0 = vpadd_s32(mmtmpD0,mmtmpD1); - mmtmpD0 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch0_128[0],*(int16x4_t*)conj)), rx_dataF128[0]); - mmtmpD1 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch0_128[1],*(int16x4_t*)conj)), rx_dataF128[1]); - mmtmpD1 = vpadd_s32(mmtmpD0,mmtmpD1); - mmtmpD0 = vqshlq_s32(mmtmpD0,-output_shift); - mmtmpD1 = vqshlq_s32(mmtmpD1,-output_shift); - rxdataF_comp1_128[0] = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - - mmtmpD0 = vmull_s16(dl_ch1_128[2], rx_dataF128[2]); - mmtmpD1 = vmull_s16(dl_ch1_128[3], rx_dataF128[3]); - mmtmpD0 = vpadd_s32(mmtmpD0,mmtmpD1); - mmtmpD0 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch0_128[2],*(int16x4_t*)conj)), rx_dataF128[2]); - mmtmpD1 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch0_128[3],*(int16x4_t*)conj)), rx_dataF128[3]); - mmtmpD1 = vpadd_s32(mmtmpD0,mmtmpD1); - - mmtmpD0 = vqshlq_s32(mmtmpD0,-output_shift); - mmtmpD1 = vqshlq_s32(mmtmpD1,-output_shift); - rxdataF_comp1_128[1] = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - + + 
mmtmpD0 = vqshlq_s32(mmtmpD0,output_shift128);
+    mmtmpD1 = vqshlq_s32(mmtmpD1,output_shift128);
+    rxdataF_comp0_128[0] = vcombine_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1));
+
+    mmtmpD0 = vmull_s16(dl_ch0_128[2], rxdataF128[2]);
+    mmtmpD1 = vmull_s16(dl_ch0_128[3], rxdataF128[3]);
+    mmtmpD0 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0),vget_high_s32(mmtmpD0)),
+                           vpadd_s32(vget_low_s32(mmtmpD1),vget_high_s32(mmtmpD1)));
+
+    mmtmpD0b = vmull_s16(vrev32_s16(vmul_s16(dl_ch0_128[2],*(int16x4_t*)conj)), rxdataF128[2]);
+    mmtmpD1b = vmull_s16(vrev32_s16(vmul_s16(dl_ch0_128[3],*(int16x4_t*)conj)), rxdataF128[3]);
+    mmtmpD1 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0b),vget_high_s32(mmtmpD0b)),
+                           vpadd_s32(vget_low_s32(mmtmpD1b),vget_high_s32(mmtmpD1b)));
+
+    mmtmpD0 = vqshlq_s32(mmtmpD0,output_shift128);
+    mmtmpD1 = vqshlq_s32(mmtmpD1,output_shift128);
+    rxdataF_comp0_128[1] = vcombine_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1));
+
+    // second stream
+    mmtmpD0 = vmull_s16(dl_ch1_128[0], rxdataF128[0]);
+    mmtmpD1 = vmull_s16(dl_ch1_128[1], rxdataF128[1]);
+    mmtmpD0 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0),vget_high_s32(mmtmpD0)),
+                           vpadd_s32(vget_low_s32(mmtmpD1),vget_high_s32(mmtmpD1)));
+    mmtmpD0b = vmull_s16(vrev32_s16(vmul_s16(dl_ch1_128[0],*(int16x4_t*)conj)), rxdataF128[0]);
+
+    mmtmpD1b = vmull_s16(vrev32_s16(vmul_s16(dl_ch1_128[1],*(int16x4_t*)conj)), rxdataF128[1]);
+    //mmtmpD0 = [-Im(ch[2])Re(rx[2]) Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3]) Re(ch[3])Im(rx[3])]
+    mmtmpD1 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0b),vget_high_s32(mmtmpD0b)),
+                           vpadd_s32(vget_low_s32(mmtmpD1b),vget_high_s32(mmtmpD1b)));
+    //mmtmpD1 = [-Im(ch[0])Re(rx[0])+Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1])+Re(ch[1])Im(rx[1]) -Im(ch[2])Re(rx[2])+Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3])+Re(ch[3])Im(rx[3])]
+
+    mmtmpD0 = vqshlq_s32(mmtmpD0,output_shift128);
+    mmtmpD1 = vqshlq_s32(mmtmpD1,output_shift128);
+    rxdataF_comp1_128[0] = vcombine_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1));
+
+    mmtmpD0 = vmull_s16(dl_ch1_128[2], rxdataF128[2]);
+    mmtmpD1 = vmull_s16(dl_ch1_128[3], rxdataF128[3]);
+    mmtmpD0 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0),vget_high_s32(mmtmpD0)),
+                           vpadd_s32(vget_low_s32(mmtmpD1),vget_high_s32(mmtmpD1)));
+
+    mmtmpD0b = vmull_s16(vrev32_s16(vmul_s16(dl_ch1_128[2],*(int16x4_t*)conj)), rxdataF128[2]);
+    mmtmpD1b = vmull_s16(vrev32_s16(vmul_s16(dl_ch1_128[3],*(int16x4_t*)conj)), rxdataF128[3]);
+    mmtmpD1 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0b),vget_high_s32(mmtmpD0b)),
+                           vpadd_s32(vget_low_s32(mmtmpD1b),vget_high_s32(mmtmpD1b)));
+
+    mmtmpD0 = vqshlq_s32(mmtmpD0,output_shift128);
+    mmtmpD1 = vqshlq_s32(mmtmpD1,output_shift128);
+    rxdataF_comp1_128[1] = vcombine_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1));
+
     if (pilots==0) {
-      mmtmpD0 = vmull_s16(dl_ch0_128[4], rx_dataF128[4]);
-      mmtmpD1 = vmull_s16(dl_ch0_128[5], rx_dataF128[5]);
-      mmtmpD0 = vpadd_s32(mmtmpD0,mmtmpD1);
-      mmtmpD0 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch0_128[4],*(int16x4_t*)conj)), rx_dataF128[4]);
-      mmtmpD1 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch0_128[5],*(int16x4_t*)conj)), rx_dataF128[5]);
-      mmtmpD1 = vpadd_s32(mmtmpD0,mmtmpD1);
-
-      mmtmpD0 = vqshlq_s32(mmtmpD0,-output_shift);
-      mmtmpD1 = vqshlq_s32(mmtmpD1,-output_shift);
-      rxdataF_comp0_128[2] = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1));
-
-      mmtmpD0 = vmull_s16(dl_ch1_128[4], rx_dataF128[4]);
-      mmtmpD1 = vmull_s16(dl_ch1_128[5], rx_dataF128[5]);
-      mmtmpD0 = vpadd_s32(mmtmpD0,mmtmpD1);
-      mmtmpD0 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch1_128[4],*(int16x4_t*)conj)),
rx_dataF128[4]); - mmtmpD1 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch1_128[5],*(int16x4_t*)conj)), rx_dataF128[5]); - mmtmpD1 = vpadd_s32(mmtmpD0,mmtmpD1); - - mmtmpD0 = vqshlq_s32(mmtmpD0,-output_shift); - mmtmpD1 = vqshlq_s32(mmtmpD1,-output_shift); - rxdataF_comp1_128[2] = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - - - dl_ch0_128+=6; - dl_ch1_128+=6; - dl_ch_mag0_128+=3; - dl_ch_mag0_128b+=3; - dl_ch_mag1_128+=3; - dl_ch_mag1_128b+=3; - rxdataF128+=6; - rxdataF_comp0_128+=3; - rxdataF_comp1_128+=3; - - } else { // we have a smaller PDSCH in symbols with pilots so skip last group of 4 REs and increment less - dl_ch0_128+=4; - dl_ch1_128+=4; - dl_ch_mag0_128+=2; - dl_ch_mag0_128b+=2; - dl_ch_mag1_128+=2; - dl_ch_mag1_128b+=2; - rxdataF128+=4; - rxdataF_comp0_128+=2; - rxdataF_comp1_128+=2; + mmtmpD0 = vmull_s16(dl_ch0_128[4], rxdataF128[4]); + mmtmpD1 = vmull_s16(dl_ch0_128[5], rxdataF128[5]); + mmtmpD0 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0),vget_high_s32(mmtmpD0)), + vpadd_s32(vget_low_s32(mmtmpD1),vget_high_s32(mmtmpD1))); + + mmtmpD0b = vmull_s16(vrev32_s16(vmul_s16(dl_ch0_128[4],*(int16x4_t*)conj)), rxdataF128[4]); + mmtmpD1b = vmull_s16(vrev32_s16(vmul_s16(dl_ch0_128[5],*(int16x4_t*)conj)), rxdataF128[5]); + mmtmpD1 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0b),vget_high_s32(mmtmpD0b)), + vpadd_s32(vget_low_s32(mmtmpD1b),vget_high_s32(mmtmpD1b))); + + + mmtmpD0 = vqshlq_s32(mmtmpD0,output_shift128); + mmtmpD1 = vqshlq_s32(mmtmpD1,output_shift128); + rxdataF_comp0_128[2] = vcombine_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1)); + mmtmpD0 = vmull_s16(dl_ch1_128[4], rxdataF128[4]); + mmtmpD1 = vmull_s16(dl_ch1_128[5], rxdataF128[5]); + mmtmpD0 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0),vget_high_s32(mmtmpD0)), + vpadd_s32(vget_low_s32(mmtmpD1),vget_high_s32(mmtmpD1))); + + mmtmpD0b = vmull_s16(vrev32_s16(vmul_s16(dl_ch1_128[4],*(int16x4_t*)conj)), rxdataF128[4]); + mmtmpD1b = vmull_s16(vrev32_s16(vmul_s16(dl_ch1_128[5],*(int16x4_t*)conj)), rxdataF128[5]); + mmtmpD1 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0b),vget_high_s32(mmtmpD0b)), + vpadd_s32(vget_low_s32(mmtmpD1b),vget_high_s32(mmtmpD1b))); + + + mmtmpD0 = vqshlq_s32(mmtmpD0,output_shift128); + mmtmpD1 = vqshlq_s32(mmtmpD1,output_shift128); + rxdataF_comp1_128[2] = vcombine_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1)); } } + + + + Nre = (pilots==0) ? 12 : 8; + // rx_antennas + } - Nre = (pilots==0) ? 12 : 8; + Nre = (pilots==0) ? 
12 : 8; - precoded_signal_strength0 += ((signal_energy_nodc(&dl_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*Nre], - (nb_rb*Nre))*rx_power_correction) - (phy_measurements->n0_power[aarx])); + precoded_signal_strength0 += ((signal_energy_nodc(&dl_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*Nre], + (nb_rb*Nre))) - (phy_measurements->n0_power[aarx])); - precoded_signal_strength1 += ((signal_energy_nodc(&dl_ch_estimates_ext[aarx+2][symbol*frame_parms->N_RB_DL*Nre], - (nb_rb*Nre))*rx_power_correction) - (phy_measurements->n0_power[aarx])); + precoded_signal_strength1 += ((signal_energy_nodc(&dl_ch_estimates_ext[aarx+2][symbol*frame_parms->N_RB_DL*Nre], + (nb_rb*Nre))) - (phy_measurements->n0_power[aarx])); - // rx_antennas - } phy_measurements->precoded_cqi_dB[eNB_id][0] = dB_fixed2(precoded_signal_strength0,phy_measurements->n0_power_tot); phy_measurements->precoded_cqi_dB[eNB_id][1] = dB_fixed2(precoded_signal_strength1,phy_measurements->n0_power_tot); @@ -2580,7 +2599,7 @@ void dlsch_detection_mrc(LTE_DL_FRAME_PARMS *frame_parms, unsigned char aatx; int i; __m128i *rxdataF_comp128_0,*rxdataF_comp128_1,*rxdataF_comp128_i0,*rxdataF_comp128_i1,*dl_ch_mag128_0,*dl_ch_mag128_1,*dl_ch_mag128_0b,*dl_ch_mag128_1b,*rho128_0,*rho128_1,*rho128_i0,*rho128_i1, - *dl_ch_mag128_i0,*dl_ch_mag128_i1,*dl_ch_mag128_i0b,*dl_ch_mag128_i1b; + *dl_ch_mag128_i0,*dl_ch_mag128_i1,*dl_ch_mag128_i0b,*dl_ch_mag128_i1b; if (frame_parms->nb_antennas_rx>1) { @@ -2638,6 +2657,62 @@ void dlsch_detection_mrc(LTE_DL_FRAME_PARMS *frame_parms, #elif defined(__arm__) + unsigned char aatx; + int i; + int16x8_t *rxdataF_comp128_0,*rxdataF_comp128_1,*rxdataF_comp128_i0,*rxdataF_comp128_i1,*dl_ch_mag128_0,*dl_ch_mag128_1,*dl_ch_mag128_0b,*dl_ch_mag128_1b,*rho128_0,*rho128_1,*rho128_i0,*rho128_i1,*dl_ch_mag128_i0,*dl_ch_mag128_i1,*dl_ch_mag128_i0b,*dl_ch_mag128_i1b; + + if (frame_parms->nb_antennas_rx>1) { + + for (aatx=0; aatx<frame_parms->nb_antennas_tx_eNB; aatx++) { + + rxdataF_comp128_0 = (int16x8_t *)&rxdataF_comp[(aatx<<1)][symbol*frame_parms->N_RB_DL*12]; + rxdataF_comp128_1 = (int16x8_t *)&rxdataF_comp[(aatx<<1)+1][symbol*frame_parms->N_RB_DL*12]; + dl_ch_mag128_0 = (int16x8_t *)&dl_ch_mag[(aatx<<1)][symbol*frame_parms->N_RB_DL*12]; + dl_ch_mag128_1 = (int16x8_t *)&dl_ch_mag[(aatx<<1)+1][symbol*frame_parms->N_RB_DL*12]; + dl_ch_mag128_0b = (int16x8_t *)&dl_ch_magb[(aatx<<1)][symbol*frame_parms->N_RB_DL*12]; + dl_ch_mag128_1b = (int16x8_t *)&dl_ch_magb[(aatx<<1)+1][symbol*frame_parms->N_RB_DL*12]; + + // MRC on each re of rb, both on MF output and magnitude (for 16QAM/64QAM llr computation) + for (i=0; i<nb_rb*3; i++) { + rxdataF_comp128_0[i] = vhaddq_s16(rxdataF_comp128_0[i],rxdataF_comp128_1[i]); + dl_ch_mag128_0[i] = vhaddq_s16(dl_ch_mag128_0[i],dl_ch_mag128_1[i]); + dl_ch_mag128_0b[i] = vhaddq_s16(dl_ch_mag128_0b[i],dl_ch_mag128_1b[i]); + } + } + + if (rho) { + rho128_0 = (int16x8_t *) &rho[0][symbol*frame_parms->N_RB_DL*12]; + rho128_1 = (int16x8_t *) &rho[1][symbol*frame_parms->N_RB_DL*12]; + + for (i=0; i<nb_rb*3; i++) { + // print_shorts("mrc rho0:",&rho128_0[i]); + // print_shorts("mrc rho1:",&rho128_1[i]); + rho128_0[i] = vhaddq_s16(rho128_0[i],rho128_1[i]); + } + } + + + if (dual_stream_UE == 1) { + rho128_i0 = (int16x8_t *) &rho_i[0][symbol*frame_parms->N_RB_DL*12]; + rho128_i1 = (int16x8_t *) &rho_i[1][symbol*frame_parms->N_RB_DL*12]; + rxdataF_comp128_i0 = (int16x8_t *)&rxdataF_comp_i[0][symbol*frame_parms->N_RB_DL*12]; + rxdataF_comp128_i1 = (int16x8_t 
*)&rxdataF_comp_i[1][symbol*frame_parms->N_RB_DL*12]; + + dl_ch_mag128_i0 = (int16x8_t *)&dl_ch_mag_i[0][symbol*frame_parms->N_RB_DL*12]; + dl_ch_mag128_i1 = (int16x8_t *)&dl_ch_mag_i[1][symbol*frame_parms->N_RB_DL*12]; + dl_ch_mag128_i0b = (int16x8_t *)&dl_ch_magb_i[0][symbol*frame_parms->N_RB_DL*12]; + dl_ch_mag128_i1b = (int16x8_t *)&dl_ch_magb_i[1][symbol*frame_parms->N_RB_DL*12]; + + for (i=0; i<nb_rb*3; i++) { + rxdataF_comp128_i0[i] = vhaddq_s16(rxdataF_comp128_i0[i],rxdataF_comp128_i1[i]); + rho128_i0[i] = vhaddq_s16(rho128_i0[i],rho128_i1[i]); + + dl_ch_mag128_i0[i] = vhaddq_s16(dl_ch_mag128_i0[i],dl_ch_mag128_i1[i]); + dl_ch_mag128_i0b[i] = vhaddq_s16(dl_ch_mag128_i0b[i],dl_ch_mag128_i1b[i]); + } + } + } + #endif } @@ -2764,6 +2839,62 @@ void dlsch_channel_level(int **dl_ch_estimates_ext, #elif defined(__arm__) + short rb; + unsigned char aatx,aarx,nre=12,symbol_mod; + int32x4_t avg128D; + int16x4_t *dl_ch128; + + symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? symbol-(7-frame_parms->Ncp) : symbol; + + for (aatx=0; aatx<frame_parms->nb_antennas_tx_eNB; aatx++) + for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) { + //clear average level + avg128D = vdupq_n_s32(0); + // 5 is always a symbol with no pilots for both normal and extended prefix + + dl_ch128=(int16x4_t *)&dl_ch_estimates_ext[(aatx<<1)+aarx][symbol*frame_parms->N_RB_DL*12]; + + for (rb=0; rb<nb_rb; rb++) { + // printf("rb %d : ",rb); + // print_shorts("ch",&dl_ch128[0]); + avg128D = vqaddq_s32(avg128D,vmull_s16(dl_ch128[0],dl_ch128[0])); + avg128D = vqaddq_s32(avg128D,vmull_s16(dl_ch128[1],dl_ch128[1])); + avg128D = vqaddq_s32(avg128D,vmull_s16(dl_ch128[2],dl_ch128[2])); + avg128D = vqaddq_s32(avg128D,vmull_s16(dl_ch128[3],dl_ch128[3])); + + if (((symbol_mod == 0) || (symbol_mod == (frame_parms->Ncp-1)))&&(frame_parms->mode1_flag==0)) { + dl_ch128+=4; + } else { + avg128D = vqaddq_s32(avg128D,vmull_s16(dl_ch128[4],dl_ch128[4])); + avg128D = vqaddq_s32(avg128D,vmull_s16(dl_ch128[5],dl_ch128[5])); + dl_ch128+=6; + } + + /* + if (rb==0) { + print_shorts("dl_ch128",&dl_ch128[0]); + print_shorts("dl_ch128",&dl_ch128[1]); + print_shorts("dl_ch128",&dl_ch128[2]); + } + */ + } + + if (((symbol_mod == 0) || (symbol_mod == (frame_parms->Ncp-1)))&&(frame_parms->mode1_flag==0)) + nre=8; + else if (((symbol_mod == 0) || (symbol_mod == (frame_parms->Ncp-1)))&&(frame_parms->mode1_flag==1)) + nre=10; + else + nre=12; + + avg[(aatx<<1)+aarx] = (((int32_t*)&avg128D)[0] + + ((int32_t*)&avg128D)[1] + + ((int32_t*)&avg128D)[2] + + ((int32_t*)&avg128D)[3])/(nb_rb*nre); + + // printf("Channel level : %d\n",avg[(aatx<<1)+aarx]); + } + + #endif } @@ -2832,9 +2963,9 @@ void dlsch_channel_level_TM3(int **dl_ch_estimates_ext, } avg[aarx] = (((int*)&avg128D)[0])/(nb_rb*nre) + - (((int*)&avg128D)[1])/(nb_rb*nre) + - (((int*)&avg128D)[2])/(nb_rb*nre) + - (((int*)&avg128D)[3])/(nb_rb*nre); + (((int*)&avg128D)[1])/(nb_rb*nre) + + (((int*)&avg128D)[2])/(nb_rb*nre) + + (((int*)&avg128D)[3])/(nb_rb*nre); } // choose maximum of the 2 effective channels @@ -2915,9 +3046,9 @@ void dlsch_channel_level_TM56(int **dl_ch_estimates_ext, } avg[aarx] = (((int*)&avg128D)[0])/(nb_rb*nre) + - (((int*)&avg128D)[1])/(nb_rb*nre) + - (((int*)&avg128D)[2])/(nb_rb*nre) + - (((int*)&avg128D)[3])/(nb_rb*nre); + (((int*)&avg128D)[1])/(nb_rb*nre) + + (((int*)&avg128D)[2])/(nb_rb*nre) + + (((int*)&avg128D)[3])/(nb_rb*nre); } // choose maximum of the 2 effective channels @@ -3109,17 +3240,17 @@ unsigned short dlsch_extract_rbs_single(int **rxdataF, 
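/* ---------------------------------------------------------------------------
 * Illustrative sketch (not part of the patch): the two NEON reduction idioms
 * introduced by the dlsch_detection_mrc and dlsch_channel_level hunks above.
 * All names are illustrative.
 */
#include <arm_neon.h>

/* MRC across two RX antennas: vhaddq_s16 is a per-lane halving add,
 * (a+b)>>1, so matched-filter outputs and channel magnitudes are averaged
 * without risking int16 overflow. */
static void mrc_combine_sketch(int16x8_t *ant0, const int16x8_t *ant1, int n_vec)
{
  for (int i = 0; i < n_vec; i++)
    ant0[i] = vhaddq_s16(ant0[i], ant1[i]);
}

/* Channel level: accumulate |h|^2 over the extracted channel estimates with
 * saturating 32-bit adds (the avg128D accumulator above), then reduce the
 * four lanes and normalise by the number of REs averaged. */
static int32_t channel_level_sketch(const int16x4_t *ch, int n_quads, int n_re)
{
  int32x4_t acc = vdupq_n_s32(0);
  for (int i = 0; i < n_quads; i++)
    acc = vqaddq_s32(acc, vmull_s16(ch[i], ch[i]));
  int32_t lanes[4];
  vst1q_s32(lanes, acc);
  return (lanes[0] + lanes[1] + lanes[2] + lanes[3]) / n_re;
}
/* ------------------------------------------------------------------------ */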
memcpy(dl_ch0_ext,dl_ch0,12*sizeof(int)); /* - printf("rb %d\n",rb); - for (i=0;i<12;i++) - printf("(%d %d)",((short *)dl_ch0)[i<<1],((short*)dl_ch0)[1+(i<<1)]); - printf("\n"); + printf("rb %d\n",rb); + for (i=0;i<12;i++) + printf("(%d %d)",((short *)dl_ch0)[i<<1],((short*)dl_ch0)[1+(i<<1)]); + printf("\n"); */ if (pilots==0) { for (i=0; i<12; i++) { rxF_ext[i]=rxF[i]; /* - printf("%d : (%d,%d)\n",(rxF+i-&rxdataF[aarx][( (symbol*(frame_parms->ofdm_symbol_size)))]), - ((short*)&rxF[i])[0],((short*)&rxF[i])[1]);*/ + printf("%d : (%d,%d)\n",(rxF+i-&rxdataF[aarx][( (symbol*(frame_parms->ofdm_symbol_size)))]), + ((short*)&rxF[i])[0],((short*)&rxF[i])[1]);*/ } dl_ch0_ext+=12; @@ -3461,7 +3592,7 @@ unsigned short dlsch_extract_rbs_single(int **rxdataF, if (rb_alloc_ind==1) { // printf("rb %d/symbol %d (skip_half %d)\n",rb,l,skip_half); /* - printf("rb %d\n",rb); + printf("rb %d\n",rb); for (i=0;i<12;i++) printf("(%d %d)",((short *)dl_ch0)[i<<1],((short*)dl_ch0)[1+(i<<1)]); printf("\n"); @@ -3545,8 +3676,6 @@ unsigned short dlsch_extract_rbs_single(int **rxdataF, } } - _mm_empty(); - _m_empty(); return(nb_rb/frame_parms->nb_antennas_rx); } @@ -4172,9 +4301,6 @@ unsigned short dlsch_extract_rbs_dual(int **rxdataF, } - _mm_empty(); - _m_empty(); - return(nb_rb/frame_parms->nb_antennas_rx); } @@ -4266,7 +4392,7 @@ void print_bytes(char *s,__m128i *x) printf("%s : %d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d\n",s, tempb[0],tempb[1],tempb[2],tempb[3],tempb[4],tempb[5],tempb[6],tempb[7], tempb[8],tempb[9],tempb[10],tempb[11],tempb[12],tempb[13],tempb[14],tempb[15] - ); + ); } diff --git a/openair1/PHY/LTE_TRANSPORT/dlsch_llr_computation.c b/openair1/PHY/LTE_TRANSPORT/dlsch_llr_computation.c index 4990e94cf5..f0a32e09b8 100644 --- a/openair1/PHY/LTE_TRANSPORT/dlsch_llr_computation.c +++ b/openair1/PHY/LTE_TRANSPORT/dlsch_llr_computation.c @@ -25,18 +25,18 @@ Address : Eurecom, Campus SophiaTech, 450 Route des Chappes, CS 50193 - 06904 Biot Sophia Antipolis cedex, FRANCE - *******************************************************************************/ +*******************************************************************************/ /*! \file PHY/LTE_TRANSPORT/dlsch_llr_computation.c -* \brief Top-level routines for LLR computation of the PDSCH physical channel from 36-211, V8.6 2009-03 -* \author R. Knopp, F. Kaltenberger,A. Bhamri, S. Aubert, S. Wagner -* \date 2011 -* \version 0.1 -* \company Eurecom -* \email: knopp@eurecom.fr,florian.kaltenberger@eurecom.fr,ankit.bhamri@eurecom.fr,sebastien.aubert@eurecom.fr, sebastian.wagner@eurecom.fr -* \note -* \warning -*/ + * \brief Top-level routines for LLR computation of the PDSCH physical channel from 36-211, V8.6 2009-03 + * \author R. Knopp, F. Kaltenberger,A. Bhamri, S. Aubert, S. 
Wagner + * \date 2011 + * \version 0.1 + * \company Eurecom + * \email: knopp@eurecom.fr,florian.kaltenberger@eurecom.fr,ankit.bhamri@eurecom.fr,sebastien.aubert@eurecom.fr, sebastian.wagner@eurecom.fr + * \note + * \warning + */ #include "PHY/defs.h" #include "PHY/extern.h" @@ -44,567 +44,563 @@ #include "extern.h" #include "PHY/sse_intrin.h" -#ifndef USER_MODE -#define NOCYGWIN_STATIC static -#else -#define NOCYGWIN_STATIC -#endif +int16_t zero[8] __attribute__ ((aligned(16))) = {0,0,0,0,0,0,0,0}; +int16_t ones[8] __attribute__ ((aligned(16))) = {0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff}; +#if defined(__x86_64__) || defined(__i386__) +__m128i rho_rpi __attribute__ ((aligned(16))); +__m128i rho_rmi __attribute__ ((aligned(16))); +__m128i rho_rpi_1_1 __attribute__ ((aligned(16))); +__m128i rho_rpi_1_3 __attribute__ ((aligned(16))); +__m128i rho_rpi_1_5 __attribute__ ((aligned(16))); +__m128i rho_rpi_1_7 __attribute__ ((aligned(16))); +__m128i rho_rpi_3_1 __attribute__ ((aligned(16))); +__m128i rho_rpi_3_3 __attribute__ ((aligned(16))); +__m128i rho_rpi_3_5 __attribute__ ((aligned(16))); +__m128i rho_rpi_3_7 __attribute__ ((aligned(16))); +__m128i rho_rpi_5_1 __attribute__ ((aligned(16))); +__m128i rho_rpi_5_3 __attribute__ ((aligned(16))); +__m128i rho_rpi_5_5 __attribute__ ((aligned(16))); +__m128i rho_rpi_5_7 __attribute__ ((aligned(16))); +__m128i rho_rpi_7_1 __attribute__ ((aligned(16))); +__m128i rho_rpi_7_3 __attribute__ ((aligned(16))); +__m128i rho_rpi_7_5 __attribute__ ((aligned(16))); +__m128i rho_rpi_7_7 __attribute__ ((aligned(16))); +__m128i rho_rmi_1_1 __attribute__ ((aligned(16))); +__m128i rho_rmi_1_3 __attribute__ ((aligned(16))); +__m128i rho_rmi_1_5 __attribute__ ((aligned(16))); +__m128i rho_rmi_1_7 __attribute__ ((aligned(16))); +__m128i rho_rmi_3_1 __attribute__ ((aligned(16))); +__m128i rho_rmi_3_3 __attribute__ ((aligned(16))); +__m128i rho_rmi_3_5 __attribute__ ((aligned(16))); +__m128i rho_rmi_3_7 __attribute__ ((aligned(16))); +__m128i rho_rmi_5_1 __attribute__ ((aligned(16))); +__m128i rho_rmi_5_3 __attribute__ ((aligned(16))); +__m128i rho_rmi_5_5 __attribute__ ((aligned(16))); +__m128i rho_rmi_5_7 __attribute__ ((aligned(16))); +__m128i rho_rmi_7_1 __attribute__ ((aligned(16))); +__m128i rho_rmi_7_3 __attribute__ ((aligned(16))); +__m128i rho_rmi_7_5 __attribute__ ((aligned(16))); +__m128i rho_rmi_7_7 __attribute__ ((aligned(16))); + +__m128i psi_r_m7_m7 __attribute__ ((aligned(16))); +__m128i psi_r_m7_m5 __attribute__ ((aligned(16))); +__m128i psi_r_m7_m3 __attribute__ ((aligned(16))); +__m128i psi_r_m7_m1 __attribute__ ((aligned(16))); +__m128i psi_r_m7_p1 __attribute__ ((aligned(16))); +__m128i psi_r_m7_p3 __attribute__ ((aligned(16))); +__m128i psi_r_m7_p5 __attribute__ ((aligned(16))); +__m128i psi_r_m7_p7 __attribute__ ((aligned(16))); +__m128i psi_r_m5_m7 __attribute__ ((aligned(16))); +__m128i psi_r_m5_m5 __attribute__ ((aligned(16))); +__m128i psi_r_m5_m3 __attribute__ ((aligned(16))); +__m128i psi_r_m5_m1 __attribute__ ((aligned(16))); +__m128i psi_r_m5_p1 __attribute__ ((aligned(16))); +__m128i psi_r_m5_p3 __attribute__ ((aligned(16))); +__m128i psi_r_m5_p5 __attribute__ ((aligned(16))); +__m128i psi_r_m5_p7 __attribute__ ((aligned(16))); +__m128i psi_r_m3_m7 __attribute__ ((aligned(16))); +__m128i psi_r_m3_m5 __attribute__ ((aligned(16))); +__m128i psi_r_m3_m3 __attribute__ ((aligned(16))); +__m128i psi_r_m3_m1 __attribute__ ((aligned(16))); +__m128i psi_r_m3_p1 __attribute__ ((aligned(16))); +__m128i psi_r_m3_p3 __attribute__ 
((aligned(16))); +__m128i psi_r_m3_p5 __attribute__ ((aligned(16))); +__m128i psi_r_m3_p7 __attribute__ ((aligned(16))); +__m128i psi_r_m1_m7 __attribute__ ((aligned(16))); +__m128i psi_r_m1_m5 __attribute__ ((aligned(16))); +__m128i psi_r_m1_m3 __attribute__ ((aligned(16))); +__m128i psi_r_m1_m1 __attribute__ ((aligned(16))); +__m128i psi_r_m1_p1 __attribute__ ((aligned(16))); +__m128i psi_r_m1_p3 __attribute__ ((aligned(16))); +__m128i psi_r_m1_p5 __attribute__ ((aligned(16))); +__m128i psi_r_m1_p7 __attribute__ ((aligned(16))); +__m128i psi_r_p1_m7 __attribute__ ((aligned(16))); +__m128i psi_r_p1_m5 __attribute__ ((aligned(16))); +__m128i psi_r_p1_m3 __attribute__ ((aligned(16))); +__m128i psi_r_p1_m1 __attribute__ ((aligned(16))); +__m128i psi_r_p1_p1 __attribute__ ((aligned(16))); +__m128i psi_r_p1_p3 __attribute__ ((aligned(16))); +__m128i psi_r_p1_p5 __attribute__ ((aligned(16))); +__m128i psi_r_p1_p7 __attribute__ ((aligned(16))); +__m128i psi_r_p3_m7 __attribute__ ((aligned(16))); +__m128i psi_r_p3_m5 __attribute__ ((aligned(16))); +__m128i psi_r_p3_m3 __attribute__ ((aligned(16))); +__m128i psi_r_p3_m1 __attribute__ ((aligned(16))); +__m128i psi_r_p3_p1 __attribute__ ((aligned(16))); +__m128i psi_r_p3_p3 __attribute__ ((aligned(16))); +__m128i psi_r_p3_p5 __attribute__ ((aligned(16))); +__m128i psi_r_p3_p7 __attribute__ ((aligned(16))); +__m128i psi_r_p5_m7 __attribute__ ((aligned(16))); +__m128i psi_r_p5_m5 __attribute__ ((aligned(16))); +__m128i psi_r_p5_m3 __attribute__ ((aligned(16))); +__m128i psi_r_p5_m1 __attribute__ ((aligned(16))); +__m128i psi_r_p5_p1 __attribute__ ((aligned(16))); +__m128i psi_r_p5_p3 __attribute__ ((aligned(16))); +__m128i psi_r_p5_p5 __attribute__ ((aligned(16))); +__m128i psi_r_p5_p7 __attribute__ ((aligned(16))); +__m128i psi_r_p7_m7 __attribute__ ((aligned(16))); +__m128i psi_r_p7_m5 __attribute__ ((aligned(16))); +__m128i psi_r_p7_m3 __attribute__ ((aligned(16))); +__m128i psi_r_p7_m1 __attribute__ ((aligned(16))); +__m128i psi_r_p7_p1 __attribute__ ((aligned(16))); +__m128i psi_r_p7_p3 __attribute__ ((aligned(16))); +__m128i psi_r_p7_p5 __attribute__ ((aligned(16))); +__m128i psi_r_p7_p7 __attribute__ ((aligned(16))); + +__m128i psi_i_m7_m7 __attribute__ ((aligned(16))); +__m128i psi_i_m7_m5 __attribute__ ((aligned(16))); +__m128i psi_i_m7_m3 __attribute__ ((aligned(16))); +__m128i psi_i_m7_m1 __attribute__ ((aligned(16))); +__m128i psi_i_m7_p1 __attribute__ ((aligned(16))); +__m128i psi_i_m7_p3 __attribute__ ((aligned(16))); +__m128i psi_i_m7_p5 __attribute__ ((aligned(16))); +__m128i psi_i_m7_p7 __attribute__ ((aligned(16))); +__m128i psi_i_m5_m7 __attribute__ ((aligned(16))); +__m128i psi_i_m5_m5 __attribute__ ((aligned(16))); +__m128i psi_i_m5_m3 __attribute__ ((aligned(16))); +__m128i psi_i_m5_m1 __attribute__ ((aligned(16))); +__m128i psi_i_m5_p1 __attribute__ ((aligned(16))); +__m128i psi_i_m5_p3 __attribute__ ((aligned(16))); +__m128i psi_i_m5_p5 __attribute__ ((aligned(16))); +__m128i psi_i_m5_p7 __attribute__ ((aligned(16))); +__m128i psi_i_m3_m7 __attribute__ ((aligned(16))); +__m128i psi_i_m3_m5 __attribute__ ((aligned(16))); +__m128i psi_i_m3_m3 __attribute__ ((aligned(16))); +__m128i psi_i_m3_m1 __attribute__ ((aligned(16))); +__m128i psi_i_m3_p1 __attribute__ ((aligned(16))); +__m128i psi_i_m3_p3 __attribute__ ((aligned(16))); +__m128i psi_i_m3_p5 __attribute__ ((aligned(16))); +__m128i psi_i_m3_p7 __attribute__ ((aligned(16))); +__m128i psi_i_m1_m7 __attribute__ ((aligned(16))); +__m128i psi_i_m1_m5 __attribute__ ((aligned(16))); 
+__m128i psi_i_m1_m3 __attribute__ ((aligned(16))); +__m128i psi_i_m1_m1 __attribute__ ((aligned(16))); +__m128i psi_i_m1_p1 __attribute__ ((aligned(16))); +__m128i psi_i_m1_p3 __attribute__ ((aligned(16))); +__m128i psi_i_m1_p5 __attribute__ ((aligned(16))); +__m128i psi_i_m1_p7 __attribute__ ((aligned(16))); +__m128i psi_i_p1_m7 __attribute__ ((aligned(16))); +__m128i psi_i_p1_m5 __attribute__ ((aligned(16))); +__m128i psi_i_p1_m3 __attribute__ ((aligned(16))); +__m128i psi_i_p1_m1 __attribute__ ((aligned(16))); +__m128i psi_i_p1_p1 __attribute__ ((aligned(16))); +__m128i psi_i_p1_p3 __attribute__ ((aligned(16))); +__m128i psi_i_p1_p5 __attribute__ ((aligned(16))); +__m128i psi_i_p1_p7 __attribute__ ((aligned(16))); +__m128i psi_i_p3_m7 __attribute__ ((aligned(16))); +__m128i psi_i_p3_m5 __attribute__ ((aligned(16))); +__m128i psi_i_p3_m3 __attribute__ ((aligned(16))); +__m128i psi_i_p3_m1 __attribute__ ((aligned(16))); +__m128i psi_i_p3_p1 __attribute__ ((aligned(16))); +__m128i psi_i_p3_p3 __attribute__ ((aligned(16))); +__m128i psi_i_p3_p5 __attribute__ ((aligned(16))); +__m128i psi_i_p3_p7 __attribute__ ((aligned(16))); +__m128i psi_i_p5_m7 __attribute__ ((aligned(16))); +__m128i psi_i_p5_m5 __attribute__ ((aligned(16))); +__m128i psi_i_p5_m3 __attribute__ ((aligned(16))); +__m128i psi_i_p5_m1 __attribute__ ((aligned(16))); +__m128i psi_i_p5_p1 __attribute__ ((aligned(16))); +__m128i psi_i_p5_p3 __attribute__ ((aligned(16))); +__m128i psi_i_p5_p5 __attribute__ ((aligned(16))); +__m128i psi_i_p5_p7 __attribute__ ((aligned(16))); +__m128i psi_i_p7_m7 __attribute__ ((aligned(16))); +__m128i psi_i_p7_m5 __attribute__ ((aligned(16))); +__m128i psi_i_p7_m3 __attribute__ ((aligned(16))); +__m128i psi_i_p7_m1 __attribute__ ((aligned(16))); +__m128i psi_i_p7_p1 __attribute__ ((aligned(16))); +__m128i psi_i_p7_p3 __attribute__ ((aligned(16))); +__m128i psi_i_p7_p5 __attribute__ ((aligned(16))); +__m128i psi_i_p7_p7 __attribute__ ((aligned(16))); + +__m128i a_r_m7_m7 __attribute__ ((aligned(16))); +__m128i a_r_m7_m5 __attribute__ ((aligned(16))); +__m128i a_r_m7_m3 __attribute__ ((aligned(16))); +__m128i a_r_m7_m1 __attribute__ ((aligned(16))); +__m128i a_r_m7_p1 __attribute__ ((aligned(16))); +__m128i a_r_m7_p3 __attribute__ ((aligned(16))); +__m128i a_r_m7_p5 __attribute__ ((aligned(16))); +__m128i a_r_m7_p7 __attribute__ ((aligned(16))); +__m128i a_r_m5_m7 __attribute__ ((aligned(16))); +__m128i a_r_m5_m5 __attribute__ ((aligned(16))); +__m128i a_r_m5_m3 __attribute__ ((aligned(16))); +__m128i a_r_m5_m1 __attribute__ ((aligned(16))); +__m128i a_r_m5_p1 __attribute__ ((aligned(16))); +__m128i a_r_m5_p3 __attribute__ ((aligned(16))); +__m128i a_r_m5_p5 __attribute__ ((aligned(16))); +__m128i a_r_m5_p7 __attribute__ ((aligned(16))); +__m128i a_r_m3_m7 __attribute__ ((aligned(16))); +__m128i a_r_m3_m5 __attribute__ ((aligned(16))); +__m128i a_r_m3_m3 __attribute__ ((aligned(16))); +__m128i a_r_m3_m1 __attribute__ ((aligned(16))); +__m128i a_r_m3_p1 __attribute__ ((aligned(16))); +__m128i a_r_m3_p3 __attribute__ ((aligned(16))); +__m128i a_r_m3_p5 __attribute__ ((aligned(16))); +__m128i a_r_m3_p7 __attribute__ ((aligned(16))); +__m128i a_r_m1_m7 __attribute__ ((aligned(16))); +__m128i a_r_m1_m5 __attribute__ ((aligned(16))); +__m128i a_r_m1_m3 __attribute__ ((aligned(16))); +__m128i a_r_m1_m1 __attribute__ ((aligned(16))); +__m128i a_r_m1_p1 __attribute__ ((aligned(16))); +__m128i a_r_m1_p3 __attribute__ ((aligned(16))); +__m128i a_r_m1_p5 __attribute__ ((aligned(16))); +__m128i a_r_m1_p7 
__attribute__ ((aligned(16))); +__m128i a_r_p1_m7 __attribute__ ((aligned(16))); +__m128i a_r_p1_m5 __attribute__ ((aligned(16))); +__m128i a_r_p1_m3 __attribute__ ((aligned(16))); +__m128i a_r_p1_m1 __attribute__ ((aligned(16))); +__m128i a_r_p1_p1 __attribute__ ((aligned(16))); +__m128i a_r_p1_p3 __attribute__ ((aligned(16))); +__m128i a_r_p1_p5 __attribute__ ((aligned(16))); +__m128i a_r_p1_p7 __attribute__ ((aligned(16))); +__m128i a_r_p3_m7 __attribute__ ((aligned(16))); +__m128i a_r_p3_m5 __attribute__ ((aligned(16))); +__m128i a_r_p3_m3 __attribute__ ((aligned(16))); +__m128i a_r_p3_m1 __attribute__ ((aligned(16))); +__m128i a_r_p3_p1 __attribute__ ((aligned(16))); +__m128i a_r_p3_p3 __attribute__ ((aligned(16))); +__m128i a_r_p3_p5 __attribute__ ((aligned(16))); +__m128i a_r_p3_p7 __attribute__ ((aligned(16))); +__m128i a_r_p5_m7 __attribute__ ((aligned(16))); +__m128i a_r_p5_m5 __attribute__ ((aligned(16))); +__m128i a_r_p5_m3 __attribute__ ((aligned(16))); +__m128i a_r_p5_m1 __attribute__ ((aligned(16))); +__m128i a_r_p5_p1 __attribute__ ((aligned(16))); +__m128i a_r_p5_p3 __attribute__ ((aligned(16))); +__m128i a_r_p5_p5 __attribute__ ((aligned(16))); +__m128i a_r_p5_p7 __attribute__ ((aligned(16))); +__m128i a_r_p7_m7 __attribute__ ((aligned(16))); +__m128i a_r_p7_m5 __attribute__ ((aligned(16))); +__m128i a_r_p7_m3 __attribute__ ((aligned(16))); +__m128i a_r_p7_m1 __attribute__ ((aligned(16))); +__m128i a_r_p7_p1 __attribute__ ((aligned(16))); +__m128i a_r_p7_p3 __attribute__ ((aligned(16))); +__m128i a_r_p7_p5 __attribute__ ((aligned(16))); +__m128i a_r_p7_p7 __attribute__ ((aligned(16))); + +__m128i a_i_m7_m7 __attribute__ ((aligned(16))); +__m128i a_i_m7_m5 __attribute__ ((aligned(16))); +__m128i a_i_m7_m3 __attribute__ ((aligned(16))); +__m128i a_i_m7_m1 __attribute__ ((aligned(16))); +__m128i a_i_m7_p1 __attribute__ ((aligned(16))); +__m128i a_i_m7_p3 __attribute__ ((aligned(16))); +__m128i a_i_m7_p5 __attribute__ ((aligned(16))); +__m128i a_i_m7_p7 __attribute__ ((aligned(16))); +__m128i a_i_m5_m7 __attribute__ ((aligned(16))); +__m128i a_i_m5_m5 __attribute__ ((aligned(16))); +__m128i a_i_m5_m3 __attribute__ ((aligned(16))); +__m128i a_i_m5_m1 __attribute__ ((aligned(16))); +__m128i a_i_m5_p1 __attribute__ ((aligned(16))); +__m128i a_i_m5_p3 __attribute__ ((aligned(16))); +__m128i a_i_m5_p5 __attribute__ ((aligned(16))); +__m128i a_i_m5_p7 __attribute__ ((aligned(16))); +__m128i a_i_m3_m7 __attribute__ ((aligned(16))); +__m128i a_i_m3_m5 __attribute__ ((aligned(16))); +__m128i a_i_m3_m3 __attribute__ ((aligned(16))); +__m128i a_i_m3_m1 __attribute__ ((aligned(16))); +__m128i a_i_m3_p1 __attribute__ ((aligned(16))); +__m128i a_i_m3_p3 __attribute__ ((aligned(16))); +__m128i a_i_m3_p5 __attribute__ ((aligned(16))); +__m128i a_i_m3_p7 __attribute__ ((aligned(16))); +__m128i a_i_m1_m7 __attribute__ ((aligned(16))); +__m128i a_i_m1_m5 __attribute__ ((aligned(16))); +__m128i a_i_m1_m3 __attribute__ ((aligned(16))); +__m128i a_i_m1_m1 __attribute__ ((aligned(16))); +__m128i a_i_m1_p1 __attribute__ ((aligned(16))); +__m128i a_i_m1_p3 __attribute__ ((aligned(16))); +__m128i a_i_m1_p5 __attribute__ ((aligned(16))); +__m128i a_i_m1_p7 __attribute__ ((aligned(16))); +__m128i a_i_p1_m7 __attribute__ ((aligned(16))); +__m128i a_i_p1_m5 __attribute__ ((aligned(16))); +__m128i a_i_p1_m3 __attribute__ ((aligned(16))); +__m128i a_i_p1_m1 __attribute__ ((aligned(16))); +__m128i a_i_p1_p1 __attribute__ ((aligned(16))); +__m128i a_i_p1_p3 __attribute__ ((aligned(16))); +__m128i a_i_p1_p5 
__attribute__ ((aligned(16))); +__m128i a_i_p1_p7 __attribute__ ((aligned(16))); +__m128i a_i_p3_m7 __attribute__ ((aligned(16))); +__m128i a_i_p3_m5 __attribute__ ((aligned(16))); +__m128i a_i_p3_m3 __attribute__ ((aligned(16))); +__m128i a_i_p3_m1 __attribute__ ((aligned(16))); +__m128i a_i_p3_p1 __attribute__ ((aligned(16))); +__m128i a_i_p3_p3 __attribute__ ((aligned(16))); +__m128i a_i_p3_p5 __attribute__ ((aligned(16))); +__m128i a_i_p3_p7 __attribute__ ((aligned(16))); +__m128i a_i_p5_m7 __attribute__ ((aligned(16))); +__m128i a_i_p5_m5 __attribute__ ((aligned(16))); +__m128i a_i_p5_m3 __attribute__ ((aligned(16))); +__m128i a_i_p5_m1 __attribute__ ((aligned(16))); +__m128i a_i_p5_p1 __attribute__ ((aligned(16))); +__m128i a_i_p5_p3 __attribute__ ((aligned(16))); +__m128i a_i_p5_p5 __attribute__ ((aligned(16))); +__m128i a_i_p5_p7 __attribute__ ((aligned(16))); +__m128i a_i_p7_m7 __attribute__ ((aligned(16))); +__m128i a_i_p7_m5 __attribute__ ((aligned(16))); +__m128i a_i_p7_m3 __attribute__ ((aligned(16))); +__m128i a_i_p7_m1 __attribute__ ((aligned(16))); +__m128i a_i_p7_p1 __attribute__ ((aligned(16))); +__m128i a_i_p7_p3 __attribute__ ((aligned(16))); +__m128i a_i_p7_p5 __attribute__ ((aligned(16))); +__m128i a_i_p7_p7 __attribute__ ((aligned(16))); + +__m128i psi_a_m7_m7 __attribute__ ((aligned(16))); +__m128i psi_a_m7_m5 __attribute__ ((aligned(16))); +__m128i psi_a_m7_m3 __attribute__ ((aligned(16))); +__m128i psi_a_m7_m1 __attribute__ ((aligned(16))); +__m128i psi_a_m7_p1 __attribute__ ((aligned(16))); +__m128i psi_a_m7_p3 __attribute__ ((aligned(16))); +__m128i psi_a_m7_p5 __attribute__ ((aligned(16))); +__m128i psi_a_m7_p7 __attribute__ ((aligned(16))); +__m128i psi_a_m5_m7 __attribute__ ((aligned(16))); +__m128i psi_a_m5_m5 __attribute__ ((aligned(16))); +__m128i psi_a_m5_m3 __attribute__ ((aligned(16))); +__m128i psi_a_m5_m1 __attribute__ ((aligned(16))); +__m128i psi_a_m5_p1 __attribute__ ((aligned(16))); +__m128i psi_a_m5_p3 __attribute__ ((aligned(16))); +__m128i psi_a_m5_p5 __attribute__ ((aligned(16))); +__m128i psi_a_m5_p7 __attribute__ ((aligned(16))); +__m128i psi_a_m3_m7 __attribute__ ((aligned(16))); +__m128i psi_a_m3_m5 __attribute__ ((aligned(16))); +__m128i psi_a_m3_m3 __attribute__ ((aligned(16))); +__m128i psi_a_m3_m1 __attribute__ ((aligned(16))); +__m128i psi_a_m3_p1 __attribute__ ((aligned(16))); +__m128i psi_a_m3_p3 __attribute__ ((aligned(16))); +__m128i psi_a_m3_p5 __attribute__ ((aligned(16))); +__m128i psi_a_m3_p7 __attribute__ ((aligned(16))); +__m128i psi_a_m1_m7 __attribute__ ((aligned(16))); +__m128i psi_a_m1_m5 __attribute__ ((aligned(16))); +__m128i psi_a_m1_m3 __attribute__ ((aligned(16))); +__m128i psi_a_m1_m1 __attribute__ ((aligned(16))); +__m128i psi_a_m1_p1 __attribute__ ((aligned(16))); +__m128i psi_a_m1_p3 __attribute__ ((aligned(16))); +__m128i psi_a_m1_p5 __attribute__ ((aligned(16))); +__m128i psi_a_m1_p7 __attribute__ ((aligned(16))); +__m128i psi_a_p1_m7 __attribute__ ((aligned(16))); +__m128i psi_a_p1_m5 __attribute__ ((aligned(16))); +__m128i psi_a_p1_m3 __attribute__ ((aligned(16))); +__m128i psi_a_p1_m1 __attribute__ ((aligned(16))); +__m128i psi_a_p1_p1 __attribute__ ((aligned(16))); +__m128i psi_a_p1_p3 __attribute__ ((aligned(16))); +__m128i psi_a_p1_p5 __attribute__ ((aligned(16))); +__m128i psi_a_p1_p7 __attribute__ ((aligned(16))); +__m128i psi_a_p3_m7 __attribute__ ((aligned(16))); +__m128i psi_a_p3_m5 __attribute__ ((aligned(16))); +__m128i psi_a_p3_m3 __attribute__ ((aligned(16))); +__m128i psi_a_p3_m1 __attribute__ 
((aligned(16))); +__m128i psi_a_p3_p1 __attribute__ ((aligned(16))); +__m128i psi_a_p3_p3 __attribute__ ((aligned(16))); +__m128i psi_a_p3_p5 __attribute__ ((aligned(16))); +__m128i psi_a_p3_p7 __attribute__ ((aligned(16))); +__m128i psi_a_p5_m7 __attribute__ ((aligned(16))); +__m128i psi_a_p5_m5 __attribute__ ((aligned(16))); +__m128i psi_a_p5_m3 __attribute__ ((aligned(16))); +__m128i psi_a_p5_m1 __attribute__ ((aligned(16))); +__m128i psi_a_p5_p1 __attribute__ ((aligned(16))); +__m128i psi_a_p5_p3 __attribute__ ((aligned(16))); +__m128i psi_a_p5_p5 __attribute__ ((aligned(16))); +__m128i psi_a_p5_p7 __attribute__ ((aligned(16))); +__m128i psi_a_p7_m7 __attribute__ ((aligned(16))); +__m128i psi_a_p7_m5 __attribute__ ((aligned(16))); +__m128i psi_a_p7_m3 __attribute__ ((aligned(16))); +__m128i psi_a_p7_m1 __attribute__ ((aligned(16))); +__m128i psi_a_p7_p1 __attribute__ ((aligned(16))); +__m128i psi_a_p7_p3 __attribute__ ((aligned(16))); +__m128i psi_a_p7_p5 __attribute__ ((aligned(16))); +__m128i psi_a_p7_p7 __attribute__ ((aligned(16))); + +__m128i a_sq_m7_m7 __attribute__ ((aligned(16))); +__m128i a_sq_m7_m5 __attribute__ ((aligned(16))); +__m128i a_sq_m7_m3 __attribute__ ((aligned(16))); +__m128i a_sq_m7_m1 __attribute__ ((aligned(16))); +__m128i a_sq_m7_p1 __attribute__ ((aligned(16))); +__m128i a_sq_m7_p3 __attribute__ ((aligned(16))); +__m128i a_sq_m7_p5 __attribute__ ((aligned(16))); +__m128i a_sq_m7_p7 __attribute__ ((aligned(16))); +__m128i a_sq_m5_m7 __attribute__ ((aligned(16))); +__m128i a_sq_m5_m5 __attribute__ ((aligned(16))); +__m128i a_sq_m5_m3 __attribute__ ((aligned(16))); +__m128i a_sq_m5_m1 __attribute__ ((aligned(16))); +__m128i a_sq_m5_p1 __attribute__ ((aligned(16))); +__m128i a_sq_m5_p3 __attribute__ ((aligned(16))); +__m128i a_sq_m5_p5 __attribute__ ((aligned(16))); +__m128i a_sq_m5_p7 __attribute__ ((aligned(16))); +__m128i a_sq_m3_m7 __attribute__ ((aligned(16))); +__m128i a_sq_m3_m5 __attribute__ ((aligned(16))); +__m128i a_sq_m3_m3 __attribute__ ((aligned(16))); +__m128i a_sq_m3_m1 __attribute__ ((aligned(16))); +__m128i a_sq_m3_p1 __attribute__ ((aligned(16))); +__m128i a_sq_m3_p3 __attribute__ ((aligned(16))); +__m128i a_sq_m3_p5 __attribute__ ((aligned(16))); +__m128i a_sq_m3_p7 __attribute__ ((aligned(16))); +__m128i a_sq_m1_m7 __attribute__ ((aligned(16))); +__m128i a_sq_m1_m5 __attribute__ ((aligned(16))); +__m128i a_sq_m1_m3 __attribute__ ((aligned(16))); +__m128i a_sq_m1_m1 __attribute__ ((aligned(16))); +__m128i a_sq_m1_p1 __attribute__ ((aligned(16))); +__m128i a_sq_m1_p3 __attribute__ ((aligned(16))); +__m128i a_sq_m1_p5 __attribute__ ((aligned(16))); +__m128i a_sq_m1_p7 __attribute__ ((aligned(16))); +__m128i a_sq_p1_m7 __attribute__ ((aligned(16))); +__m128i a_sq_p1_m5 __attribute__ ((aligned(16))); +__m128i a_sq_p1_m3 __attribute__ ((aligned(16))); +__m128i a_sq_p1_m1 __attribute__ ((aligned(16))); +__m128i a_sq_p1_p1 __attribute__ ((aligned(16))); +__m128i a_sq_p1_p3 __attribute__ ((aligned(16))); +__m128i a_sq_p1_p5 __attribute__ ((aligned(16))); +__m128i a_sq_p1_p7 __attribute__ ((aligned(16))); +__m128i a_sq_p3_m7 __attribute__ ((aligned(16))); +__m128i a_sq_p3_m5 __attribute__ ((aligned(16))); +__m128i a_sq_p3_m3 __attribute__ ((aligned(16))); +__m128i a_sq_p3_m1 __attribute__ ((aligned(16))); +__m128i a_sq_p3_p1 __attribute__ ((aligned(16))); +__m128i a_sq_p3_p3 __attribute__ ((aligned(16))); +__m128i a_sq_p3_p5 __attribute__ ((aligned(16))); +__m128i a_sq_p3_p7 __attribute__ ((aligned(16))); +__m128i a_sq_p5_m7 __attribute__ 
((aligned(16))); +__m128i a_sq_p5_m5 __attribute__ ((aligned(16))); +__m128i a_sq_p5_m3 __attribute__ ((aligned(16))); +__m128i a_sq_p5_m1 __attribute__ ((aligned(16))); +__m128i a_sq_p5_p1 __attribute__ ((aligned(16))); +__m128i a_sq_p5_p3 __attribute__ ((aligned(16))); +__m128i a_sq_p5_p5 __attribute__ ((aligned(16))); +__m128i a_sq_p5_p7 __attribute__ ((aligned(16))); +__m128i a_sq_p7_m7 __attribute__ ((aligned(16))); +__m128i a_sq_p7_m5 __attribute__ ((aligned(16))); +__m128i a_sq_p7_m3 __attribute__ ((aligned(16))); +__m128i a_sq_p7_m1 __attribute__ ((aligned(16))); +__m128i a_sq_p7_p1 __attribute__ ((aligned(16))); +__m128i a_sq_p7_p3 __attribute__ ((aligned(16))); +__m128i a_sq_p7_p5 __attribute__ ((aligned(16))); +__m128i a_sq_p7_p7 __attribute__ ((aligned(16))); + +__m128i bit_met_m7_m7 __attribute__ ((aligned(16))); +__m128i bit_met_m7_m5 __attribute__ ((aligned(16))); +__m128i bit_met_m7_m3 __attribute__ ((aligned(16))); +__m128i bit_met_m7_m1 __attribute__ ((aligned(16))); +__m128i bit_met_m7_p1 __attribute__ ((aligned(16))); +__m128i bit_met_m7_p3 __attribute__ ((aligned(16))); +__m128i bit_met_m7_p5 __attribute__ ((aligned(16))); +__m128i bit_met_m7_p7 __attribute__ ((aligned(16))); +__m128i bit_met_m5_m7 __attribute__ ((aligned(16))); +__m128i bit_met_m5_m5 __attribute__ ((aligned(16))); +__m128i bit_met_m5_m3 __attribute__ ((aligned(16))); +__m128i bit_met_m5_m1 __attribute__ ((aligned(16))); +__m128i bit_met_m5_p1 __attribute__ ((aligned(16))); +__m128i bit_met_m5_p3 __attribute__ ((aligned(16))); +__m128i bit_met_m5_p5 __attribute__ ((aligned(16))); +__m128i bit_met_m5_p7 __attribute__ ((aligned(16))); +__m128i bit_met_m3_m7 __attribute__ ((aligned(16))); +__m128i bit_met_m3_m5 __attribute__ ((aligned(16))); +__m128i bit_met_m3_m3 __attribute__ ((aligned(16))); +__m128i bit_met_m3_m1 __attribute__ ((aligned(16))); +__m128i bit_met_m3_p1 __attribute__ ((aligned(16))); +__m128i bit_met_m3_p3 __attribute__ ((aligned(16))); +__m128i bit_met_m3_p5 __attribute__ ((aligned(16))); +__m128i bit_met_m3_p7 __attribute__ ((aligned(16))); +__m128i bit_met_m1_m7 __attribute__ ((aligned(16))); +__m128i bit_met_m1_m5 __attribute__ ((aligned(16))); +__m128i bit_met_m1_m3 __attribute__ ((aligned(16))); +__m128i bit_met_m1_m1 __attribute__ ((aligned(16))); +__m128i bit_met_m1_p1 __attribute__ ((aligned(16))); +__m128i bit_met_m1_p3 __attribute__ ((aligned(16))); +__m128i bit_met_m1_p5 __attribute__ ((aligned(16))); +__m128i bit_met_m1_p7 __attribute__ ((aligned(16))); +__m128i bit_met_p1_m7 __attribute__ ((aligned(16))); +__m128i bit_met_p1_m5 __attribute__ ((aligned(16))); +__m128i bit_met_p1_m3 __attribute__ ((aligned(16))); +__m128i bit_met_p1_m1 __attribute__ ((aligned(16))); +__m128i bit_met_p1_p1 __attribute__ ((aligned(16))); +__m128i bit_met_p1_p3 __attribute__ ((aligned(16))); +__m128i bit_met_p1_p5 __attribute__ ((aligned(16))); +__m128i bit_met_p1_p7 __attribute__ ((aligned(16))); +__m128i bit_met_p3_m7 __attribute__ ((aligned(16))); +__m128i bit_met_p3_m5 __attribute__ ((aligned(16))); +__m128i bit_met_p3_m3 __attribute__ ((aligned(16))); +__m128i bit_met_p3_m1 __attribute__ ((aligned(16))); +__m128i bit_met_p3_p1 __attribute__ ((aligned(16))); +__m128i bit_met_p3_p3 __attribute__ ((aligned(16))); +__m128i bit_met_p3_p5 __attribute__ ((aligned(16))); +__m128i bit_met_p3_p7 __attribute__ ((aligned(16))); +__m128i bit_met_p5_m7 __attribute__ ((aligned(16))); +__m128i bit_met_p5_m5 __attribute__ ((aligned(16))); +__m128i bit_met_p5_m3 __attribute__ ((aligned(16))); +__m128i 
bit_met_p5_m1 __attribute__ ((aligned(16))); +__m128i bit_met_p5_p1 __attribute__ ((aligned(16))); +__m128i bit_met_p5_p3 __attribute__ ((aligned(16))); +__m128i bit_met_p5_p5 __attribute__ ((aligned(16))); +__m128i bit_met_p5_p7 __attribute__ ((aligned(16))); +__m128i bit_met_p7_m7 __attribute__ ((aligned(16))); +__m128i bit_met_p7_m5 __attribute__ ((aligned(16))); +__m128i bit_met_p7_m3 __attribute__ ((aligned(16))); +__m128i bit_met_p7_m1 __attribute__ ((aligned(16))); +__m128i bit_met_p7_p1 __attribute__ ((aligned(16))); +__m128i bit_met_p7_p3 __attribute__ ((aligned(16))); +__m128i bit_met_p7_p5 __attribute__ ((aligned(16))); +__m128i bit_met_p7_p7 __attribute__ ((aligned(16))); + +__m128i y0_p_1_1 __attribute__ ((aligned(16))); +__m128i y0_p_1_3 __attribute__ ((aligned(16))); +__m128i y0_p_1_5 __attribute__ ((aligned(16))); +__m128i y0_p_1_7 __attribute__ ((aligned(16))); +__m128i y0_p_3_1 __attribute__ ((aligned(16))); +__m128i y0_p_3_3 __attribute__ ((aligned(16))); +__m128i y0_p_3_5 __attribute__ ((aligned(16))); +__m128i y0_p_3_7 __attribute__ ((aligned(16))); +__m128i y0_p_5_1 __attribute__ ((aligned(16))); +__m128i y0_p_5_3 __attribute__ ((aligned(16))); +__m128i y0_p_5_5 __attribute__ ((aligned(16))); +__m128i y0_p_5_7 __attribute__ ((aligned(16))); +__m128i y0_p_7_1 __attribute__ ((aligned(16))); +__m128i y0_p_7_3 __attribute__ ((aligned(16))); +__m128i y0_p_7_5 __attribute__ ((aligned(16))); +__m128i y0_p_7_7 __attribute__ ((aligned(16))); +__m128i y0_m_1_1 __attribute__ ((aligned(16))); +__m128i y0_m_1_3 __attribute__ ((aligned(16))); +__m128i y0_m_1_5 __attribute__ ((aligned(16))); +__m128i y0_m_1_7 __attribute__ ((aligned(16))); +__m128i y0_m_3_1 __attribute__ ((aligned(16))); +__m128i y0_m_3_3 __attribute__ ((aligned(16))); +__m128i y0_m_3_5 __attribute__ ((aligned(16))); +__m128i y0_m_3_7 __attribute__ ((aligned(16))); +__m128i y0_m_5_1 __attribute__ ((aligned(16))); +__m128i y0_m_5_3 __attribute__ ((aligned(16))); +__m128i y0_m_5_5 __attribute__ ((aligned(16))); +__m128i y0_m_5_7 __attribute__ ((aligned(16))); +__m128i y0_m_7_1 __attribute__ ((aligned(16))); +__m128i y0_m_7_3 __attribute__ ((aligned(16))); +__m128i y0_m_7_5 __attribute__ ((aligned(16))); +__m128i y0_m_7_7 __attribute__ ((aligned(16))); + +__m128i xmm0 __attribute__ ((aligned(16))); +__m128i xmm1 __attribute__ ((aligned(16))); +__m128i xmm2 __attribute__ ((aligned(16))); +__m128i xmm3 __attribute__ ((aligned(16))); +__m128i xmm4 __attribute__ ((aligned(16))); +__m128i xmm5 __attribute__ ((aligned(16))); +__m128i xmm6 __attribute__ ((aligned(16))); +__m128i xmm7 __attribute__ ((aligned(16))); +__m128i xmm8 __attribute__ ((aligned(16))); + +__m128i y0r __attribute__ ((aligned(16))); +__m128i y0i __attribute__ ((aligned(16))); +__m128i y1r __attribute__ ((aligned(16))); +__m128i y1i __attribute__ ((aligned(16))); +__m128i y2r __attribute__ ((aligned(16))); +__m128i y2i __attribute__ ((aligned(16))); + +__m128i logmax_num_re0 __attribute__ ((aligned(16))); +__m128i logmax_num_im0 __attribute__ ((aligned(16))); +__m128i logmax_den_re0 __attribute__ ((aligned(16))); +__m128i logmax_den_im0 __attribute__ ((aligned(16))); +__m128i logmax_num_re1 __attribute__ ((aligned(16))); +__m128i logmax_num_im1 __attribute__ ((aligned(16))); +__m128i logmax_den_re1 __attribute__ ((aligned(16))); +__m128i logmax_den_im1 __attribute__ ((aligned(16))); + +__m128i tmp_result __attribute__ ((aligned(16))); +__m128i tmp_result2 __attribute__ ((aligned(16))); +__m128i tmp_result3 __attribute__ ((aligned(16))); +__m128i 
tmp_result4 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC short zero[8] __attribute__ ((aligned(16))) = {0,0,0,0,0,0,0,0}; -NOCYGWIN_STATIC short ones[8] __attribute__ ((aligned(16))) = {0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff}; -NOCYGWIN_STATIC __m128i rho_rpi __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rmi __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rpi_1_1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rpi_1_3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rpi_1_5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rpi_1_7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rpi_3_1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rpi_3_3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rpi_3_5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rpi_3_7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rpi_5_1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rpi_5_3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rpi_5_5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rpi_5_7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rpi_7_1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rpi_7_3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rpi_7_5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rpi_7_7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rmi_1_1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rmi_1_3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rmi_1_5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rmi_1_7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rmi_3_1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rmi_3_3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rmi_3_5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rmi_3_7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rmi_5_1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rmi_5_3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rmi_5_5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rmi_5_7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rmi_7_1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rmi_7_3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rmi_7_5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rmi_7_7 __attribute__ ((aligned(16))); - -NOCYGWIN_STATIC __m128i psi_r_m7_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m7_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m7_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m7_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m7_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m7_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m7_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m7_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m5_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m5_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m5_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m5_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m5_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m5_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m5_p5 
__attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m5_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m3_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m3_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m3_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m3_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m3_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m3_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m3_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m3_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m1_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m1_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m1_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m1_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m1_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m1_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m1_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m1_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p1_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p1_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p1_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p1_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p1_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p1_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p1_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p1_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p3_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p3_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p3_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p3_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p3_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p3_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p3_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p3_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p5_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p5_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p5_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p5_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p5_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p5_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p5_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p5_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p7_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p7_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p7_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p7_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p7_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p7_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p7_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p7_p7 __attribute__ ((aligned(16))); - -NOCYGWIN_STATIC __m128i psi_i_m7_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m7_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m7_m3 __attribute__ 
((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m7_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m7_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m7_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m7_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m7_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m5_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m5_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m5_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m5_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m5_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m5_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m5_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m5_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m3_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m3_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m3_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m3_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m3_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m3_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m3_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m3_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m1_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m1_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m1_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m1_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m1_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m1_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m1_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m1_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p1_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p1_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p1_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p1_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p1_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p1_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p1_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p1_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p3_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p3_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p3_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p3_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p3_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p3_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p3_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p3_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p5_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p5_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p5_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p5_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p5_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p5_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p5_p5 __attribute__ ((aligned(16))); 
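The long run of deletions above and below retires the file-scope SSE scratch variables (rho_rpi_*, psi_r_*, psi_i_*, a_r_*, and so on); later hunks in this patch re-create what each kernel still needs as function-local, architecture-guarded variables. A minimal sketch of that dispatch idiom, assuming GCC-style predefined macros; the names simd_q15 and set1_q15 are illustrative helpers, not identifiers from the patch:

#if defined(__x86_64__) || defined(__i386__)
#include <emmintrin.h>
typedef __m128i simd_q15;              /* 8 x 16-bit lanes via SSE2 */
#define set1_q15(c) _mm_set1_epi16(c)
#elif defined(__arm__)
#include <arm_neon.h>
typedef int16x8_t simd_q15;            /* 8 x 16-bit lanes via NEON */
#define set1_q15(c) vdupq_n_s16(c)
#endif

static inline simd_q15 one_over_sqrt2_q15(void)
{
  /* formerly a NOCYGWIN_STATIC file-scope __m128i assigned inside each
     kernel; a local initialized at declaration is also reentrant */
  return set1_q15(23170); /* round(2^15/sqrt(2)) in Q15 */
}

Making these locals rather than globals also lets the compiler keep them in registers instead of reloading them from memory on every use.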
-NOCYGWIN_STATIC __m128i psi_i_p5_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p7_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p7_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p7_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p7_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p7_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p7_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p7_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p7_p7 __attribute__ ((aligned(16))); - -NOCYGWIN_STATIC __m128i a_r_m7_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m7_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m7_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m7_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m7_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m7_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m7_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m7_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m5_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m5_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m5_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m5_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m5_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m5_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m5_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m5_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m3_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m3_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m3_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m3_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m3_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m3_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m3_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m3_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m1_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m1_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m1_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m1_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m1_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m1_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m1_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m1_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p1_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p1_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p1_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p1_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p1_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p1_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p1_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p1_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p3_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p3_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p3_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p3_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p3_p1 
__attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p3_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p3_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p3_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p5_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p5_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p5_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p5_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p5_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p5_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p5_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p5_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p7_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p7_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p7_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p7_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p7_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p7_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p7_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p7_p7 __attribute__ ((aligned(16))); - -NOCYGWIN_STATIC __m128i a_i_m7_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m7_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m7_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m7_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m7_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m7_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m7_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m7_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m5_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m5_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m5_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m5_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m5_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m5_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m5_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m5_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m3_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m3_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m3_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m3_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m3_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m3_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m3_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m3_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m1_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m1_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m1_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m1_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m1_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m1_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m1_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m1_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p1_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p1_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC 
__m128i a_i_p1_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p1_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p1_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p1_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p1_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p1_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p3_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p3_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p3_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p3_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p3_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p3_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p3_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p3_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p5_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p5_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p5_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p5_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p5_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p5_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p5_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p5_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p7_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p7_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p7_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p7_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p7_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p7_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p7_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p7_p7 __attribute__ ((aligned(16))); - -NOCYGWIN_STATIC __m128i psi_a_m7_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m7_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m7_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m7_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m7_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m7_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m7_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m7_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m5_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m5_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m5_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m5_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m5_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m5_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m5_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m5_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m3_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m3_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m3_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m3_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m3_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m3_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m3_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i 
psi_a_m3_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m1_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m1_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m1_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m1_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m1_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m1_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m1_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m1_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p1_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p1_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p1_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p1_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p1_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p1_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p1_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p1_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p3_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p3_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p3_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p3_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p3_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p3_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p3_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p3_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p5_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p5_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p5_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p5_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p5_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p5_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p5_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p5_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p7_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p7_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p7_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p7_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p7_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p7_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p7_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p7_p7 __attribute__ ((aligned(16))); - -NOCYGWIN_STATIC __m128i a_sq_m7_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m7_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m7_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m7_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m7_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m7_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m7_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m7_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m5_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m5_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m5_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m5_m1 __attribute__ 
((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m5_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m5_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m5_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m5_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m3_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m3_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m3_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m3_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m3_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m3_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m3_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m3_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m1_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m1_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m1_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m1_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m1_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m1_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m1_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m1_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p1_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p1_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p1_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p1_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p1_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p1_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p1_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p1_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p3_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p3_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p3_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p3_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p3_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p3_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p3_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p3_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p5_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p5_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p5_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p5_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p5_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p5_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p5_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p5_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p7_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p7_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p7_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p7_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p7_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p7_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p7_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p7_p7 __attribute__ ((aligned(16))); - -NOCYGWIN_STATIC __m128i bit_met_m7_m7 
__attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m7_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m7_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m7_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m7_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m7_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m7_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m7_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m5_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m5_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m5_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m5_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m5_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m5_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m5_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m5_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m3_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m3_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m3_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m3_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m3_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m3_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m3_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m3_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m1_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m1_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m1_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m1_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m1_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m1_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m1_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m1_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p1_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p1_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p1_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p1_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p1_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p1_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p1_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p1_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p3_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p3_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p3_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p3_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p3_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p3_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p3_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p3_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p5_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p5_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p5_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC 
__m128i bit_met_p5_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p5_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p5_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p5_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p5_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p7_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p7_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p7_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p7_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p7_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p7_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p7_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p7_p7 __attribute__ ((aligned(16))); - -NOCYGWIN_STATIC __m128i y0_p_1_1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_p_1_3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_p_1_5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_p_1_7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_p_3_1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_p_3_3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_p_3_5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_p_3_7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_p_5_1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_p_5_3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_p_5_5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_p_5_7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_p_7_1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_p_7_3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_p_7_5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_p_7_7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_m_1_1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_m_1_3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_m_1_5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_m_1_7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_m_3_1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_m_3_3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_m_3_5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_m_3_7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_m_5_1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_m_5_3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_m_5_5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_m_5_7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_m_7_1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_m_7_3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_m_7_5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_m_7_7 __attribute__ ((aligned(16))); - -NOCYGWIN_STATIC __m128i xmm0 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i xmm1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i xmm2 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i xmm3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i xmm4 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i xmm5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i xmm6 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i xmm7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i xmm8 __attribute__ ((aligned(16))); - -NOCYGWIN_STATIC __m128i 
y0r __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0i __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y1r __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y1i __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y2r __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y2i __attribute__ ((aligned(16))); - -NOCYGWIN_STATIC __m128i logmax_num_re0 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i logmax_num_im0 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i logmax_den_re0 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i logmax_den_im0 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i logmax_num_re1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i logmax_num_im1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i logmax_den_re1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i logmax_den_im1 __attribute__ ((aligned(16))); - -NOCYGWIN_STATIC __m128i tmp_result __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i tmp_result2 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i tmp_result3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i tmp_result4 __attribute__ ((aligned(16))); //============================================================================================== // Auxiliary Makros @@ -625,6 +621,10 @@ NOCYGWIN_STATIC __m128i tmp_result4 __attribute__ ((aligned(16))); // calculates a_sq = int_ch_mag*(a_r^2 + a_i^2)*scale_factor for 64-QAM #define square_a_64qam_epi16(a_r,a_i,int_ch_mag,scale_factor,a_sq) tmp_result = _mm_mulhi_epi16(a_r,a_r); tmp_result = _mm_slli_epi16(tmp_result,1); tmp_result = _mm_mulhi_epi16(tmp_result,scale_factor); tmp_result = _mm_slli_epi16(tmp_result,3); tmp_result = _mm_mulhi_epi16(tmp_result,int_ch_mag); tmp_result = _mm_slli_epi16(tmp_result,1); tmp_result2 = _mm_mulhi_epi16(a_i,a_i); tmp_result2 = _mm_slli_epi16(tmp_result2,1); tmp_result2 = _mm_mulhi_epi16(tmp_result2,scale_factor); tmp_result2 = _mm_slli_epi16(tmp_result2,3); tmp_result2 = _mm_mulhi_epi16(tmp_result2,int_ch_mag); tmp_result2 = _mm_slli_epi16(tmp_result2,1); a_sq = _mm_adds_epi16(tmp_result,tmp_result2); +#elif defined(__arm__) + +#endif + //============================================================================================== // SINGLE-STREAM //============================================================================================== @@ -634,16 +634,16 @@ NOCYGWIN_STATIC __m128i tmp_result4 __attribute__ ((aligned(16))); //---------------------------------------------------------------------------------------------- int dlsch_qpsk_llr(LTE_DL_FRAME_PARMS *frame_parms, - int **rxdataF_comp, - short *dlsch_llr, - unsigned char symbol, + int32_t **rxdataF_comp, + int16_t *dlsch_llr, + uint8_t symbol, uint8_t first_symbol_flag, uint16_t nb_rb, uint16_t pbch_pss_sss_adjust, - short **llr32p) + int16_t **llr32p) { - uint32_t *rxF = (uint32_t*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; + uint32_t *rxF = (uint32_t*)&rxdataF_comp[0][((int32_t)symbol*frame_parms->N_RB_DL*12)]; uint32_t *llr32; int i,len; uint8_t symbol_mod = (symbol >= (7-frame_parms->Ncp))? 
(symbol-(7-frame_parms->Ncp)) : symbol; @@ -669,17 +669,14 @@ int dlsch_qpsk_llr(LTE_DL_FRAME_PARMS *frame_parms, len = (nb_rb*12);// - pbch_pss_sss_adjust; } - // printf("dlsch_qpsk_llr: symbol %d,nb_rb %d, len %d,pbch_pss_sss_adjust %d\n",symbol,nb_rb,len,pbch_pss_sss_adjust); +// printf("dlsch_qpsk_llr: symbol %d,nb_rb %d, len %d,pbch_pss_sss_adjust %d\n",symbol,nb_rb,len,pbch_pss_sss_adjust); for (i=0; i<len; i++) { *llr32 = *rxF; rxF++; llr32++; } - *llr32p = (short *)llr32; - - _mm_empty(); - _m_empty(); + *llr32p = (int16_t *)llr32; return(0); } @@ -689,34 +686,54 @@ int dlsch_qpsk_llr(LTE_DL_FRAME_PARMS *frame_parms, //---------------------------------------------------------------------------------------------- void dlsch_16qam_llr(LTE_DL_FRAME_PARMS *frame_parms, - int **rxdataF_comp, - short *dlsch_llr, - int **dl_ch_mag, - unsigned char symbol, + int32_t **rxdataF_comp, + int16_t *dlsch_llr, + int32_t **dl_ch_mag, + uint8_t symbol, uint8_t first_symbol_flag, - unsigned short nb_rb, + uint16_t nb_rb, uint16_t pbch_pss_sss_adjust, int16_t **llr32p) { +#if defined(__x86_64__) || defined(__i386__) __m128i *rxF = (__m128i*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; __m128i *ch_mag; __m128i llr128[2]; + uint32_t *llr32; +#elif defined(__arm__) + int16x8_t *rxF = (int16x8_t*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; + int16x8_t *ch_mag; + int16x8_t xmm0; + int16_t *llr16; +#endif + + int i,len; unsigned char symbol_mod,len_mod4=0; - uint32_t *llr32; +#if defined(__x86_64__) || defined(__i386__) if (first_symbol_flag==1) { llr32 = (uint32_t*)dlsch_llr; } else { llr32 = (uint32_t*)*llr32p; } +#elif defined(__arm__) + if (first_symbol_flag==1) { + llr16 = (int16_t*)dlsch_llr; + } else { + llr16 = (int16_t*)*llr32p; + } +#endif symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? 
symbol-(7-frame_parms->Ncp) : symbol; +#if defined(__x86_64__) || defined(__i386__) ch_mag = (__m128i*)&dl_ch_mag[0][(symbol*frame_parms->N_RB_DL*12)]; - +#elif defined(__arm__) + ch_mag = (int16x8_t*)&dl_ch_mag[0][(symbol*frame_parms->N_RB_DL*12)]; +#endif if ((symbol_mod==0) || (symbol_mod==(4-frame_parms->Ncp))) { if (frame_parms->mode1_flag==0) len = nb_rb*8 - (2*pbch_pss_sss_adjust/3); @@ -738,6 +755,7 @@ void dlsch_16qam_llr(LTE_DL_FRAME_PARMS *frame_parms, for (i=0; i<len; i++) { +#if defined(__x86_64__) || defined(__i386__) xmm0 = _mm_abs_epi16(rxF[i]); xmm0 = _mm_subs_epi16(ch_mag[i],xmm0); @@ -753,10 +771,36 @@ void dlsch_16qam_llr(LTE_DL_FRAME_PARMS *frame_parms, llr32[6] = _mm_extract_epi32(llr128[1],2); //((uint32_t *)&llr128[1])[2]; llr32[7] = _mm_extract_epi32(llr128[1],3); //((uint32_t *)&llr128[1])[3]; llr32+=8; +#elif defined(__arm__) + xmm0 = vabsq_s16(rxF[i]); + xmm0 = vqsubq_s16(ch_mag[i],xmm0); + // lambda_1=y_R, lambda_2=|y_R|-|h|^2, lambda_3=y_I, lambda_4=|y_I|-|h|^2 + + llr16[0] = vgetq_lane_s16(rxF[i],0); + llr16[1] = vgetq_lane_s16(rxF[i],1); + llr16[2] = vgetq_lane_s16(xmm0,0); + llr16[3] = vgetq_lane_s16(xmm0,1); + llr16[4] = vgetq_lane_s16(rxF[i],2); + llr16[5] = vgetq_lane_s16(rxF[i],3); + llr16[6] = vgetq_lane_s16(xmm0,2); + llr16[7] = vgetq_lane_s16(xmm0,3); + llr16[8] = vgetq_lane_s16(rxF[i],4); + llr16[9] = vgetq_lane_s16(rxF[i],5); + llr16[10] = vgetq_lane_s16(xmm0,4); + llr16[11] = vgetq_lane_s16(xmm0,5); + llr16[12] = vgetq_lane_s16(rxF[i],6); + llr16[13] = vgetq_lane_s16(rxF[i],7); + llr16[14] = vgetq_lane_s16(xmm0,6); + llr16[15] = vgetq_lane_s16(xmm0,7); + llr16+=16; +#endif + } +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); +#endif } //---------------------------------------------------------------------------------------------- @@ -764,19 +808,23 @@ void dlsch_16qam_llr(LTE_DL_FRAME_PARMS *frame_parms, //---------------------------------------------------------------------------------------------- void dlsch_64qam_llr(LTE_DL_FRAME_PARMS *frame_parms, - int **rxdataF_comp, - short *dlsch_llr, - int **dl_ch_mag, - int **dl_ch_magb, - unsigned char symbol, + int32_t **rxdataF_comp, + int16_t *dlsch_llr, + int32_t **dl_ch_mag, + int32_t **dl_ch_magb, + uint8_t symbol, uint8_t first_symbol_flag, - unsigned short nb_rb, + uint16_t nb_rb, uint16_t pbch_pss_sss_adjust, - short **llr_save) + int16_t **llr_save) { - +#if defined(__x86_64__) || defined(__i386__) __m128i *rxF = (__m128i*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; __m128i *ch_mag,*ch_magb; +#elif defined(__arm__) + int16x8_t *rxF = (int16x8_t*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; + int16x8_t *ch_mag,*ch_magb,xmm1,xmm2; +#endif int i,len,len2; unsigned char symbol_mod,len_mod4; short *llr; @@ -789,9 +837,13 @@ void dlsch_64qam_llr(LTE_DL_FRAME_PARMS *frame_parms, symbol_mod = (symbol>=(7-frame_parms->Ncp)) ?
symbol-(7-frame_parms->Ncp) : symbol; +#if defined(__x86_64__) || defined(__i386__) ch_mag = (__m128i*)&dl_ch_mag[0][(symbol*frame_parms->N_RB_DL*12)]; ch_magb = (__m128i*)&dl_ch_magb[0][(symbol*frame_parms->N_RB_DL*12)]; - +#elif defined(__arm__) + ch_mag = (int16x8_t*)&dl_ch_mag[0][(symbol*frame_parms->N_RB_DL*12)]; + ch_magb = (int16x8_t*)&dl_ch_magb[0][(symbol*frame_parms->N_RB_DL*12)]; +#endif if ((symbol_mod==0) || (symbol_mod==(4-frame_parms->Ncp))) { if (frame_parms->mode1_flag==0) len = nb_rb*8 - (2*pbch_pss_sss_adjust/3); @@ -810,62 +862,96 @@ void dlsch_64qam_llr(LTE_DL_FRAME_PARMS *frame_parms, for (i=0; i<len2; i++) { +#if defined(__x86_64__) || defined(__i386__) xmm1 = _mm_abs_epi16(rxF[i]); xmm1 = _mm_subs_epi16(ch_mag[i],xmm1); xmm2 = _mm_abs_epi16(xmm1); xmm2 = _mm_subs_epi16(ch_magb[i],xmm2); - +#elif defined(__arm__) + xmm1 = vabsq_s16(rxF[i]); + xmm1 = vsubq_s16(ch_mag[i],xmm1); + xmm2 = vabsq_s16(xmm1); + xmm2 = vsubq_s16(ch_magb[i],xmm2); +#endif // loop over all LLRs in quad word (24 coded bits) /* - for (j=0;j<8;j+=2) { - llr2[0] = ((short *)&rxF[i])[j]; - llr2[1] = ((short *)&rxF[i])[j+1]; - llr2[2] = ((short *)&xmm1)[j]; - llr2[3] = ((short *)&xmm1)[j+1]; - llr2[4] = ((short *)&xmm2)[j]; - llr2[5] = ((short *)&xmm2)[j+1]; - - llr2+=6; - } + for (j=0;j<8;j+=2) { + llr2[0] = ((short *)&rxF[i])[j]; + llr2[1] = ((short *)&rxF[i])[j+1]; + llr2[2] = ((short *)&xmm1)[j]; + llr2[3] = ((short *)&xmm1)[j+1]; + llr2[4] = ((short *)&xmm2)[j]; + llr2[5] = ((short *)&xmm2)[j+1]; + + llr2+=6; + } */ llr2[0] = ((short *)&rxF[i])[0]; llr2[1] = ((short *)&rxF[i])[1]; +#if defined(__x86_64__) || defined(__i386__) llr2[2] = _mm_extract_epi16(xmm1,0); llr2[3] = _mm_extract_epi16(xmm1,1);//((short *)&xmm1)[j+1]; llr2[4] = _mm_extract_epi16(xmm2,0);//((short *)&xmm2)[j]; llr2[5] = _mm_extract_epi16(xmm2,1);//((short *)&xmm2)[j+1]; +#elif defined(__arm__) + llr2[2] = vgetq_lane_s16(xmm1,0); + llr2[3] = vgetq_lane_s16(xmm1,1);//((short *)&xmm1)[j+1]; + llr2[4] = vgetq_lane_s16(xmm2,0);//((short *)&xmm2)[j]; + llr2[5] = vgetq_lane_s16(xmm2,1);//((short *)&xmm2)[j+1]; +#endif llr2+=6; llr2[0] = ((short *)&rxF[i])[2]; llr2[1] = ((short *)&rxF[i])[3]; +#if defined(__x86_64__) || defined(__i386__) llr2[2] = _mm_extract_epi16(xmm1,2); llr2[3] = _mm_extract_epi16(xmm1,3);//((short *)&xmm1)[j+1]; llr2[4] = _mm_extract_epi16(xmm2,2);//((short *)&xmm2)[j]; llr2[5] = _mm_extract_epi16(xmm2,3);//((short *)&xmm2)[j+1]; +#elif defined(__arm__) + llr2[2] = vgetq_lane_s16(xmm1,2); + llr2[3] = vgetq_lane_s16(xmm1,3);//((short *)&xmm1)[j+1]; + llr2[4] = vgetq_lane_s16(xmm2,2);//((short *)&xmm2)[j]; + llr2[5] = vgetq_lane_s16(xmm2,3);//((short *)&xmm2)[j+1]; +#endif llr2+=6; llr2[0] = ((short *)&rxF[i])[4]; llr2[1] = ((short *)&rxF[i])[5]; +#if defined(__x86_64__) || defined(__i386__) llr2[2] = _mm_extract_epi16(xmm1,4); llr2[3] = _mm_extract_epi16(xmm1,5);//((short *)&xmm1)[j+1]; llr2[4] = _mm_extract_epi16(xmm2,4);//((short *)&xmm2)[j]; llr2[5] = _mm_extract_epi16(xmm2,5);//((short *)&xmm2)[j+1]; - +#elif defined(__arm__) + llr2[2] = vgetq_lane_s16(xmm1,4); + llr2[3] = vgetq_lane_s16(xmm1,5);//((short *)&xmm1)[j+1]; + llr2[4] = vgetq_lane_s16(xmm2,4);//((short *)&xmm2)[j]; + llr2[5] = vgetq_lane_s16(xmm2,5);//((short *)&xmm2)[j+1]; +#endif llr2+=6; llr2[0] = ((short *)&rxF[i])[6]; llr2[1] = ((short *)&rxF[i])[7]; +#if defined(__x86_64__) || defined(__i386__) llr2[2] = _mm_extract_epi16(xmm1,6); llr2[3] = _mm_extract_epi16(xmm1,7);//((short *)&xmm1)[j+1]; llr2[4] = _mm_extract_epi16(xmm2,6);//((short 
*)&xmm2)[j]; llr2[5] = _mm_extract_epi16(xmm2,7);//((short *)&xmm2)[j+1]; - +#elif defined(__arm__) + llr2[2] = vgetq_lane_s16(xmm1,6); + llr2[3] = vgetq_lane_s16(xmm1,7);//((short *)&xmm1)[j+1]; + llr2[4] = vgetq_lane_s16(xmm2,6);//((short *)&xmm2)[j]; + llr2[5] = vgetq_lane_s16(xmm2,7);//((short *)&xmm2)[j+1]; +#endif llr2+=6; } *llr_save = llr; +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); +#endif } @@ -877,20 +963,22 @@ void dlsch_64qam_llr(LTE_DL_FRAME_PARMS *frame_parms, // QPSK //---------------------------------------------------------------------------------------------- -NOCYGWIN_STATIC __m128i y0r_over2 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0i_over2 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y1r_over2 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y1i_over2 __attribute__ ((aligned(16))); - -NOCYGWIN_STATIC __m128i A __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i B __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i C __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i D __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i E __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i F __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i G __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i H __attribute__ ((aligned(16))); +#if defined(__x86_64__) || defined(__i386) +__m128i y0r_over2 __attribute__ ((aligned(16))); +__m128i y0i_over2 __attribute__ ((aligned(16))); +__m128i y1r_over2 __attribute__ ((aligned(16))); +__m128i y1i_over2 __attribute__ ((aligned(16))); + +__m128i A __attribute__ ((aligned(16))); +__m128i B __attribute__ ((aligned(16))); +__m128i C __attribute__ ((aligned(16))); +__m128i D __attribute__ ((aligned(16))); +__m128i E __attribute__ ((aligned(16))); +__m128i F __attribute__ ((aligned(16))); +__m128i G __attribute__ ((aligned(16))); +__m128i H __attribute__ ((aligned(16))); +#endif int dlsch_qpsk_qpsk_llr(LTE_DL_FRAME_PARMS *frame_parms, int **rxdataF_comp, @@ -948,47 +1036,53 @@ int dlsch_qpsk_qpsk_llr(LTE_DL_FRAME_PARMS *frame_parms, return(0); } -NOCYGWIN_STATIC __m128i ONE_OVER_SQRT_8 __attribute__((aligned(16))); +//__m128i ONE_OVER_SQRT_8 __attribute__((aligned(16))); void qpsk_qpsk(short *stream0_in, short *stream1_in, short *stream0_out, short *rho01, int length - ) + ) { /* - This function computes the LLRs of stream 0 (s_0) in presence of the interfering stream 1 (s_1) assuming that both symbols are QPSK. It can be used for both MU-MIMO interference-aware receiver or for SU-MIMO receivers. - - Parameters: - stream0_in = Matched filter output y0' = (h0*g0)*y0 - stream1_in = Matched filter output y1' = (h0*g1)*y0 - stream0_out = LLRs - rho01 = Correlation between the two effective channels \rho_{10} = (h1*g1)*(h0*g0) - length = number of resource elements + This function computes the LLRs of stream 0 (s_0) in presence of the interfering stream 1 (s_1) assuming that both symbols are QPSK. It can be used for both MU-MIMO interference-aware receiver or for SU-MIMO receivers. 
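Before the parameter list resumes: the loop body below divides by sqrt(8) with a single _mm_mulhi_epi16, which keeps only the high 16 bits of each 32-bit product, so a Q1.16 reciprocal constant needs no follow-up shift. A standalone arithmetic check of the 23170 constant (plain C; only the constant itself is taken from the patch):

#include <math.h>
#include <stdio.h>

int main(void)
{
  /* mulhi(x,c) computes (x*c)>>16, so choosing c = round(2^16/sqrt(8))
     turns the multiply into an approximate division by sqrt(8) */
  printf("%ld\n", lround(65536.0 / sqrt(8.0)));             /* prints 23170 */
  int x = 1000;
  printf("%d vs %.2f\n", (x * 23170) >> 16, x / sqrt(8.0)); /* 353 vs 353.55 */
  return 0;
}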
+ + Parameters: + stream0_in = Matched filter output y0' = (h0*g0)*y0 + stream1_in = Matched filter output y1' = (h0*g1)*y0 + stream0_out = LLRs + rho01 = Correlation between the two effective channels \rho_{10} = (h1*g1)*(h0*g0) + length = number of resource elements */ +#if defined(__x86_64__) || defined(__i386__) __m128i *rho01_128i = (__m128i *)rho01; __m128i *stream0_128i_in = (__m128i *)stream0_in; __m128i *stream1_128i_in = (__m128i *)stream1_in; __m128i *stream0_128i_out = (__m128i *)stream0_out; - -#ifdef DEBUG_LLR - print_shorts2("rho01_128i:\n",rho01_128i); + __m128i ONE_OVER_SQRT_8 = _mm_set1_epi16(23170); //round(2^16/sqrt(8)) +#elif defined(__arm__) + int16x8_t *rho01_128i = (int16x8_t *)rho01; + int16x8_t *stream0_128i_in = (int16x8_t *)stream0_in; + int16x8_t *stream1_128i_in = (int16x8_t *)stream1_in; + int16x8_t *stream0_128i_out = (int16x8_t *)stream0_out; + int16x8_t ONE_OVER_SQRT_8 = vdupq_n_s16(23170); //round(2^16/sqrt(8)) #endif int i; - ONE_OVER_SQRT_8 = _mm_set1_epi16(23170); //round(2^16/sqrt(8)) + for (i=0; i<length>>2; i+=2) { // in each iteration, we take 8 complex samples - +#if defined(__x86_64__) || defined(__i386__) xmm0 = rho01_128i[i]; // 4 symbols xmm1 = rho01_128i[i+1]; // put (rho_r + rho_i)/2sqrt2 in rho_rpi // put (rho_r - rho_i)/2sqrt2 in rho_rmi + xmm0 = _mm_shufflelo_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3)); xmm0 = _mm_shufflehi_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3)); xmm0 = _mm_shuffle_epi32(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3)); @@ -1005,10 +1099,14 @@ void qpsk_qpsk(short *stream0_in, // divide by sqrt(8), no shift needed ONE_OVER_SQRT_8 = Q1.16 rho_rpi = _mm_mulhi_epi16(rho_rpi,ONE_OVER_SQRT_8); rho_rmi = _mm_mulhi_epi16(rho_rmi,ONE_OVER_SQRT_8); +#elif defined(__arm__) + +#endif // Compute LLR for first bit of stream 0 // Compute real and imaginary parts of MF output for stream 0 +#if defined(__x86_64__) || defined(__i386__) xmm0 = stream0_128i_in[i]; xmm1 = stream0_128i_in[i+1]; @@ -1025,8 +1123,12 @@ void qpsk_qpsk(short *stream0_in, y0r_over2 = _mm_srai_epi16(y0r,1); // divide by 2 y0i_over2 = _mm_srai_epi16(y0i,1); // divide by 2 +#elif defined(__arm__) + +#endif // Compute real and imaginary parts of MF output for stream 1 +#if defined(__x86_64__) || defined(__i386__) xmm0 = stream1_128i_in[i]; xmm1 = stream1_128i_in[i+1]; @@ -1116,23 +1218,28 @@ void qpsk_qpsk(short *stream0_in, if (i<((length>>1) - 1)) // false if only 2 REs remain _mm_storeu_si128(&stream0_128i_out[i+1],_mm_unpackhi_epi16(y0r,y0i)); +#elif defined(__arm__) + +#endif } +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); +#endif } int dlsch_qpsk_16qam_llr(LTE_DL_FRAME_PARMS *frame_parms, - int **rxdataF_comp, - int **rxdataF_comp_i, - int **dl_ch_mag_i, //|h_1|^2*(2/sqrt{10}) - int **rho_i, - short *dlsch_llr, - unsigned char symbol, - unsigned char first_symbol_flag, - unsigned short nb_rb, + int32_t **rxdataF_comp, + int32_t **rxdataF_comp_i, + int32_t **dl_ch_mag_i, //|h_1|^2*(2/sqrt{10}) + int32_t **rho_i, + int16_t *dlsch_llr, + uint8_t symbol, + uint8_t first_symbol_flag, + uint16_t nb_rb, uint16_t pbch_pss_sss_adjust, - short **llr16p) + int16_t **llr16p) { int16_t *rxF=(int16_t*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; @@ -1180,38 +1287,58 @@ int dlsch_qpsk_16qam_llr(LTE_DL_FRAME_PARMS *frame_parms, return(0); } -NOCYGWIN_STATIC __m128i ONE_OVER_SQRT_2 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i ONE_OVER_SQRT_10 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i THREE_OVER_SQRT_10 __attribute__((aligned(16)));
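The deletions continuing below remove the Q15/Q16 constant globals used by the 16-QAM detectors; the same literals reappear further down as function-local constants. Their values can be re-derived as a sanity check (a standalone sketch; the expected outputs are the literals and comments from the patch):

#include <math.h>
#include <stdio.h>

int main(void)
{
  printf("%ld\n", lround(32768.0 / sqrt(2.0)));        /* 23170, ONE_OVER_SQRT_2 */
  printf("%ld\n", lround(32768.0 / sqrt(10.0)));       /* 10362, ONE_OVER_SQRT_10_Q15 */
  printf("%ld\n", lround(3.0 * 32768.0 / sqrt(10.0))); /* 31086, THREE_OVER_SQRT_10 */
  printf("%ld\n", lround(sqrt(10.0) * 32768.0 / 4.0)); /* 25905, SQRT_10_OVER_FOUR */
  return 0;
}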
-NOCYGWIN_STATIC __m128i ONE_OVER_SQRT_10_Q15 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i SQRT_10_OVER_FOUR __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i ch_mag_int __attribute__((aligned(16))); - -void qpsk_qam16(short *stream0_in, - short *stream1_in, - short *ch_mag_i, - short *stream0_out, - short *rho01, - int length - ) +/* +#if defined(__x86_64__) || defined(__i386__) +__m128i ONE_OVER_SQRT_2 __attribute__((aligned(16))); +__m128i ONE_OVER_SQRT_10 __attribute__((aligned(16))); +__m128i THREE_OVER_SQRT_10 __attribute__((aligned(16))); +__m128i ONE_OVER_SQRT_10_Q15 __attribute__((aligned(16))); +__m128i SQRT_10_OVER_FOUR __attribute__((aligned(16))); +__m128i ch_mag_int; +#endif +*/ +void qpsk_qam16(int16_t *stream0_in, + int16_t *stream1_in, + int16_t *ch_mag_i, + int16_t *stream0_out, + int16_t *rho01, + int32_t length + ) { - /* - This function computes the LLRs of stream 0 (s_0) in presence of the interfering stream 1 (s_1) assuming that both symbols are QPSK. It can be used for both MU-MIMO interference-aware receiver or for SU-MIMO receivers. - - Parameters: - stream0_in = Matched filter output y0' = (h0*g0)*y0 - stream1_in = Matched filter output y1' = (h0*g1)*y0 - stream0_out = LLRs - rho01 = Correlation between the two effective channels \rho_{10} = (h1*g1)*(h0*g0) - length = number of resource elements + This function computes the LLRs of stream 0 (s_0) in presence of the interfering stream 1 (s_1) assuming that both symbols are QPSK. It can be used for both MU-MIMO interference-aware receiver or for SU-MIMO receivers. + + Parameters: + stream0_in = Matched filter output y0' = (h0*g0)*y0 + stream1_in = Matched filter output y1' = (h0*g1)*y0 + stream0_out = LLRs + rho01 = Correlation between the two effective channels \rho_{10} = (h1*g1)*(h0*g0) + length = number of resource elements */ +#if defined(__x86_64__) || defined(__i386__) __m128i *rho01_128i = (__m128i *)rho01; __m128i *stream0_128i_in = (__m128i *)stream0_in; __m128i *stream1_128i_in = (__m128i *)stream1_in; __m128i *stream0_128i_out = (__m128i *)stream0_out; __m128i *ch_mag_128i_i = (__m128i *)ch_mag_i; + __m128i ONE_OVER_SQRT_2 = _mm_set1_epi16(23170); // round(1/sqrt(2)*2^15) + __m128i ONE_OVER_SQRT_10_Q15 = _mm_set1_epi16(10362); // round(1/sqrt(10)*2^15) + __m128i THREE_OVER_SQRT_10 = _mm_set1_epi16(31086); // round(3/sqrt(10)*2^15) + __m128i SQRT_10_OVER_FOUR = _mm_set1_epi16(25905); // round(sqrt(10)/4*2^15) + __m128i ch_mag_int __attribute__((aligned(16))); +#elif defined(__arm__) + int16x8_t *rho01_128i = (int16x8_t *)rho01; + int16x8_t *stream0_128i_in = (int16x8_t *)stream0_in; + int16x8_t *stream1_128i_in = (int16x8_t *)stream1_in; + int16x8_t *stream0_128i_out = (int16x8_t *)stream0_out; + int16x8_t *ch_mag_128i_i = (int16x8_t *)ch_mag_i; + int16x8_t ONE_OVER_SQRT_2 = vdupq_n_s16(23170); // round(1/sqrt(2)*2^15) + int16x8_t ONE_OVER_SQRT_10_Q15 = vdupq_n_s16(10362); // round(1/sqrt(10)*2^15) + int16x8_t THREE_OVER_SQRT_10 = vdupq_n_s16(31086); // round(3/sqrt(10)*2^15) + int16x8_t SQRT_10_OVER_FOUR = vdupq_n_s16(25905); // round(sqrt(10)/4*2^15) + int16x8_t ch_mag_int __attribute__((aligned(16))); +#endif #ifdef DEBUG_LLR print_shorts2("rho01_128i:\n",rho01_128i); @@ -1219,14 +1346,12 @@ void qpsk_qam16(short *stream0_in, int i; - ONE_OVER_SQRT_2 = _mm_set1_epi16(23170); // round(1/sqrt(2)*2^15) - ONE_OVER_SQRT_10_Q15 = _mm_set1_epi16(10362); // round(1/sqrt(10)*2^15) - THREE_OVER_SQRT_10 = _mm_set1_epi16(31086); // round(3/sqrt(10)*2^15) - SQRT_10_OVER_FOUR = _mm_set1_epi16(25905); 
// round(sqrt(10)/4*2^15) for (i=0; i<length>>2; i+=2) { // in each iteration, we take 8 complex samples +#if defined(__x86_64__) || defined(__i386__) + xmm0 = rho01_128i[i]; // 4 symbols xmm1 = rho01_128i[i+1]; @@ -1377,23 +1502,28 @@ void qpsk_qam16(short *stream0_in, if (i<((length>>1) - 1)) // false if only 2 REs remain stream0_128i_out[i+1] = _mm_unpackhi_epi16(y0r,y0i); +#elif defined(__arm__) + +#endif } +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); +#endif } int dlsch_qpsk_64qam_llr(LTE_DL_FRAME_PARMS *frame_parms, - int **rxdataF_comp, - int **rxdataF_comp_i, - int **dl_ch_mag_i, //|h_1|^2*(2/sqrt{10}) - int **rho_i, - short *dlsch_llr, - unsigned char symbol, - unsigned char first_symbol_flag, - unsigned short nb_rb, + int32_t **rxdataF_comp, + int32_t **rxdataF_comp_i, + int32_t **dl_ch_mag_i, //|h_1|^2*(2/sqrt{10}) + int32_t **rho_i, + int16_t *dlsch_llr, + uint8_t symbol, + uint8_t first_symbol_flag, + uint16_t nb_rb, uint16_t pbch_pss_sss_adjust, - short **llr16p) + int16_t **llr16p) { int16_t *rxF=(int16_t*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; @@ -1440,42 +1570,56 @@ int dlsch_qpsk_64qam_llr(LTE_DL_FRAME_PARMS *frame_parms, return(0); } - -NOCYGWIN_STATIC __m128i ONE_OVER_SQRT_2_42 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i THREE_OVER_SQRT_2_42 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i FIVE_OVER_SQRT_2_42 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i SEVEN_OVER_SQRT_2_42 __attribute__((aligned(16))); - -NOCYGWIN_STATIC __m128i ch_mag_int_with_sigma2 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i two_ch_mag_int_with_sigma2 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i three_ch_mag_int_with_sigma2 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i SQRT_42_OVER_FOUR __attribute__((aligned(16))); - +/* +__m128i ONE_OVER_SQRT_2_42 __attribute__((aligned(16))); +__m128i THREE_OVER_SQRT_2_42 __attribute__((aligned(16))); +__m128i FIVE_OVER_SQRT_2_42 __attribute__((aligned(16))); +__m128i SEVEN_OVER_SQRT_2_42 __attribute__((aligned(16))); + +__m128i ch_mag_int_with_sigma2 __attribute__((aligned(16))); +__m128i two_ch_mag_int_with_sigma2 __attribute__((aligned(16))); +__m128i three_ch_mag_int_with_sigma2 __attribute__((aligned(16))); +__m128i SQRT_42_OVER_FOUR __attribute__((aligned(16))); +*/ void qpsk_qam64(short *stream0_in, short *stream1_in, short *ch_mag_i, short *stream0_out, short *rho01, int length - ) + ) { /* - This function computes the LLRs of stream 0 (s_0) in presence of the interfering stream 1 (s_1) assuming that both symbols are QPSK. It can be used for both MU-MIMO interference-aware receiver or for SU-MIMO receivers. - - Parameters: - stream0_in = Matched filter output y0' = (h0*g0)*y0 - stream1_in = Matched filter output y1' = (h0*g1)*y0 - stream0_out = LLRs - rho01 = Correlation between the two effective channels \rho_{10} = (h1*g1)*(h0*g0) - length = number of resource elements + This function computes the LLRs of stream 0 (s_0) in presence of the interfering stream 1 (s_1) assuming that both symbols are QPSK. It can be used for both MU-MIMO interference-aware receiver or for SU-MIMO receivers. 
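A side note before the parameter list: the ARM branch of this function is still an empty stub in this revision. For reference, the SSE operations its x86 body relies on map onto NEON as vabsq_s16 for _mm_abs_epi16, vqaddq_s16/vqsubq_s16 for the saturating add and subtract, vdupq_n_s16 for _mm_set1_epi16, vgetq_lane_s16 for _mm_extract_epi16, and vshrq_n_s16 for _mm_srai_epi16. Only _mm_mulhi_epi16 lacks a one-instruction counterpart; a plausible emulation (an assumption, not code from the patch):

#include <arm_neon.h>

/* vqdmulhq_s16 returns the saturated high half of 2*a*b, i.e. (2*a*b)>>16,
   so one extra halving shift yields (a*b)>>16 like _mm_mulhi_epi16 does;
   the result differs only at the INT16_MIN*INT16_MIN saturation edge case */
static inline int16x8_t mulhi_s16(int16x8_t a, int16x8_t b)
{
  return vshrq_n_s16(vqdmulhq_s16(a, b), 1);
}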
+ + Parameters: + stream0_in = Matched filter output y0' = (h0*g0)*y0 + stream1_in = Matched filter output y1' = (h0*g1)*y0 + stream0_out = LLRs + rho01 = Correlation between the two effective channels \rho_{10} = (h1*g1)*(h0*g0) + length = number of resource elements */ +#if defined(__x86_64__) || defined(__i386__) __m128i *rho01_128i = (__m128i *)rho01; __m128i *stream0_128i_in = (__m128i *)stream0_in; __m128i *stream1_128i_in = (__m128i *)stream1_in; __m128i *stream0_128i_out = (__m128i *)stream0_out; __m128i *ch_mag_128i_i = (__m128i *)ch_mag_i; + __m128i ONE_OVER_SQRT_2 = _mm_set1_epi16(23170); // round(1/sqrt(2)*2^15) + __m128i ONE_OVER_SQRT_2_42 = _mm_set1_epi16(3575); // round(1/sqrt(2*42)*2^15) + __m128i THREE_OVER_SQRT_2_42 = _mm_set1_epi16(10726); // round(3/sqrt(2*42)*2^15) + __m128i FIVE_OVER_SQRT_2_42 = _mm_set1_epi16(17876); // round(5/sqrt(2*42)*2^15) + __m128i SEVEN_OVER_SQRT_2_42 = _mm_set1_epi16(25027); // round(7/sqrt(2*42)*2^15) + __m128i SQRT_42_OVER_FOUR = _mm_set1_epi16(13272); // round(sqrt(42)/4*2^13), Q3.1 + __m128i ch_mag_int; + __m128i ch_mag_int_with_sigma2; + __m128i two_ch_mag_int_with_sigma2; + __m128i three_ch_mag_int_with_sigma2; +#elif defined(__arm__) + +#endif #ifdef DEBUG_LLR print_shorts2("rho01_128i:\n",rho01_128i); @@ -1483,16 +1627,12 @@ void qpsk_qam64(short *stream0_in, int i; - ONE_OVER_SQRT_2 = _mm_set1_epi16(23170); // round(1/sqrt(2)*2^15) - ONE_OVER_SQRT_2_42 = _mm_set1_epi16(3575); // round(1/sqrt(2*42)*2^15) - THREE_OVER_SQRT_2_42 = _mm_set1_epi16(10726); // round(3/sqrt(2*42)*2^15) - FIVE_OVER_SQRT_2_42 = _mm_set1_epi16(17876); // round(5/sqrt(2*42)*2^15) - SEVEN_OVER_SQRT_2_42 = _mm_set1_epi16(25027); // round(7/sqrt(2*42)*2^15) - SQRT_42_OVER_FOUR = _mm_set1_epi16(13272); // round(sqrt(42)/4*2^13), Q3.12 for (i=0; i<length>>2; i+=2) { // in each iteration, we take 8 complex samples +#if defined(__x86_64__) || defined(__i386__) + xmm0 = rho01_128i[i]; // 4 symbols xmm1 = rho01_128i[i+1]; @@ -1662,10 +1802,15 @@ void qpsk_qam64(short *stream0_in, if (i<((length>>1) - 1)) // false if only 2 REs remain stream0_128i_out[i+1] = _mm_unpackhi_epi16(y0r,y0i); +#elif defined(__arm__) + +#endif } +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); +#endif } @@ -1673,18 +1818,20 @@ void qpsk_qam64(short *stream0_in, // 16-QAM //---------------------------------------------------------------------------------------------- -NOCYGWIN_STATIC __m128i ONE_OVER_TWO_SQRT_10 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i NINE_OVER_TWO_SQRT_10 __attribute__((aligned(16))); +/* +__m128i ONE_OVER_TWO_SQRT_10 __attribute__((aligned(16))); +__m128i NINE_OVER_TWO_SQRT_10 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i y0r_over_sqrt10 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0i_over_sqrt10 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0r_three_over_sqrt10 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0i_three_over_sqrt10 __attribute__ ((aligned(16))); +__m128i y0r_over_sqrt10 __attribute__ ((aligned(16))); +__m128i y0i_over_sqrt10 __attribute__ ((aligned(16))); +__m128i y0r_three_over_sqrt10 __attribute__ ((aligned(16))); +__m128i y0i_three_over_sqrt10 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i ch_mag_des __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i ch_mag_over_10 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i ch_mag_over_2 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i ch_mag_9_over_10 __attribute__ ((aligned(16))); +__m128i ch_mag_des 
__attribute__((aligned(16))); +__m128i ch_mag_over_10 __attribute__ ((aligned(16))); +__m128i ch_mag_over_2 __attribute__ ((aligned(16))); +__m128i ch_mag_9_over_10 __attribute__ ((aligned(16))); +*/ void qam16_qpsk(short *stream0_in, short *stream1_in, @@ -1692,42 +1839,56 @@ void qam16_qpsk(short *stream0_in, short *stream0_out, short *rho01, int length - ) + ) { /* - Author: Sebastian Wagner - Date: 2012-06-04 - - Input: - stream0_in: MF filter for 1st stream, i.e., y0=h0'*y - stream!_in: MF filter for 2nd stream, i.e., y1=h1'*y - ch_mag: 2*h0/sqrt(00), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc - ch_mag_i: 2*h1/sqrt(00), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc - rho01: Channel cross correlation, i.e., h1'*h0 - - Output: - stream0_out: output LLRs for 1st stream + Author: Sebastian Wagner + Date: 2012-06-04 + + Input: + stream0_in: MF filter for 1st stream, i.e., y0=h0'*y + stream1_in: MF filter for 2nd stream, i.e., y1=h1'*y + ch_mag: 2*h0/sqrt(10), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc + ch_mag_i: 2*h1/sqrt(10), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc + rho01: Channel cross correlation, i.e., h1'*h0 + + Output: + stream0_out: output LLRs for 1st stream */ +#if defined(__x86_64__) || defined(__i386__) __m128i *rho01_128i = (__m128i *)rho01; __m128i *stream0_128i_in = (__m128i *)stream0_in; __m128i *stream1_128i_in = (__m128i *)stream1_in; __m128i *stream0_128i_out = (__m128i *)stream0_out; __m128i *ch_mag_128i = (__m128i *)ch_mag; + __m128i ONE_OVER_SQRT_2 = _mm_set1_epi16(23170); // round(1/sqrt(2)*2^15) + __m128i ONE_OVER_SQRT_10 = _mm_set1_epi16(20724); // round(1/sqrt(10)*2^16) + __m128i THREE_OVER_SQRT_10 = _mm_set1_epi16(31086); // round(3/sqrt(10)*2^15) + __m128i SQRT_10_OVER_FOUR = _mm_set1_epi16(25905); // round(sqrt(10)/4*2^15) + __m128i ONE_OVER_TWO_SQRT_10 = _mm_set1_epi16(10362); // round(1/2/sqrt(10)*2^16) + __m128i NINE_OVER_TWO_SQRT_10 = _mm_set1_epi16(23315); // round(9/2/sqrt(10)*2^14) + __m128i y0r_over_sqrt10; + __m128i y0i_over_sqrt10; + __m128i y0r_three_over_sqrt10; + __m128i y0i_three_over_sqrt10; + + __m128i ch_mag_des; + __m128i ch_mag_over_10; + __m128i ch_mag_over_2; + __m128i ch_mag_9_over_10; +#elif defined(__arm__) + +#endif int i; - ONE_OVER_SQRT_2 = _mm_set1_epi16(23170); // round(1/sqrt(2)*2^15) - ONE_OVER_SQRT_10 = _mm_set1_epi16(20724); // round(1/sqrt(10)*2^16) - THREE_OVER_SQRT_10 = _mm_set1_epi16(31086); // round(3/sqrt(10)*2^15) - SQRT_10_OVER_FOUR = _mm_set1_epi16(25905); // round(sqrt(10)/4*2^15) - ONE_OVER_TWO_SQRT_10 = _mm_set1_epi16(10362); // round(1/2/sqrt(10)*2^16) - NINE_OVER_TWO_SQRT_10 = _mm_set1_epi16(23315); // round(9/2/sqrt(10)*2^14) for (i=0; i<length>>2; i+=2) { // In one iteration, we deal with 8 REs +#if defined(__x86_64__) || defined(__i386__) // Get rho xmm0 = rho01_128i[i]; xmm1 = rho01_128i[i+1]; @@ -2107,23 +2268,30 @@ void qam16_qpsk(short *stream0_in, stream0_128i_out[2*i+1] = _mm_unpackhi_epi32(xmm0,xmm2); stream0_128i_out[2*i+2] = _mm_unpacklo_epi32(xmm1,xmm3); stream0_128i_out[2*i+3] = _mm_unpackhi_epi32(xmm1,xmm3); + +#elif defined(__arm__) + +#endif } +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); +#endif + } int dlsch_16qam_qpsk_llr(LTE_DL_FRAME_PARMS *frame_parms, - int **rxdataF_comp, - int **rxdataF_comp_i, - int **dl_ch_mag, //|h_0|^2*(2/sqrt{10}) - int **rho_i, - short *dlsch_llr, - unsigned char symbol, - unsigned char first_symbol_flag, - unsigned short nb_rb, + int32_t **rxdataF_comp, + int32_t **rxdataF_comp_i, + int32_t **dl_ch_mag, //|h_0|^2*(2/sqrt{10}) + 
int32_t **rho_i, + int16_t *dlsch_llr, + uint8_t symbol, + uint8_t first_symbol_flag, + uint16_t nb_rb, uint16_t pbch_pss_sss_adjust, - short **llr16p) + int16_t **llr16p) { int16_t *rxF = (int16_t*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; @@ -2182,23 +2350,24 @@ void qam16_qam16(short *stream0_in, short *stream0_out, short *rho01, int length - ) + ) { /* - Author: Sebastian Wagner - Date: 2012-06-04 - - Input: - stream0_in: MF filter for 1st stream, i.e., y0=h0'*y - stream!_in: MF filter for 2nd stream, i.e., y1=h1'*y - ch_mag: 2*h0/sqrt(00), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc - ch_mag_i: 2*h1/sqrt(00), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc - rho01: Channel cross correlation, i.e., h1'*h0 - - Output: - stream0_out: output LLRs for 1st stream + Author: Sebastian Wagner + Date: 2012-06-04 + + Input: + stream0_in: MF filter for 1st stream, i.e., y0=h0'*y + stream1_in: MF filter for 2nd stream, i.e., y1=h1'*y + ch_mag: 2*h0/sqrt(10), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc + ch_mag_i: 2*h1/sqrt(10), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc + rho01: Channel cross correlation, i.e., h1'*h0 + + Output: + stream0_out: output LLRs for 1st stream */ +#if defined(__x86_64__) || defined(__i386__) __m128i *rho01_128i = (__m128i *)rho01; __m128i *stream0_128i_in = (__m128i *)stream0_in; __m128i *stream1_128i_in = (__m128i *)stream1_in; @@ -2206,18 +2375,32 @@ void qam16_qam16(short *stream0_in, __m128i *ch_mag_128i = (__m128i *)ch_mag; __m128i *ch_mag_128i_i = (__m128i *)ch_mag_i; - int i; - ONE_OVER_SQRT_10 = _mm_set1_epi16(20724); // round(1/sqrt(10)*2^16) - ONE_OVER_SQRT_10_Q15 = _mm_set1_epi16(10362); // round(1/sqrt(10)*2^15) - THREE_OVER_SQRT_10 = _mm_set1_epi16(31086); // round(3/sqrt(10)*2^15) - SQRT_10_OVER_FOUR = _mm_set1_epi16(25905); // round(sqrt(10)/4*2^15) - ONE_OVER_TWO_SQRT_10 = _mm_set1_epi16(10362); // round(1/2/sqrt(10)*2^16) - NINE_OVER_TWO_SQRT_10 = _mm_set1_epi16(23315); // round(9/2/sqrt(10)*2^14) + + __m128i ONE_OVER_SQRT_10 = _mm_set1_epi16(20724); // round(1/sqrt(10)*2^16) + __m128i ONE_OVER_SQRT_10_Q15 = _mm_set1_epi16(10362); // round(1/sqrt(10)*2^15) + __m128i THREE_OVER_SQRT_10 = _mm_set1_epi16(31086); // round(3/sqrt(10)*2^15) + __m128i SQRT_10_OVER_FOUR = _mm_set1_epi16(25905); // round(sqrt(10)/4*2^15) + __m128i ONE_OVER_TWO_SQRT_10 = _mm_set1_epi16(10362); // round(1/2/sqrt(10)*2^16) + __m128i NINE_OVER_TWO_SQRT_10 = _mm_set1_epi16(23315); // round(9/2/sqrt(10)*2^14) + __m128i ch_mag_des,ch_mag_int; + __m128i y0r_over_sqrt10; + __m128i y0i_over_sqrt10; + __m128i y0r_three_over_sqrt10; + __m128i y0i_three_over_sqrt10; + __m128i ch_mag_over_10; + __m128i ch_mag_over_2; + __m128i ch_mag_9_over_10; +#elif defined(__arm__) + +#endif + + int i; for (i=0; i<length>>2; i+=2) { // In one iteration, we deal with 8 REs +#if defined(__x86_64__) || defined(__i386__) // Get rho xmm0 = rho01_128i[i]; xmm1 = rho01_128i[i+1]; @@ -2642,24 +2825,30 @@ void qam16_qam16(short *stream0_in, stream0_128i_out[2*i+1] = _mm_unpackhi_epi32(xmm0,xmm2); stream0_128i_out[2*i+2] = _mm_unpacklo_epi32(xmm1,xmm3); stream0_128i_out[2*i+3] = _mm_unpackhi_epi32(xmm1,xmm3); +#elif defined(__arm__) + +#endif + } +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); +#endif } int dlsch_16qam_16qam_llr(LTE_DL_FRAME_PARMS *frame_parms, - int **rxdataF_comp, - int **rxdataF_comp_i, - int **dl_ch_mag, //|h_0|^2*(2/sqrt{10}) - int **dl_ch_mag_i, //|h_1|^2*(2/sqrt{10}) - int **rho_i, - short *dlsch_llr, - unsigned char symbol, - unsigned char first_symbol_flag,
- unsigned short nb_rb, + int32_t **rxdataF_comp, + int32_t **rxdataF_comp_i, + int32_t **dl_ch_mag, //|h_0|^2*(2/sqrt{10}) + int32_t **dl_ch_mag_i, //|h_1|^2*(2/sqrt{10}) + int32_t **rho_i, + int16_t *dlsch_llr, + uint8_t symbol, + uint8_t first_symbol_flag, + uint16_t nb_rb, uint16_t pbch_pss_sss_adjust, - short **llr16p) + int16_t **llr16p) { int16_t *rxF = (int16_t*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; @@ -2713,30 +2902,32 @@ int dlsch_16qam_16qam_llr(LTE_DL_FRAME_PARMS *frame_parms, return(0); } -void qam16_qam64(short *stream0_in, - short *stream1_in, - short *ch_mag, - short *ch_mag_i, - short *stream0_out, - short *rho01, - int length - ) +void qam16_qam64(int16_t *stream0_in, + int16_t *stream1_in, + int16_t *ch_mag, + int16_t *ch_mag_i, + int16_t *stream0_out, + int16_t *rho01, + int32_t length + ) { /* - Author: Sebastian Wagner - Date: 2012-06-04 - - Input: - stream0_in: MF filter for 1st stream, i.e., y0=h0'*y - stream!_in: MF filter for 2nd stream, i.e., y1=h1'*y - ch_mag: 2*h0/sqrt(00), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc - ch_mag_i: 2*h1/sqrt(00), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc - rho01: Channel cross correlation, i.e., h1'*h0 - - Output: - stream0_out: output LLRs for 1st stream + Author: Sebastian Wagner + Date: 2012-06-04 + + Input: + stream0_in: MF filter for 1st stream, i.e., y0=h0'*y + stream1_in: MF filter for 2nd stream, i.e., y1=h1'*y + ch_mag: 2*h0/sqrt(10), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc + ch_mag_i: 2*h1/sqrt(10), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc + rho01: Channel cross correlation, i.e., h1'*h0 + + Output: + stream0_out: output LLRs for 1st stream */ + +#if defined(__x86_64__) || defined(__i386__) __m128i *rho01_128i = (__m128i *)rho01; __m128i *stream0_128i_in = (__m128i *)stream0_in; __m128i *stream1_128i_in = (__m128i *)stream1_in; @@ -2744,24 +2935,39 @@ void qam16_qam64(short *stream0_in, __m128i *ch_mag_128i = (__m128i *)ch_mag; __m128i *ch_mag_128i_i = (__m128i *)ch_mag_i; - int i; + + __m128i ONE_OVER_SQRT_2 = _mm_set1_epi16(23170); // round(1/sqrt(2)*2^15) + __m128i ONE_OVER_SQRT_10 = _mm_set1_epi16(20724); // round(1/sqrt(10)*2^16) + __m128i THREE_OVER_SQRT_10 = _mm_set1_epi16(31086); // round(3/sqrt(10)*2^15) + __m128i SQRT_10_OVER_FOUR = _mm_set1_epi16(25905); // round(sqrt(10)/4*2^15) + __m128i ONE_OVER_TWO_SQRT_10 = _mm_set1_epi16(10362); // round(1/2/sqrt(10)*2^16) + __m128i NINE_OVER_TWO_SQRT_10 = _mm_set1_epi16(23315); // round(9/2/sqrt(10)*2^14) + __m128i ONE_OVER_SQRT_2_42 = _mm_set1_epi16(3575); // round(1/sqrt(2*42)*2^15) + __m128i THREE_OVER_SQRT_2_42 = _mm_set1_epi16(10726); // round(3/sqrt(2*42)*2^15) + __m128i FIVE_OVER_SQRT_2_42 = _mm_set1_epi16(17876); // round(5/sqrt(2*42)*2^15) + __m128i SEVEN_OVER_SQRT_2_42 = _mm_set1_epi16(25027); // round(7/sqrt(2*42)*2^15) + __m128i SQRT_42_OVER_FOUR = _mm_set1_epi16(13272); // round(sqrt(42)/4*2^13), Q3.12
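+ /* Editor's note: the constants above all follow this file's fixed-point
+    convention c = round(x * 2^N), with N picked so that c fits a signed
+    16-bit lane at the Q-format noted in each comment. A minimal standalone
+    sketch of the convention (illustration only, not part of the patch; the
+    1/sqrt(10) scale factor is taken from the comments above):
+
+      #include <math.h>
+      #include <stdio.h>
+      int main(void) {
+        short c = (short)lround((1.0/sqrt(10.0)) * (1<<15)); // 10362, Q15
+        short a = 20000;
+        // _mm_mulhi_epi16(a,c) keeps the high 16 bits of a*c, i.e. (a*c)>>16,
+        // so a Q15 constant x effectively computes a*x/2:
+        printf("%d\n", (a * c) >> 16); // 3162 ~= 20000/(2*sqrt(10))
+        return 0;
+      }
+ */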
+ __m128i ch_mag_des,ch_mag_int; + __m128i y0r_over_sqrt10; + __m128i y0i_over_sqrt10; + __m128i y0r_three_over_sqrt10; + __m128i y0i_three_over_sqrt10; + __m128i ch_mag_over_10; + __m128i ch_mag_over_2; + __m128i ch_mag_9_over_10; + __m128i ch_mag_int_with_sigma2; + __m128i two_ch_mag_int_with_sigma2; + __m128i three_ch_mag_int_with_sigma2; + +#elif defined(__arm__) - ONE_OVER_SQRT_2 = _mm_set1_epi16(23170); // round(1/sqrt(2)*2^15) - ONE_OVER_SQRT_10 = _mm_set1_epi16(20724); // round(1/sqrt(10)*2^16) - ONE_OVER_SQRT_10_Q15 = _mm_set1_epi16(10362); // round(1/sqrt(10)*2^15) - THREE_OVER_SQRT_10 = _mm_set1_epi16(31086); // round(3/sqrt(10)*2^15) - SQRT_10_OVER_FOUR = _mm_set1_epi16(25905); // round(sqrt(10)/4*2^15) - ONE_OVER_TWO_SQRT_10 = _mm_set1_epi16(10362); // round(1/2/sqrt(10)*2^16) - NINE_OVER_TWO_SQRT_10 = _mm_set1_epi16(23315); // round(9/2/sqrt(10)*2^14) - ONE_OVER_SQRT_2_42 = _mm_set1_epi16(3575); // round(1/sqrt(2*42)*2^15) - THREE_OVER_SQRT_2_42 = _mm_set1_epi16(10726); // round(3/sqrt(2*42)*2^15) - FIVE_OVER_SQRT_2_42 = _mm_set1_epi16(17876); // round(5/sqrt(2*42)*2^15) - SEVEN_OVER_SQRT_2_42 = _mm_set1_epi16(25027); // round(7/sqrt(2*42)*2^15) - SQRT_42_OVER_FOUR = _mm_set1_epi16(13272); // round(sqrt(42)/4*2^13), Q3.12 +#endif + int i; for (i=0; i<length>>2; i+=2) { // In one iteration, we deal with 8 REs +#if defined(__x86_64__) || defined(__i386__) // Get rho xmm0 = rho01_128i[i]; xmm1 = rho01_128i[i+1]; @@ -3255,24 +3461,30 @@ void qam16_qam64(short *stream0_in, stream0_128i_out[2*i+1] = _mm_unpackhi_epi32(xmm0,xmm2); stream0_128i_out[2*i+2] = _mm_unpacklo_epi32(xmm1,xmm3); stream0_128i_out[2*i+3] = _mm_unpackhi_epi32(xmm1,xmm3); +#elif defined(__arm__) + +#endif + } +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); +#endif } int dlsch_16qam_64qam_llr(LTE_DL_FRAME_PARMS *frame_parms, - int **rxdataF_comp, - int **rxdataF_comp_i, - int **dl_ch_mag, //|h_0|^2*(2/sqrt{10}) - int **dl_ch_mag_i, //|h_1|^2*(2/sqrt{10}) - int **rho_i, - short *dlsch_llr, - unsigned char symbol, - unsigned char first_symbol_flag, - unsigned short nb_rb, + int32_t **rxdataF_comp, + int32_t **rxdataF_comp_i, + int32_t **dl_ch_mag, //|h_0|^2*(2/sqrt{10}) + int32_t **dl_ch_mag_i, //|h_1|^2*(2/sqrt{10}) + int32_t **rho_i, + int16_t *dlsch_llr, + uint8_t symbol, + uint8_t first_symbol_flag, + uint16_t nb_rb, uint16_t pbch_pss_sss_adjust, - short **llr16p) + int16_t **llr16p) { int16_t *rxF = (int16_t*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; @@ -3330,93 +3542,117 @@ int dlsch_16qam_64qam_llr(LTE_DL_FRAME_PARMS *frame_parms, // 64-QAM //---------------------------------------------------------------------------------------------- -NOCYGWIN_STATIC __m128i ONE_OVER_SQRT_42 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i THREE_OVER_SQRT_42 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i FIVE_OVER_SQRT_42 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i SEVEN_OVER_SQRT_42 __attribute__((aligned(16))); - -NOCYGWIN_STATIC __m128i FORTYNINE_OVER_FOUR_SQRT_42 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i THIRTYSEVEN_OVER_FOUR_SQRT_42 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i TWENTYNINE_OVER_FOUR_SQRT_42 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i TWENTYFIVE_OVER_FOUR_SQRT_42 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i SEVENTEEN_OVER_FOUR_SQRT_42 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i NINE_OVER_FOUR_SQRT_42 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i THIRTEEN_OVER_FOUR_SQRT_42 
__attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i FIVE_OVER_FOUR_SQRT_42 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i ONE_OVER_FOUR_SQRT_42 __attribute__((aligned(16))); - -NOCYGWIN_STATIC __m128i y0r_one_over_sqrt_21 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i y0r_three_over_sqrt_21 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i y0r_five_over_sqrt_21 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i y0r_seven_over_sqrt_21 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i y0i_one_over_sqrt_21 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i y0i_three_over_sqrt_21 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i y0i_five_over_sqrt_21 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i y0i_seven_over_sqrt_21 __attribute__((aligned(16))); - -NOCYGWIN_STATIC __m128i ch_mag_98_over_42_with_sigma2 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i ch_mag_74_over_42_with_sigma2 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i ch_mag_58_over_42_with_sigma2 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i ch_mag_50_over_42_with_sigma2 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i ch_mag_34_over_42_with_sigma2 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i ch_mag_18_over_42_with_sigma2 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i ch_mag_26_over_42_with_sigma2 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i ch_mag_10_over_42_with_sigma2 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i ch_mag_2_over_42_with_sigma2 __attribute__((aligned(16))); - -void qam64_qpsk(short *stream0_in, - short *stream1_in, - short *ch_mag, - short *stream0_out, - short *rho01, - int length - ) +/* +__m128i ONE_OVER_SQRT_42 __attribute__((aligned(16))); +__m128i THREE_OVER_SQRT_42 __attribute__((aligned(16))); +__m128i FIVE_OVER_SQRT_42 __attribute__((aligned(16))); +__m128i SEVEN_OVER_SQRT_42 __attribute__((aligned(16))); + +__m128i FORTYNINE_OVER_FOUR_SQRT_42 __attribute__((aligned(16))); +__m128i THIRTYSEVEN_OVER_FOUR_SQRT_42 __attribute__((aligned(16))); +__m128i TWENTYNINE_OVER_FOUR_SQRT_42 __attribute__((aligned(16))); +__m128i TWENTYFIVE_OVER_FOUR_SQRT_42 __attribute__((aligned(16))); +__m128i SEVENTEEN_OVER_FOUR_SQRT_42 __attribute__((aligned(16))); +__m128i NINE_OVER_FOUR_SQRT_42 __attribute__((aligned(16))); +__m128i THIRTEEN_OVER_FOUR_SQRT_42 __attribute__((aligned(16))); +__m128i FIVE_OVER_FOUR_SQRT_42 __attribute__((aligned(16))); +__m128i ONE_OVER_FOUR_SQRT_42 __attribute__((aligned(16))); + +__m128i y0r_one_over_sqrt_21 __attribute__((aligned(16))); +__m128i y0r_three_over_sqrt_21 __attribute__((aligned(16))); +__m128i y0r_five_over_sqrt_21 __attribute__((aligned(16))); +__m128i y0r_seven_over_sqrt_21 __attribute__((aligned(16))); +__m128i y0i_one_over_sqrt_21 __attribute__((aligned(16))); +__m128i y0i_three_over_sqrt_21 __attribute__((aligned(16))); +__m128i y0i_five_over_sqrt_21 __attribute__((aligned(16))); +__m128i y0i_seven_over_sqrt_21 __attribute__((aligned(16))); + +__m128i ch_mag_98_over_42_with_sigma2 __attribute__((aligned(16))); +__m128i ch_mag_74_over_42_with_sigma2 __attribute__((aligned(16))); +__m128i ch_mag_58_over_42_with_sigma2 __attribute__((aligned(16))); +__m128i ch_mag_50_over_42_with_sigma2 __attribute__((aligned(16))); +__m128i ch_mag_34_over_42_with_sigma2 __attribute__((aligned(16))); +__m128i ch_mag_18_over_42_with_sigma2 __attribute__((aligned(16))); +__m128i ch_mag_26_over_42_with_sigma2 __attribute__((aligned(16))); +__m128i ch_mag_10_over_42_with_sigma2 
__attribute__((aligned(16))); +__m128i ch_mag_2_over_42_with_sigma2 __attribute__((aligned(16))); + +*/ + +void qam64_qpsk(int16_t *stream0_in, + int16_t *stream1_in, + int16_t *ch_mag, + int16_t *stream0_out, + int16_t *rho01, + int32_t length + ) { /* - Author: S. Wagner - Date: 31-07-12 - - Input: - stream0_in: MF filter for 1st stream, i.e., y0=h0'*y - stream1_in: MF filter for 2nd stream, i.e., y1=h1'*y - ch_mag: 4*h0/sqrt(42), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc - ch_mag_i: 4*h1/sqrt(42), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc - rho01: Channel cross correlation, i.e., h1'*h0 - - Output: - stream0_out: output LLRs for 1st stream + Author: S. Wagner + Date: 31-07-12 + + Input: + stream0_in: MF filter for 1st stream, i.e., y0=h0'*y + stream1_in: MF filter for 2nd stream, i.e., y1=h1'*y + ch_mag: 4*h0/sqrt(42), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc + ch_mag_i: 4*h1/sqrt(42), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc + rho01: Channel cross correlation, i.e., h1'*h0 + + Output: + stream0_out: output LLRs for 1st stream */ +#if defined(__x86_64__) || defined(__i386__) __m128i *rho01_128i = (__m128i *)rho01; __m128i *stream0_128i_in = (__m128i *)stream0_in; __m128i *stream1_128i_in = (__m128i *)stream1_in; __m128i *ch_mag_128i = (__m128i *)ch_mag; - int i,j; - ONE_OVER_SQRT_42 = _mm_set1_epi16(10112); // round(1/sqrt(42)*2^16) - THREE_OVER_SQRT_42 = _mm_set1_epi16(30337); // round(3/sqrt(42)*2^16) - FIVE_OVER_SQRT_42 = _mm_set1_epi16(25281); // round(5/sqrt(42)*2^15) - SEVEN_OVER_SQRT_42 = _mm_set1_epi16(17697); // round(5/sqrt(42)*2^15) - ONE_OVER_SQRT_2 = _mm_set1_epi16(23170); // round(1/sqrt(2)*2^15) - ONE_OVER_SQRT_2_42 = _mm_set1_epi16(3575); // round(1/sqrt(2*42)*2^15) - THREE_OVER_SQRT_2_42 = _mm_set1_epi16(10726); // round(3/sqrt(2*42)*2^15) - FIVE_OVER_SQRT_2_42 = _mm_set1_epi16(17876); // round(5/sqrt(2*42)*2^15) - SEVEN_OVER_SQRT_2_42 = _mm_set1_epi16(25027); // round(7/sqrt(2*42)*2^15) - FORTYNINE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(30969); // round(49/(4*sqrt(42))*2^14), Q2.14 - THIRTYSEVEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(23385); // round(37/(4*sqrt(42))*2^14), Q2.14 - TWENTYFIVE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(31601); // round(25/(4*sqrt(42))*2^15) - TWENTYNINE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(18329); // round(29/(4*sqrt(42))*2^15), Q2.14 - SEVENTEEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(21489); // round(17/(4*sqrt(42))*2^15) - NINE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(11376); // round(9/(4*sqrt(42))*2^15) - THIRTEEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(16433); // round(13/(4*sqrt(42))*2^15) - FIVE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(6320); // round(5/(4*sqrt(42))*2^15) - ONE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(1264); // round(1/(4*sqrt(42))*2^15) - SQRT_42_OVER_FOUR = _mm_set1_epi16(13272); // round(sqrt(42)/4*2^13), Q3.12 + __m128i ONE_OVER_SQRT_42 = _mm_set1_epi16(10112); // round(1/sqrt(42)*2^16) + __m128i THREE_OVER_SQRT_42 = _mm_set1_epi16(30337); // round(3/sqrt(42)*2^16) + __m128i FIVE_OVER_SQRT_42 = _mm_set1_epi16(25281); // round(5/sqrt(42)*2^15) + __m128i SEVEN_OVER_SQRT_42 = _mm_set1_epi16(17697); // round(7/sqrt(42)*2^14) Q2.14 + __m128i ONE_OVER_SQRT_2 = _mm_set1_epi16(23170); // round(1/sqrt(2)*2^15) + __m128i FORTYNINE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(30969); // round(49/(4*sqrt(42))*2^14), Q2.14 + __m128i THIRTYSEVEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(23385); // round(37/(4*sqrt(42))*2^14), Q2.14 + __m128i TWENTYFIVE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(31601); // round(25/(4*sqrt(42))*2^15) + __m128i TWENTYNINE_OVER_FOUR_SQRT_42 =
_mm_set1_epi16(18329); // round(29/(4*sqrt(42))*2^14), Q2.14 + __m128i SEVENTEEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(21489); // round(17/(4*sqrt(42))*2^15) + __m128i NINE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(11376); // round(9/(4*sqrt(42))*2^15) + __m128i THIRTEEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(16433); // round(13/(4*sqrt(42))*2^15) + __m128i FIVE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(6320); // round(5/(4*sqrt(42))*2^15) + __m128i ONE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(1264); // round(1/(4*sqrt(42))*2^15) + + + __m128i ch_mag_des; + __m128i ch_mag_98_over_42_with_sigma2; + __m128i ch_mag_74_over_42_with_sigma2; + __m128i ch_mag_58_over_42_with_sigma2; + __m128i ch_mag_50_over_42_with_sigma2; + __m128i ch_mag_34_over_42_with_sigma2; + __m128i ch_mag_18_over_42_with_sigma2; + __m128i ch_mag_26_over_42_with_sigma2; + __m128i ch_mag_10_over_42_with_sigma2; + __m128i ch_mag_2_over_42_with_sigma2; + __m128i y0r_one_over_sqrt_21; + __m128i y0r_three_over_sqrt_21; + __m128i y0r_five_over_sqrt_21; + __m128i y0r_seven_over_sqrt_21; + __m128i y0i_one_over_sqrt_21; + __m128i y0i_three_over_sqrt_21; + __m128i y0i_five_over_sqrt_21; + __m128i y0i_seven_over_sqrt_21; +#elif defined(__arm__) + +#endif + + int i,j; for (i=0; i<length>>2; i+=2) { +#if defined(__x86_64__) || defined(__i386__) // Get rho xmm0 = rho01_128i[i]; xmm1 = rho01_128i[i+1]; @@ -4734,6 +4970,7 @@ void qam64_qpsk(short *stream0_in, y2i = _mm_subs_epi16(logmax_num_re0, logmax_den_re0); + // map to output stream, difficult to do in SIMD since we have 6 16bit LLRs // RE 1 j = 24*i; @@ -4792,25 +5029,29 @@ void qam64_qpsk(short *stream0_in, stream0_out[j + 45] = ((short *)&y0i)[7]; stream0_out[j + 46] = ((short *)&y1i)[7]; stream0_out[j + 47] = ((short *)&y2i)[7]; +#elif defined(__arm__) + +#endif } +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); - +#endif } int dlsch_64qam_qpsk_llr(LTE_DL_FRAME_PARMS *frame_parms, - int **rxdataF_comp, - int **rxdataF_comp_i, - int **dl_ch_mag, - int **rho_i, - short *dlsch_llr, - unsigned char symbol, - unsigned char first_symbol_flag, - unsigned short nb_rb, + int32_t **rxdataF_comp, + int32_t **rxdataF_comp_i, + int32_t **dl_ch_mag, + int32_t **rho_i, + int16_t *dlsch_llr, + uint8_t symbol, + uint8_t first_symbol_flag, + uint16_t nb_rb, uint16_t pbch_pss_sss_adjust, - short **llr16p) + int16_t **llr16p) { int16_t *rxF = (int16_t*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; @@ -4867,61 +5108,80 @@ void qam64_qam16(short *stream0_in, short *stream0_out, short *rho01, int length - ) + ) { /* - Author: S. Wagner - Date: 31-07-12 - - Input: - stream0_in: MF filter for 1st stream, i.e., y0=h0'*y - stream1_in: MF filter for 2nd stream, i.e., y1=h1'*y - ch_mag: 4*h0/sqrt(42), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc - ch_mag_i: 4*h1/sqrt(42), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc - rho01: Channel cross correlation, i.e., h1'*h0 - - Output: - stream0_out: output LLRs for 1st stream + Author: S. Wagner + Date: 31-07-12 + + Input: + stream0_in: MF filter for 1st stream, i.e., y0=h0'*y + stream1_in: MF filter for 2nd stream, i.e., y1=h1'*y + ch_mag: 4*h0/sqrt(42), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc + ch_mag_i: 4*h1/sqrt(42), [Re0 Im0 Re1 Im1] s.t.
Im0=Re0, Im1=Re1, etc + rho01: Channel cross correlation, i.e., h1'*h0 + + Output: + stream0_out: output LLRs for 1st stream */ +#if defined(__x86_64__) || defined(__i386__) + __m128i *rho01_128i = (__m128i *)rho01; __m128i *stream0_128i_in = (__m128i *)stream0_in; __m128i *stream1_128i_in = (__m128i *)stream1_in; __m128i *ch_mag_128i = (__m128i *)ch_mag; __m128i *ch_mag_128i_i = (__m128i *)ch_mag_i; + __m128i ONE_OVER_SQRT_42 = _mm_set1_epi16(10112); // round(1/sqrt(42)*2^16) + __m128i THREE_OVER_SQRT_42 = _mm_set1_epi16(30337); // round(3/sqrt(42)*2^16) + __m128i FIVE_OVER_SQRT_42 = _mm_set1_epi16(25281); // round(5/sqrt(42)*2^15) + __m128i SEVEN_OVER_SQRT_42 = _mm_set1_epi16(17697); // round(7/sqrt(42)*2^14) Q2.14 + __m128i FORTYNINE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(30969); // round(49/(4*sqrt(42))*2^14), Q2.14 + __m128i THIRTYSEVEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(23385); // round(37/(4*sqrt(42))*2^14), Q2.14 + __m128i TWENTYFIVE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(31601); // round(25/(4*sqrt(42))*2^15) + __m128i TWENTYNINE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(18329); // round(29/(4*sqrt(42))*2^14), Q2.14 + __m128i SEVENTEEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(21489); // round(17/(4*sqrt(42))*2^15) + __m128i NINE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(11376); // round(9/(4*sqrt(42))*2^15) + __m128i THIRTEEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(16433); // round(13/(4*sqrt(42))*2^15) + __m128i FIVE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(6320); // round(5/(4*sqrt(42))*2^15) + __m128i ONE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(1264); // round(1/(4*sqrt(42))*2^15) + __m128i ONE_OVER_SQRT_10_Q15 = _mm_set1_epi16(10362); // round(1/sqrt(10)*2^15) + __m128i THREE_OVER_SQRT_10 = _mm_set1_epi16(31086); // round(3/sqrt(10)*2^15) + __m128i SQRT_10_OVER_FOUR = _mm_set1_epi16(25905); // round(sqrt(10)/4*2^15) + + + __m128i ch_mag_int; + __m128i ch_mag_des; + __m128i ch_mag_98_over_42_with_sigma2; + __m128i ch_mag_74_over_42_with_sigma2; + __m128i ch_mag_58_over_42_with_sigma2; + __m128i ch_mag_50_over_42_with_sigma2; + __m128i ch_mag_34_over_42_with_sigma2; + __m128i ch_mag_18_over_42_with_sigma2; + __m128i ch_mag_26_over_42_with_sigma2; + __m128i ch_mag_10_over_42_with_sigma2; + __m128i ch_mag_2_over_42_with_sigma2; + __m128i y0r_one_over_sqrt_21; + __m128i y0r_three_over_sqrt_21; + __m128i y0r_five_over_sqrt_21; + __m128i y0r_seven_over_sqrt_21; + __m128i y0i_one_over_sqrt_21; + __m128i y0i_three_over_sqrt_21; + __m128i y0i_five_over_sqrt_21; + __m128i y0i_seven_over_sqrt_21; + +#elif defined(__arm__) + +#endif int i,j; - ONE_OVER_SQRT_42 = _mm_set1_epi16(10112); // round(1/sqrt(42)*2^16) - THREE_OVER_SQRT_42 = _mm_set1_epi16(30337); // round(3/sqrt(42)*2^16) - FIVE_OVER_SQRT_42 = _mm_set1_epi16(25281); // round(5/sqrt(42)*2^15) - SEVEN_OVER_SQRT_42 = _mm_set1_epi16(17697); // round(5/sqrt(42)*2^15) - ONE_OVER_SQRT_2 = _mm_set1_epi16(23170); // round(1/sqrt(2)*2^15) - ONE_OVER_SQRT_2_42 = _mm_set1_epi16(3575); // round(1/sqrt(2*42)*2^15) - THREE_OVER_SQRT_2_42 = _mm_set1_epi16(10726); // round(3/sqrt(2*42)*2^15) - FIVE_OVER_SQRT_2_42 = _mm_set1_epi16(17876); // round(5/sqrt(2*42)*2^15) - SEVEN_OVER_SQRT_2_42 = _mm_set1_epi16(25027); // round(7/sqrt(2*42)*2^15) - FORTYNINE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(30969); // round(49/(4*sqrt(42))*2^14), Q2.14 - THIRTYSEVEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(23385); // round(37/(4*sqrt(42))*2^14), Q2.14 - TWENTYFIVE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(31601); // round(25/(4*sqrt(42))*2^15) - TWENTYNINE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(18329); // round(29/(4*sqrt(42))*2^15),
Q2.14 - SEVENTEEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(21489); // round(17/(4*sqrt(42))*2^15) - NINE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(11376); // round(9/(4*sqrt(42))*2^15) - THIRTEEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(16433); // round(13/(4*sqrt(42))*2^15) - FIVE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(6320); // round(5/(4*sqrt(42))*2^15) - ONE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(1264); // round(1/(4*sqrt(42))*2^15) - SQRT_42_OVER_FOUR = _mm_set1_epi16(13272); // round(sqrt(42)/4*2^13), Q3.12 - ONE_OVER_SQRT_10_Q15 = _mm_set1_epi16(10362); // round(1/sqrt(10)*2^15) - THREE_OVER_SQRT_10 = _mm_set1_epi16(31086); // round(3/sqrt(10)*2^15) - SQRT_10_OVER_FOUR = _mm_set1_epi16(25905); // round(sqrt(10)/4*2^15) - // ONE_OVER_SQRT_10_Q15 = _mm_set1_epi16(7327); // round(1/sqrt(10)*2^15) - // THREE_OVER_SQRT_10 = _mm_set1_epi16(21981); // round(3/sqrt(10)*2^15) - // SQRT_10_OVER_FOUR = _mm_set1_epi16(25905); // round(sqrt(10)/4*2^15) Q3.13 for (i=0; i<length>>2; i+=2) { +#if defined(__x86_64__) || defined(__i386__) // Get rho xmm0 = rho01_128i[i]; xmm1 = rho01_128i[i+1]; @@ -6252,6 +6512,7 @@ void qam64_qam16(short *stream0_in, y2i = _mm_subs_epi16(logmax_num_re0, logmax_den_re0); + // map to output stream, difficult to do in SIMD since we have 6 16bit LLRs // RE 1 j = 24*i; @@ -6310,26 +6571,32 @@ void qam64_qam16(short *stream0_in, stream0_out[j + 45] = ((short *)&y0i)[7]; stream0_out[j + 46] = ((short *)&y1i)[7]; stream0_out[j + 47] = ((short *)&y2i)[7]; + +#elif defined(__arm__) + +#endif } +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); +#endif } int dlsch_64qam_16qam_llr(LTE_DL_FRAME_PARMS *frame_parms, - int **rxdataF_comp, - int **rxdataF_comp_i, - int **dl_ch_mag, - int **dl_ch_mag_i, - int **rho_i, - short *dlsch_llr, - unsigned char symbol, - unsigned char first_symbol_flag, - unsigned short nb_rb, + int32_t **rxdataF_comp, + int32_t **rxdataF_comp_i, + int32_t **dl_ch_mag, + int32_t **dl_ch_mag_i, + int32_t **rho_i, + int16_t *dlsch_llr, + uint8_t symbol, + uint8_t first_symbol_flag, + uint16_t nb_rb, uint16_t pbch_pss_sss_adjust, - short **llr16p) + int16_t **llr16p) { int16_t *rxF = (int16_t*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; @@ -6386,54 +6653,85 @@ void qam64_qam64(short *stream0_in, short *stream0_out, short *rho01, int length - ) + ) { /* - Author: S. Wagner - Date: 31-07-12 - - Input: - stream0_in: MF filter for 1st stream, i.e., y0=h0'*y - stream1_in: MF filter for 2nd stream, i.e., y1=h1'*y - ch_mag: 4*h0/sqrt(42), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc - ch_mag_i: 4*h1/sqrt(42), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc - rho01: Channel cross correlation, i.e., h1'*h0 - - Output: - stream0_out: output LLRs for 1st stream + Author: S. Wagner + Date: 31-07-12 + + Input: + stream0_in: MF filter for 1st stream, i.e., y0=h0'*y + stream1_in: MF filter for 2nd stream, i.e., y1=h1'*y + ch_mag: 4*h0/sqrt(42), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc + ch_mag_i: 4*h1/sqrt(42), [Re0 Im0 Re1 Im1] s.t. 
Im0=Re0, Im1=Re1, etc + rho01: Channel cross correlation, i.e., h1'*h0 + + Output: + stream0_out: output LLRs for 1st stream */ +#if defined(__x86_64__) || defined(__i386__) + __m128i *rho01_128i = (__m128i *)rho01; __m128i *stream0_128i_in = (__m128i *)stream0_in; __m128i *stream1_128i_in = (__m128i *)stream1_in; __m128i *ch_mag_128i = (__m128i *)ch_mag; __m128i *ch_mag_128i_i = (__m128i *)ch_mag_i; + __m128i ONE_OVER_SQRT_42 = _mm_set1_epi16(10112); // round(1/sqrt(42)*2^16) + __m128i THREE_OVER_SQRT_42 = _mm_set1_epi16(30337); // round(3/sqrt(42)*2^16) + __m128i FIVE_OVER_SQRT_42 = _mm_set1_epi16(25281); // round(5/sqrt(42)*2^15) + __m128i SEVEN_OVER_SQRT_42 = _mm_set1_epi16(17697); // round(7/sqrt(42)*2^14) Q2.14 + __m128i ONE_OVER_SQRT_2 = _mm_set1_epi16(23170); // round(1/sqrt(2)*2^15) + __m128i ONE_OVER_SQRT_2_42 = _mm_set1_epi16(3575); // round(1/sqrt(2*42)*2^15) + __m128i THREE_OVER_SQRT_2_42 = _mm_set1_epi16(10726); // round(3/sqrt(2*42)*2^15) + __m128i FIVE_OVER_SQRT_2_42 = _mm_set1_epi16(17876); // round(5/sqrt(2*42)*2^15) + __m128i SEVEN_OVER_SQRT_2_42 = _mm_set1_epi16(25027); // round(7/sqrt(2*42)*2^15) + __m128i FORTYNINE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(30969); // round(49/(4*sqrt(42))*2^14), Q2.14 + __m128i THIRTYSEVEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(23385); // round(37/(4*sqrt(42))*2^14), Q2.14 + __m128i TWENTYFIVE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(31601); // round(25/(4*sqrt(42))*2^15) + __m128i TWENTYNINE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(18329); // round(29/(4*sqrt(42))*2^14), Q2.14 + __m128i SEVENTEEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(21489); // round(17/(4*sqrt(42))*2^15) + __m128i NINE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(11376); // round(9/(4*sqrt(42))*2^15) + __m128i THIRTEEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(16433); // round(13/(4*sqrt(42))*2^15) + __m128i FIVE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(6320); // round(5/(4*sqrt(42))*2^15) + __m128i ONE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(1264); // round(1/(4*sqrt(42))*2^15) + __m128i SQRT_42_OVER_FOUR = _mm_set1_epi16(13272); // round(sqrt(42)/4*2^13), Q3.12 + + __m128i ch_mag_des; + __m128i ch_mag_int; + __m128i ch_mag_98_over_42_with_sigma2; + __m128i ch_mag_74_over_42_with_sigma2; + __m128i ch_mag_58_over_42_with_sigma2; + __m128i ch_mag_50_over_42_with_sigma2; + __m128i ch_mag_34_over_42_with_sigma2; + __m128i ch_mag_18_over_42_with_sigma2; + __m128i ch_mag_26_over_42_with_sigma2; + __m128i ch_mag_10_over_42_with_sigma2; + __m128i ch_mag_2_over_42_with_sigma2; + __m128i y0r_one_over_sqrt_21; + __m128i y0r_three_over_sqrt_21; + __m128i y0r_five_over_sqrt_21; + __m128i y0r_seven_over_sqrt_21; + __m128i y0i_one_over_sqrt_21; + __m128i y0i_three_over_sqrt_21; + __m128i y0i_five_over_sqrt_21; + __m128i y0i_seven_over_sqrt_21; + __m128i ch_mag_int_with_sigma2; + __m128i two_ch_mag_int_with_sigma2; + __m128i three_ch_mag_int_with_sigma2; +#elif defined(__arm__) + +#endif + int i,j; - ONE_OVER_SQRT_42 = _mm_set1_epi16(10112); // round(1/sqrt(42)*2^16) - THREE_OVER_SQRT_42 = _mm_set1_epi16(30337); // round(3/sqrt(42)*2^16) - FIVE_OVER_SQRT_42 = _mm_set1_epi16(25281); // round(5/sqrt(42)*2^15) - SEVEN_OVER_SQRT_42 = _mm_set1_epi16(17697); // round(7/sqrt(42)*2^14) Q2.14 - ONE_OVER_SQRT_2 = _mm_set1_epi16(23170); // round(1/sqrt(2)*2^15) - ONE_OVER_SQRT_2_42 = _mm_set1_epi16(3575); // round(1/sqrt(2*42)*2^15) - THREE_OVER_SQRT_2_42 = _mm_set1_epi16(10726); // round(3/sqrt(2*42)*2^15) - FIVE_OVER_SQRT_2_42 = _mm_set1_epi16(17876); // round(5/sqrt(2*42)*2^15) - SEVEN_OVER_SQRT_2_42 = _mm_set1_epi16(25027); //
round(7/sqrt(2*42)*2^15) - FORTYNINE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(30969); // round(49/(4*sqrt(42))*2^14), Q2.14 - THIRTYSEVEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(23385); // round(37/(4*sqrt(42))*2^14), Q2.14 - TWENTYFIVE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(31601); // round(25/(4*sqrt(42))*2^15) - TWENTYNINE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(18329); // round(29/(4*sqrt(42))*2^15), Q2.14 - SEVENTEEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(21489); // round(17/(4*sqrt(42))*2^15) - NINE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(11376); // round(9/(4*sqrt(42))*2^15) - THIRTEEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(16433); // round(13/(4*sqrt(42))*2^15) - FIVE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(6320); // round(5/(4*sqrt(42))*2^15) - ONE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(1264); // round(1/(4*sqrt(42))*2^15) - SQRT_42_OVER_FOUR = _mm_set1_epi16(13272); // round(sqrt(42)/4*2^13), Q3.12 for (i=0; i<length>>2; i+=2) { +#if defined(__x86_64__) || defined(__i386__) + // Get rho xmm0 = rho01_128i[i]; xmm1 = rho01_128i[i+1]; @@ -8027,6 +8325,7 @@ void qam64_qam64(short *stream0_in, y2i = _mm_subs_epi16(logmax_num_re0, logmax_den_re0); + // map to output stream, difficult to do in SIMD since we have 6 16bit LLRs // RE 1 j = 24*i; @@ -8085,26 +8384,32 @@ void qam64_qam64(short *stream0_in, stream0_out[j + 45] = ((short *)&y0i)[7]; stream0_out[j + 46] = ((short *)&y1i)[7]; stream0_out[j + 47] = ((short *)&y2i)[7]; + +#elif defined(__arm__) + +#endif + } +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); - +#endif } int dlsch_64qam_64qam_llr(LTE_DL_FRAME_PARMS *frame_parms, - int **rxdataF_comp, - int **rxdataF_comp_i, - int **dl_ch_mag, - int **dl_ch_mag_i, - int **rho_i, - short *dlsch_llr, - unsigned char symbol, - unsigned char first_symbol_flag, - unsigned short nb_rb, + int32_t **rxdataF_comp, + int32_t **rxdataF_comp_i, + int32_t **dl_ch_mag, + int32_t **dl_ch_mag_i, + int32_t **rho_i, + int16_t *dlsch_llr, + uint8_t symbol, + uint8_t first_symbol_flag, + uint16_t nb_rb, uint16_t pbch_pss_sss_adjust, - short **llr16p) + int16_t **llr16p) { int16_t *rxF = (int16_t*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; diff --git a/openair1/PHY/LTE_TRANSPORT/dlsch_modulation.c b/openair1/PHY/LTE_TRANSPORT/dlsch_modulation.c index c4b0247d75..5012d0ac3f 100644 --- a/openair1/PHY/LTE_TRANSPORT/dlsch_modulation.c +++ b/openair1/PHY/LTE_TRANSPORT/dlsch_modulation.c @@ -262,7 +262,7 @@ int allocate_REs_in_RB(LTE_DL_FRAME_PARMS *frame_parms, switch (mod_order0) { case 2: //QPSK - //printf("%d(%d) : %d,%d => ",tti_offset,*jj,((int16_t*)&txdataF[0][tti_offset])[0],((int16_t*)&txdataF[0][tti_offset])[1]); +// printf("%d(%d) : %d,%d => ",tti_offset,*jj,((int16_t*)&txdataF[0][tti_offset])[0],((int16_t*)&txdataF[0][tti_offset])[1]); for (aa=0; aa<frame_parms->nb_antennas_tx; aa++) { ((int16_t*)&txdataF[aa][tti_offset])[0] += (x0[*jj]==1) ? 
(-gain_lin_QPSK) : gain_lin_QPSK; //I //b_i } @@ -275,7 +275,7 @@ int allocate_REs_in_RB(LTE_DL_FRAME_PARMS *frame_parms, *jj = *jj + 1; - // printf("%d,%d\n",((int16_t*)&txdataF[0][tti_offset])[0],((int16_t*)&txdataF[0][tti_offset])[1]); + // printf("%d,%d\n",((int16_t*)&txdataF[0][tti_offset])[0],((int16_t*)&txdataF[0][tti_offset])[1]); break; case 4: //16QAM diff --git a/openair1/PHY/LTE_TRANSPORT/pbch.c b/openair1/PHY/LTE_TRANSPORT/pbch.c index c4b18337a5..0582a63956 100755 --- a/openair1/PHY/LTE_TRANSPORT/pbch.c +++ b/openair1/PHY/LTE_TRANSPORT/pbch.c @@ -531,7 +531,7 @@ uint16_t pbch_extract(int **rxdataF, return(0); } -__m128i avg128; +//__m128i avg128; //compute average channel_level on each (TX,RX) antenna pair int pbch_channel_level(int **dl_ch_estimates_ext, @@ -541,7 +541,14 @@ int pbch_channel_level(int **dl_ch_estimates_ext, int16_t rb, nb_rb=6; uint8_t aatx,aarx; + +#if defined(__x86_64__) || defined(__i386__) + __m128i avg128; __m128i *dl_ch128; +#elif defined(__arm__) + int32x4_t avg128; + int16x8_t *dl_ch128; +#endif int avg1=0,avg2=0; uint32_t nsymb = (frame_parms->Ncp==0) ? 7:6; @@ -550,15 +557,23 @@ int pbch_channel_level(int **dl_ch_estimates_ext, for (aatx=0; aatx<4; aatx++) //frame_parms->nb_antennas_tx_eNB;aatx++) for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) { //clear average level + +#if defined(__x86_64__) || defined(__i386__) avg128 = _mm_setzero_si128(); dl_ch128=(__m128i *)&dl_ch_estimates_ext[(aatx<<1)+aarx][symbol_mod*6*12]; +#elif defined(__arm__) + avg128 = vdupq_n_s32(0); + dl_ch128=(int16x8_t *)&dl_ch_estimates_ext[(aatx<<1)+aarx][symbol_mod*6*12]; +#endif for (rb=0; rb<nb_rb; rb++) { - +#if defined(__x86_64__) || defined(__i386__) avg128 = _mm_add_epi32(avg128,_mm_madd_epi16(dl_ch128[0],dl_ch128[0])); avg128 = _mm_add_epi32(avg128,_mm_madd_epi16(dl_ch128[1],dl_ch128[1])); avg128 = _mm_add_epi32(avg128,_mm_madd_epi16(dl_ch128[2],dl_ch128[2])); - +#elif defined(__arm__) +// to be filled in +#endif dl_ch128+=3; /* if (rb==0) { @@ -579,16 +594,19 @@ int pbch_channel_level(int **dl_ch_estimates_ext, //msg("Channel level : %d, %d\n",avg1, avg2); } - +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); - +#endif return(avg2); } +#if defined(__x86_64__) || defined(__i386__) __m128i mmtmpP0,mmtmpP1,mmtmpP2,mmtmpP3; - +#elif defined(__arm__) +int16x8_t mmtmpP0,mmtmpP1,mmtmpP2,mmtmpP3; +#endif void pbch_channel_compensation(int **rxdataF_ext, int **dl_ch_estimates_ext, int **rxdataF_comp, @@ -599,21 +617,28 @@ void pbch_channel_compensation(int **rxdataF_ext, uint16_t rb,nb_rb=6; uint8_t aatx,aarx,symbol_mod; +#if defined(__x86_64__) || defined(__i386__) __m128i *dl_ch128,*rxdataF128,*rxdataF_comp128; +#elif defined(__arm__) +#endif symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? 
symbol-(7-frame_parms->Ncp) : symbol; for (aatx=0; aatx<4; aatx++) //frame_parms->nb_antennas_tx_eNB;aatx++) for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) { +#if defined(__x86_64__) || defined(__i386__) dl_ch128 = (__m128i *)&dl_ch_estimates_ext[(aatx<<1)+aarx][symbol_mod*6*12]; rxdataF128 = (__m128i *)&rxdataF_ext[aarx][symbol_mod*6*12]; rxdataF_comp128 = (__m128i *)&rxdataF_comp[(aatx<<1)+aarx][symbol_mod*6*12]; +#elif defined(__arm__) +// to be filled in +#endif for (rb=0; rb<nb_rb; rb++) { //printf("rb %d\n",rb); - +#if defined(__x86_64__) || defined(__i386__) // multiply by conjugated channel mmtmpP0 = _mm_madd_epi16(dl_ch128[0],rxdataF128[0]); // print_ints("re",&mmtmpP0); @@ -680,11 +705,15 @@ void pbch_channel_compensation(int **rxdataF_ext, rxdataF128+=2; rxdataF_comp128+=2; } +#elif defined(__arm__) +// to be filled in +#endif } } - +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); +#endif } void pbch_detection_mrc(LTE_DL_FRAME_PARMS *frame_parms, @@ -694,24 +723,38 @@ void pbch_detection_mrc(LTE_DL_FRAME_PARMS *frame_parms, uint8_t aatx, symbol_mod; int i, nb_rb=6; +#if defined(__x86_64__) || defined(__i386__) __m128i *rxdataF_comp128_0,*rxdataF_comp128_1; - +#elif defined(__arm__) + int16x8_t *rxdataF_comp128_0,*rxdataF_comp128_1; +#endif symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? symbol-(7-frame_parms->Ncp) : symbol; if (frame_parms->nb_antennas_rx>1) { for (aatx=0; aatx<4; aatx++) { //frame_parms->nb_antennas_tx_eNB;aatx++) { +#if defined(__x86_64__) || defined(__i386__) rxdataF_comp128_0 = (__m128i *)&rxdataF_comp[(aatx<<1)][symbol_mod*6*12]; rxdataF_comp128_1 = (__m128i *)&rxdataF_comp[(aatx<<1)+1][symbol_mod*6*12]; +#elif defined(__arm__) + rxdataF_comp128_0 = (int16x8_t *)&rxdataF_comp[(aatx<<1)][symbol_mod*6*12]; + rxdataF_comp128_1 = (int16x8_t *)&rxdataF_comp[(aatx<<1)+1][symbol_mod*6*12]; +#endif // MRC on each re of rb, both on MF output and magnitude (for 16QAM/64QAM llr computation) for (i=0; i<nb_rb*3; i++) { +#if defined(__x86_64__) || defined(__i386__) rxdataF_comp128_0[i] = _mm_adds_epi16(_mm_srai_epi16(rxdataF_comp128_0[i],1),_mm_srai_epi16(rxdataF_comp128_1[i],1)); +#elif defined(__arm__) + rxdataF_comp128_0[i] = vhaddq_s16(rxdataF_comp128_0[i],rxdataF_comp128_1[i]); + +#endif } } } - +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); +#endif } void pbch_scrambling(LTE_DL_FRAME_PARMS *frame_parms, @@ -806,9 +849,6 @@ void pbch_alamouti(LTE_DL_FRAME_PARMS *frame_parms, } - _mm_empty(); - _m_empty(); - } void pbch_quantize(int8_t *pbch_llr8, diff --git a/openair1/PHY/LTE_TRANSPORT/pmch.c b/openair1/PHY/LTE_TRANSPORT/pmch.c index 3082bff917..dfa38fca59 100644 --- a/openair1/PHY/LTE_TRANSPORT/pmch.c +++ b/openair1/PHY/LTE_TRANSPORT/pmch.c @@ -396,22 +396,33 @@ void mch_channel_level(int **dl_ch_estimates_ext, { int i,aarx,nre; +#if defined(__x86_64__) || defined(__i386__) __m128i *dl_ch128,avg128; - +#elif defined(__arm__) + int32x4_t avg128; +#endif for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) { - //clear average level +#if defined(__x86_64__) || defined(__i386__) + //clear average level avg128 = _mm_setzero_si128(); // 5 is always a symbol with no pilots for both normal and extended prefix dl_ch128=(__m128i *)&dl_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*12]; +#elif defined(__arm__) + +#endif if ((symbol == 2) || (symbol == 6) || (symbol == 10)) nre = (frame_parms->N_RB_DL*6); else nre = (frame_parms->N_RB_DL*12); for (i=0; i<(nre>>2); i++) { +#if defined(__x86_64__) || defined(__i386__) 
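+ // Editor's note: _mm_madd_epi16(h,h) below forms Re^2+Im^2 for four
+ // complex int16 samples at once; the __arm__ branch of this loop is still
+ // empty. One possible NEON equivalent (untested sketch; it assumes the
+ // ARM branch also declares an int16x8_t *dl_ch128 alongside the
+ // int32x4_t avg128 declared for ARM above):
+ //
+ //   int16x8_t h     = dl_ch128[0];
+ //   int32x4_t sq_lo = vmull_s16(vget_low_s16(h), vget_low_s16(h));
+ //   int32x4_t sq_hi = vmull_s16(vget_high_s16(h), vget_high_s16(h));
+ //   int32x2_t p_lo  = vpadd_s32(vget_low_s32(sq_lo), vget_high_s32(sq_lo));
+ //   int32x2_t p_hi  = vpadd_s32(vget_low_s32(sq_hi), vget_high_s32(sq_hi));
+ //   avg128 = vaddq_s32(avg128, vcombine_s32(p_lo, p_hi));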
avg128 = _mm_add_epi32(avg128,_mm_madd_epi16(dl_ch128[0],dl_ch128[0])); +#elif defined(__arm__) + +#endif } avg[aarx] = (((int*)&avg128)[0] + @@ -422,9 +433,10 @@ void mch_channel_level(int **dl_ch_estimates_ext, // printf("Channel level : %d\n",avg[(aatx<<1)+aarx]); } +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); - +#endif } void mch_channel_compensation(int **rxdataF_ext, @@ -439,14 +451,18 @@ void mch_channel_compensation(int **rxdataF_ext, { int aarx,nre,i; +#if defined(__x86_64__) || defined(__i386__) __m128i *dl_ch128,*dl_ch_mag128,*dl_ch_mag128b,*rxdataF128,*rxdataF_comp128; __m128i mmtmpD0,mmtmpD1,mmtmpD2,mmtmpD3,QAM_amp128,QAM_amp128b; +#elif defined(__arm__) +#endif if ((symbol == 2) || (symbol == 6) || (symbol == 10)) nre = frame_parms->N_RB_DL*6; else nre = frame_parms->N_RB_DL*12; +#if defined(__x86_64__) || defined(__i386__) if (mod_order == 4) { QAM_amp128 = _mm_set1_epi16(QAM16_n1); // 2/sqrt(10) QAM_amp128b = _mm_setzero_si128(); @@ -454,21 +470,27 @@ void mch_channel_compensation(int **rxdataF_ext, QAM_amp128 = _mm_set1_epi16(QAM64_n1); // QAM_amp128b = _mm_set1_epi16(QAM64_n2); } +#elif defined(__arm__) - +#endif for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) { +#if defined(__x86_64__) || defined(__i386__) + dl_ch128 = (__m128i *)&dl_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*12]; dl_ch_mag128 = (__m128i *)&dl_ch_mag[aarx][symbol*frame_parms->N_RB_DL*12]; dl_ch_mag128b = (__m128i *)&dl_ch_magb[aarx][symbol*frame_parms->N_RB_DL*12]; rxdataF128 = (__m128i *)&rxdataF_ext[aarx][symbol*frame_parms->N_RB_DL*12]; rxdataF_comp128 = (__m128i *)&rxdataF_comp[aarx][symbol*frame_parms->N_RB_DL*12]; +#elif defined(__arm__) +#endif for (i=0; i<(nre>>2); i+=2) { if (mod_order>2) { // get channel amplitude if not QPSK +#if defined(__x86_64__) || defined(__i386__) mmtmpD0 = _mm_madd_epi16(dl_ch128[0],dl_ch128[0]); mmtmpD0 = _mm_srai_epi32(mmtmpD0,output_shift); @@ -498,8 +520,13 @@ void mch_channel_compensation(int **rxdataF_ext, dl_ch_mag128b[1] = _mm_mulhi_epi16(dl_ch_mag128b[1],QAM_amp128b); dl_ch_mag128b[1] = _mm_slli_epi16(dl_ch_mag128b[1],1); +#elif defined(__arm__) + +#endif } +#if defined(__x86_64__) || defined(__i386__) + // multiply by conjugated channel mmtmpD0 = _mm_madd_epi16(dl_ch128[0],rxdataF128[0]); // print_ints("re",&mmtmpD0); @@ -548,12 +575,17 @@ void mch_channel_compensation(int **rxdataF_ext, rxdataF128+=2; rxdataF_comp128+=2; +#elif defined(__arm__) +#endif } } +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); +#endif + } void mch_detection_mrc(LTE_DL_FRAME_PARMS *frame_parms, @@ -565,10 +597,15 @@ void mch_detection_mrc(LTE_DL_FRAME_PARMS *frame_parms, int i; +#if defined(__x86_64__) || defined(__i386__) __m128i *rxdataF_comp128_0,*rxdataF_comp128_1,*dl_ch_mag128_0,*dl_ch_mag128_1,*dl_ch_mag128_0b,*dl_ch_mag128_1b; - +#elif defined(__arm__) + int16x8_t *rxdataF_comp128_0,*rxdataF_comp128_1,*dl_ch_mag128_0,*dl_ch_mag128_1,*dl_ch_mag128_0b,*dl_ch_mag128_1b; +#endif if (frame_parms->nb_antennas_rx>1) { +#if defined(__x86_64__) || defined(__i386__) + rxdataF_comp128_0 = (__m128i *)&rxdataF_comp[0][symbol*frame_parms->N_RB_DL*12]; rxdataF_comp128_1 = (__m128i *)&rxdataF_comp[1][symbol*frame_parms->N_RB_DL*12]; dl_ch_mag128_0 = (__m128i *)&dl_ch_mag[0][symbol*frame_parms->N_RB_DL*12]; @@ -576,16 +613,32 @@ void mch_detection_mrc(LTE_DL_FRAME_PARMS *frame_parms, dl_ch_mag128_0b = (__m128i *)&dl_ch_magb[0][symbol*frame_parms->N_RB_DL*12]; dl_ch_mag128_1b = (__m128i 
*)&dl_ch_magb[1][symbol*frame_parms->N_RB_DL*12]; +#elif defined(__arm__) + rxdataF_comp128_0 = (int16x8_t *)&rxdataF_comp[0][symbol*frame_parms->N_RB_DL*12]; + rxdataF_comp128_1 = (int16x8_t *)&rxdataF_comp[1][symbol*frame_parms->N_RB_DL*12]; + dl_ch_mag128_0 = (int16x8_t *)&dl_ch_mag[0][symbol*frame_parms->N_RB_DL*12]; + dl_ch_mag128_1 = (int16x8_t *)&dl_ch_mag[1][symbol*frame_parms->N_RB_DL*12]; + dl_ch_mag128_0b = (int16x8_t *)&dl_ch_magb[0][symbol*frame_parms->N_RB_DL*12]; + dl_ch_mag128_1b = (int16x8_t *)&dl_ch_magb[1][symbol*frame_parms->N_RB_DL*12]; + +#endif // MRC on each re of rb, both on MF output and magnitude (for 16QAM/64QAM llr computation) for (i=0; i<frame_parms->N_RB_DL*3; i++) { +#if defined(__x86_64__) || defined(__i386__) rxdataF_comp128_0[i] = _mm_adds_epi16(_mm_srai_epi16(rxdataF_comp128_0[i],1),_mm_srai_epi16(rxdataF_comp128_1[i],1)); dl_ch_mag128_0[i] = _mm_adds_epi16(_mm_srai_epi16(dl_ch_mag128_0[i],1),_mm_srai_epi16(dl_ch_mag128_1[i],1)); dl_ch_mag128_0b[i] = _mm_adds_epi16(_mm_srai_epi16(dl_ch_mag128_0b[i],1),_mm_srai_epi16(dl_ch_mag128_1b[i],1)); +#elif defined(__arm__) + rxdataF_comp128_0[i] = vhaddq_s16(rxdataF_comp128_0[i],rxdataF_comp128_1[i]); + dl_ch_mag128_0[i] = vhaddq_s16(dl_ch_mag128_0[i],dl_ch_mag128_1[i]); + dl_ch_mag128_0b[i] = vhaddq_s16(dl_ch_mag128_0b[i],dl_ch_mag128_1b[i]); +#endif } } - +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); +#endif } int mch_qpsk_llr(LTE_DL_FRAME_PARMS *frame_parms, @@ -626,8 +679,10 @@ int mch_qpsk_llr(LTE_DL_FRAME_PARMS *frame_parms, *llr32p = (short *)llr32; +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); +#endif return(0); } @@ -644,22 +699,38 @@ void mch_16qam_llr(LTE_DL_FRAME_PARMS *frame_parms, int16_t **llr32p) { +#if defined(__x86_64__) || defined(__i386__) __m128i *rxF = (__m128i*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; __m128i *ch_mag; __m128i llr128[2],xmm0; + uint32_t *llr32; +#elif defined(__arm__) + int16x8_t *rxF = (int16x8_t*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; + int16x8_t *ch_mag; + int16x8_t llr128[2],xmm0; + int16_t *llr16; +#endif int i,len; unsigned char len_mod4=0; - uint32_t *llr32; +#if defined(__x86_64__) || defined(__i386__) if (symbol==2) { llr32 = (uint32_t*)dlsch_llr; } else { llr32 = (uint32_t*)*llr32p; } - - +#elif defined(__arm__) + if (symbol==2) { + llr16 = (int16_t*)dlsch_llr; + } else { + llr16 = (int16_t*)*llr32p; + } +#endif +#if defined(__x86_64__) || defined(__i386__) ch_mag = (__m128i*)&dl_ch_mag[0][(symbol*frame_parms->N_RB_DL*12)]; - +#elif defined(__arm__) + ch_mag = (int16x8_t*)&dl_ch_mag[0][(symbol*frame_parms->N_RB_DL*12)]; +#endif if ((symbol==2) || (symbol==6) || (symbol==10)) { len = frame_parms->N_RB_DL*6; } else { @@ -680,6 +751,7 @@ void mch_16qam_llr(LTE_DL_FRAME_PARMS *frame_parms, for (i=0; i<len; i++) { +#if defined(__x86_64__) || defined(__i386__) xmm0 = _mm_abs_epi16(rxF[i]); xmm0 = _mm_subs_epi16(ch_mag[i],xmm0); @@ -695,10 +767,38 @@ void mch_16qam_llr(LTE_DL_FRAME_PARMS *frame_parms, llr32[6] = ((uint32_t *)&llr128[1])[2]; llr32[7] = ((uint32_t *)&llr128[1])[3]; llr32+=8; + +#elif defined(__arm__) + xmm0 = vabsq_s16(rxF[i]); + xmm0 = vsubq_s16(ch_mag[i],xmm0); + + // lambda_1=y_R, lambda_2=|y_R|-|h|^2, lambda_3=y_I, lambda_4=|y_I|-|h|^2 + + llr16[0] = vgetq_lane_s16(rxF[i],0); + llr16[1] = vgetq_lane_s16(xmm0,0); + llr16[2] = vgetq_lane_s16(rxF[i],1); + llr16[3] = vgetq_lane_s16(xmm0,1); + llr16[4] = vgetq_lane_s16(rxF[i],2); + llr16[5] = vgetq_lane_s16(xmm0,2); + llr16[6] =
vgetq_lane_s16(rxF[i],3); + llr16[7] = vgetq_lane_s16(xmm0,3); + llr16[8] = vgetq_lane_s16(rxF[i],4); + llr16[9] = vgetq_lane_s16(xmm0,4); + llr16[10] = vgetq_lane_s16(rxF[i],5); + llr16[11] = vgetq_lane_s16(xmm0,5); + llr16[12] = vgetq_lane_s16(rxF[i],6); + llr16[13] = vgetq_lane_s16(xmm0,6); + llr16[14] = vgetq_lane_s16(rxF[i],7); + llr16[15] = vgetq_lane_s16(xmm0,7); + llr16+=16; +#endif + } +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); +#endif } //---------------------------------------------------------------------------------------------- @@ -714,8 +814,13 @@ void mch_64qam_llr(LTE_DL_FRAME_PARMS *frame_parms, short **llr_save) { +#if defined(__x86_64__) || defined(__i386__) __m128i xmm1,xmm2,*ch_mag,*ch_magb; __m128i *rxF = (__m128i*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; +#elif defined(__arm__) + int16x8_t xmm1,xmm2,*ch_mag,*ch_magb; + int16x8_t *rxF = (int16x8_t*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; +#endif int i,len,len2; // int j=0; @@ -728,9 +833,13 @@ void mch_64qam_llr(LTE_DL_FRAME_PARMS *frame_parms, else llr = *llr_save; +#if defined(__x86_64__) || defined(__i386__) ch_mag = (__m128i*)&dl_ch_mag[0][(symbol*frame_parms->N_RB_DL*12)]; ch_magb = (__m128i*)&dl_ch_magb[0][(symbol*frame_parms->N_RB_DL*12)]; - +#elif defined(__arm__) + ch_mag = (int16x8_t*)&dl_ch_mag[0][(symbol*frame_parms->N_RB_DL*12)]; + ch_magb = (int16x8_t*)&dl_ch_magb[0][(symbol*frame_parms->N_RB_DL*12)]; +#endif if ((symbol==2) || (symbol==6) || (symbol==10)) { len = frame_parms->N_RB_DL*6; } else { @@ -747,11 +856,18 @@ void mch_64qam_llr(LTE_DL_FRAME_PARMS *frame_parms, for (i=0; i<len2; i++) { - +#if defined(__x86_64__) || defined(__i386__) xmm1 = _mm_abs_epi16(rxF[i]); xmm1 = _mm_subs_epi16(ch_mag[i],xmm1); xmm2 = _mm_abs_epi16(xmm1); xmm2 = _mm_subs_epi16(ch_magb[i],xmm2); +#elif defined(__arm__) + xmm1 = vabsq_s16(rxF[i]); + xmm1 = vsubq_s16(ch_mag[i],xmm1); + xmm2 = vabsq_s16(xmm1); + xmm2 = vsubq_s16(ch_magb[i],xmm2); +#endif + /* printf("pmch i: %d => mag (%d,%d) (%d,%d)\n",i,((short *)&ch_mag[i])[0],((short *)&ch_magb[i])[0], ((short *)&rxF[i])[0],((short *)&rxF[i])[1]); @@ -771,41 +887,68 @@ */ llr2[0] = ((short *)&rxF[i])[0]; llr2[1] = ((short *)&rxF[i])[1]; +#if defined(__x86_64__) || defined(__i386__) llr2[2] = _mm_extract_epi16(xmm1,0); llr2[3] = _mm_extract_epi16(xmm1,1);//((short *)&xmm1)[j+1]; llr2[4] = _mm_extract_epi16(xmm2,0);//((short *)&xmm2)[j]; llr2[5] = _mm_extract_epi16(xmm2,1);//((short *)&xmm2)[j+1]; +#elif defined(__arm__) + llr2[2] = vgetq_lane_s16(xmm1,0); + llr2[3] = vgetq_lane_s16(xmm1,1);//((short *)&xmm1)[j+1]; + llr2[4] = vgetq_lane_s16(xmm2,0);//((short *)&xmm2)[j]; + llr2[5] = vgetq_lane_s16(xmm2,1);//((short *)&xmm2)[j+1]; +#endif llr2+=6; llr2[0] = ((short *)&rxF[i])[2]; llr2[1] = ((short *)&rxF[i])[3]; +#if defined(__x86_64__) || defined(__i386__) llr2[2] = _mm_extract_epi16(xmm1,2); llr2[3] = _mm_extract_epi16(xmm1,3);//((short *)&xmm1)[j+1]; llr2[4] = _mm_extract_epi16(xmm2,2);//((short *)&xmm2)[j]; llr2[5] = _mm_extract_epi16(xmm2,3);//((short *)&xmm2)[j+1]; - +#elif defined(__arm__) + llr2[2] = vgetq_lane_s16(xmm1,2); + llr2[3] = vgetq_lane_s16(xmm1,3);//((short *)&xmm1)[j+1]; + llr2[4] = vgetq_lane_s16(xmm2,2);//((short *)&xmm2)[j]; + llr2[5] = vgetq_lane_s16(xmm2,3);//((short *)&xmm2)[j+1]; +#endif llr2+=6; llr2[0] = ((short *)&rxF[i])[4]; llr2[1] = ((short *)&rxF[i])[5]; +#if defined(__x86_64__) || defined(__i386__) llr2[2] = _mm_extract_epi16(xmm1,4);
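+ // Editor's note: per resource element the six 64-QAM LLRs packed into
+ // llr2[] here reduce, in scalar form, to
+ //   llr2[0] = y_R;                 llr2[1] = y_I;
+ //   llr2[2] = mag  - abs(y_R);     llr2[3] = mag  - abs(y_I);     // xmm1
+ //   llr2[4] = magb - abs(llr2[2]); llr2[5] = magb - abs(llr2[3]); // xmm2
+ // where mag/magb are the two |h|^2-derived thresholds from dl_ch_mag and
+ // dl_ch_magb; the SSE and NEON lane extractions implement exactly this.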
llr2[3] = _mm_extract_epi16(xmm1,5);//((short *)&xmm1)[j+1]; llr2[4] = _mm_extract_epi16(xmm2,4);//((short *)&xmm2)[j]; llr2[5] = _mm_extract_epi16(xmm2,5);//((short *)&xmm2)[j+1]; - +#elif defined(__arm__) + llr2[2] = vgetq_lane_s16(xmm1,4); + llr2[3] = vgetq_lane_s16(xmm1,5);//((short *)&xmm1)[j+1]; + llr2[4] = vgetq_lane_s16(xmm2,4);//((short *)&xmm2)[j]; + llr2[5] = vgetq_lane_s16(xmm2,5);//((short *)&xmm2)[j+1]; +#endif llr2+=6; llr2[0] = ((short *)&rxF[i])[6]; llr2[1] = ((short *)&rxF[i])[7]; +#if defined(__x86_64__) || defined(__i386__) llr2[2] = _mm_extract_epi16(xmm1,6); llr2[3] = _mm_extract_epi16(xmm1,7);//((short *)&xmm1)[j+1]; llr2[4] = _mm_extract_epi16(xmm2,6);//((short *)&xmm2)[j]; llr2[5] = _mm_extract_epi16(xmm2,7);//((short *)&xmm2)[j+1]; - +#elif defined(__arm__) + llr2[2] = vgetq_lane_s16(xmm1,6); + llr2[3] = vgetq_lane_s16(xmm1,7);//((short *)&xmm1)[j+1]; + llr2[4] = vgetq_lane_s16(xmm2,6);//((short *)&xmm2)[j]; + llr2[5] = vgetq_lane_s16(xmm2,7);//((short *)&xmm2)[j+1]; +#endif llr2+=6; } *llr_save = llr; +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); +#endif } int avg_pmch[4]; diff --git a/openair1/PHY/LTE_TRANSPORT/prach.c b/openair1/PHY/LTE_TRANSPORT/prach.c index 4749598616..73c3957b6d 100644 --- a/openair1/PHY/LTE_TRANSPORT/prach.c +++ b/openair1/PHY/LTE_TRANSPORT/prach.c @@ -998,10 +998,7 @@ int32_t generate_prach( PHY_VARS_UE *phy_vars_ue, uint8_t eNB_id, uint8_t subfra return signal_energy( (int*)prach, 256 ); } - - - -__m128i mmtmpX0,mmtmpX1,mmtmpX2,mmtmpX3; +//__m128i mmtmpX0,mmtmpX1,mmtmpX2,mmtmpX3; void rx_prach(PHY_VARS_eNB *phy_vars_eNB,uint8_t subframe,uint16_t *preamble_energy_list, uint16_t *preamble_delay_list, uint16_t Nf, uint8_t tdd_mapindex) { diff --git a/openair1/PHY/LTE_TRANSPORT/proto.h b/openair1/PHY/LTE_TRANSPORT/proto.h index 5fe7518004..0f671695d9 100644 --- a/openair1/PHY/LTE_TRANSPORT/proto.h +++ b/openair1/PHY/LTE_TRANSPORT/proto.h @@ -48,7 +48,7 @@ * @{ */ -/** \fn free_eNB_dlsch(LTE_eNB_DLSCH_t *dlsch) +/** \fn free_eNB_dlsch(LTE_eNB_DLSCH_t *dlsch,unsigned char N_RB_DL) \brief This function frees memory allocated for a particular DLSCH at eNB @param dlsch Pointer to DLSCH to be removed */ @@ -74,9 +74,7 @@ void free_ue_dlsch(LTE_UE_DLSCH_t *dlsch); LTE_UE_DLSCH_t *new_ue_dlsch(uint8_t Kmimo,uint8_t Mdlharq,uint8_t max_turbo_iterations,uint8_t N_RB_DL, uint8_t abstraction_flag); -void free_eNB_dlsch(LTE_eNB_DLSCH_t *dlsch); -LTE_eNB_ULSCH_t *new_eNB_ulsch(uint8_t Mdlharq,uint8_t max_turbo_iterations,uint8_t N_RB_UL, uint8_t abstraction_flag); void clean_eNb_ulsch(LTE_eNB_ULSCH_t *ulsch, uint8_t abstraction_flag); diff --git a/openair1/PHY/LTE_TRANSPORT/ulsch_demodulation.c b/openair1/PHY/LTE_TRANSPORT/ulsch_demodulation.c index c6c7876c09..3ec50fda82 100644 --- a/openair1/PHY/LTE_TRANSPORT/ulsch_demodulation.c +++ b/openair1/PHY/LTE_TRANSPORT/ulsch_demodulation.c @@ -52,15 +52,19 @@ //extern int **ulchmag_eren; //eren - static short jitter[8] __attribute__ ((aligned(16))) = {1,0,0,1,0,1,1,0}; static short jitterc[8] __attribute__ ((aligned(16))) = {0,1,1,0,1,0,0,1}; #ifndef OFDMA_ULSCH void lte_idft(LTE_DL_FRAME_PARMS *frame_parms,uint32_t *z, uint16_t Msc_PUSCH) { - +#if defined(__x86_64__) || defined(__i386__) __m128i idft_in128[3][1200],idft_out128[3][1200]; + __m128i norm128; +#elif defined(__arm__) + int16x8_t idft_in128[3][1200],idft_out128[3][1200]; + int16x8_t norm128; +#endif int16_t *idft_in0=(int16_t*)idft_in128[0],*idft_out0=(int16_t*)idft_out128[0]; int16_t 
*idft_in1=(int16_t*)idft_in128[1],*idft_out1=(int16_t*)idft_out128[1]; int16_t *idft_in2=(int16_t*)idft_in128[2],*idft_out2=(int16_t*)idft_out128[2]; @@ -68,7 +72,7 @@ void lte_idft(LTE_DL_FRAME_PARMS *frame_parms,uint32_t *z, uint16_t Msc_PUSCH) uint32_t *z0,*z1,*z2,*z3,*z4,*z5,*z6,*z7,*z8,*z9,*z10=NULL,*z11=NULL; int i,ip; - __m128i norm128; + // printf("Doing lte_idft for Msc_PUSCH %d\n",Msc_PUSCH); @@ -108,6 +112,7 @@ void lte_idft(LTE_DL_FRAME_PARMS *frame_parms,uint32_t *z, uint16_t Msc_PUSCH) // conjugate input for (i=0; i<(Msc_PUSCH>>2); i++) { +#if defined(__x86_64__)||defined(__i386__) *&(((__m128i*)z0)[i])=_mm_sign_epi16(*&(((__m128i*)z0)[i]),*(__m128i*)&conjugate2[0]); *&(((__m128i*)z1)[i])=_mm_sign_epi16(*&(((__m128i*)z1)[i]),*(__m128i*)&conjugate2[0]); *&(((__m128i*)z2)[i])=_mm_sign_epi16(*&(((__m128i*)z2)[i]),*(__m128i*)&conjugate2[0]); @@ -119,10 +124,29 @@ void lte_idft(LTE_DL_FRAME_PARMS *frame_parms,uint32_t *z, uint16_t Msc_PUSCH) *&(((__m128i*)z8)[i])=_mm_sign_epi16(*&(((__m128i*)z8)[i]),*(__m128i*)&conjugate2[0]); *&(((__m128i*)z9)[i])=_mm_sign_epi16(*&(((__m128i*)z9)[i]),*(__m128i*)&conjugate2[0]); - if (frame_parms->Ncp==0) { + if (frame_parms->Ncp==NORMAL) { *&(((__m128i*)z10)[i])=_mm_sign_epi16(*&(((__m128i*)z10)[i]),*(__m128i*)&conjugate2[0]); *&(((__m128i*)z11)[i])=_mm_sign_epi16(*&(((__m128i*)z11)[i]),*(__m128i*)&conjugate2[0]); } +#elif defined(__arm__) + *&(((int16x8_t*)z0)[i])=vmulq_s16(*&(((int16x8_t*)z0)[i]),*(int16x8_t*)&conjugate2[0]); + *&(((int16x8_t*)z1)[i])=vmulq_s16(*&(((int16x8_t*)z1)[i]),*(int16x8_t*)&conjugate2[0]); + *&(((int16x8_t*)z2)[i])=vmulq_s16(*&(((int16x8_t*)z2)[i]),*(int16x8_t*)&conjugate2[0]); + *&(((int16x8_t*)z3)[i])=vmulq_s16(*&(((int16x8_t*)z3)[i]),*(int16x8_t*)&conjugate2[0]); + *&(((int16x8_t*)z4)[i])=vmulq_s16(*&(((int16x8_t*)z4)[i]),*(int16x8_t*)&conjugate2[0]); + *&(((int16x8_t*)z5)[i])=vmulq_s16(*&(((int16x8_t*)z5)[i]),*(int16x8_t*)&conjugate2[0]); + *&(((int16x8_t*)z6)[i])=vmulq_s16(*&(((int16x8_t*)z6)[i]),*(int16x8_t*)&conjugate2[0]); + *&(((int16x8_t*)z7)[i])=vmulq_s16(*&(((int16x8_t*)z7)[i]),*(int16x8_t*)&conjugate2[0]); + *&(((int16x8_t*)z8)[i])=vmulq_s16(*&(((int16x8_t*)z8)[i]),*(int16x8_t*)&conjugate2[0]); + *&(((int16x8_t*)z9)[i])=vmulq_s16(*&(((int16x8_t*)z9)[i]),*(int16x8_t*)&conjugate2[0]); + + + if (frame_parms->Ncp==NORMAL) { + *&(((int16x8_t*)z10)[i])=vmulq_s16(*&(((int16x8_t*)z10)[i]),*(int16x8_t*)&conjugate2[0]); + *&(((int16x8_t*)z11)[i])=vmulq_s16(*&(((int16x8_t*)z11)[i]),*(int16x8_t*)&conjugate2[0]); + } + +#endif } for (i=0,ip=0; i<Msc_PUSCH; i++,ip+=4) { @@ -150,23 +174,21 @@ void lte_idft(LTE_DL_FRAME_PARMS *frame_parms,uint32_t *z, uint16_t Msc_PUSCH) dft12((int16_t *)idft_in1,(int16_t *)idft_out1); dft12((int16_t *)idft_in2,(int16_t *)idft_out2); - /* - dft12f(&((__m128i *)idft_in0)[0],&((__m128i *)idft_in0)[1],&((__m128i *)idft_in0)[2],&((__m128i *)idft_in0)[3],&((__m128i *)idft_in0)[4],&((__m128i *)idft_in0)[5],&((__m128i *)idft_in0)[6],&((__m128i *)idft_in0)[7],&((__m128i *)idft_in0)[8],&((__m128i *)idft_in0)[9],&((__m128i *)idft_in0)[10],&((__m128i *)idft_in0)[11], - &((__m128i *)idft_out0)[0],&((__m128i *)idft_out0)[1],&((__m128i *)idft_out0)[2],&((__m128i *)idft_out0)[3],&((__m128i *)idft_out0)[4],&((__m128i *)idft_out0)[5],&((__m128i *)idft_out0)[6],&((__m128i *)idft_out0)[7],&((__m128i *)idft_out0)[8],&((__m128i *)idft_out0)[9],&((__m128i *)idft_out0)[10],&((__m128i *)idft_out0)[11]); - - dft12f(&((__m128i *)idft_in1)[0],&((__m128i *)idft_in1)[1],&((__m128i *)idft_in1)[2],&((__m128i 
*)idft_in1)[3],&((__m128i *)idft_in1)[4],&((__m128i *)idft_in1)[5],&((__m128i *)idft_in1)[6],&((__m128i *)idft_in1)[7],&((__m128i *)idft_in1)[8],&((__m128i *)idft_in1)[9],&((__m128i *)idft_in1)[10],&((__m128i *)idft_in1)[11], - &((__m128i *)idft_out1)[0],&((__m128i *)idft_out1)[1],&((__m128i *)idft_out1)[2],&((__m128i *)idft_out1)[3],&((__m128i *)idft_out1)[4],&((__m128i *)idft_out1)[5],&((__m128i *)idft_out1)[6],&((__m128i *)idft_out1)[7],&((__m128i *)idft_out1)[8],&((__m128i *)idft_out1)[9],&((__m128i *)idft_out1)[10],&((__m128i *)idft_out1)[11]); - - dft12f(&((__m128i *)idft_in2)[0],&((__m128i *)idft_in2)[1],&((__m128i *)idft_in2)[2],&((__m128i *)idft_in2)[3],&((__m128i *)idft_in2)[4],&((__m128i *)idft_in2)[5],&((__m128i *)idft_in2)[6],&((__m128i *)idft_in2)[7],&((__m128i *)idft_in2)[8],&((__m128i *)idft_in2)[9],&((__m128i *)idft_in2)[10],&((__m128i *)idft_in2)[11], - &((__m128i *)idft_out2)[0],&((__m128i *)idft_out2)[1],&((__m128i *)idft_out2)[2],&((__m128i *)idft_out2)[3],&((__m128i *)idft_out2)[4],&((__m128i *)idft_out2)[5],&((__m128i *)idft_out2)[6],&((__m128i *)idft_out2)[7],&((__m128i *)idft_out2)[8],&((__m128i *)idft_out2)[9],&((__m128i *)idft_out2)[10],&((__m128i *)idft_out2)[11]); - */ - +#if defined(__x86_64__)||defined(__i386__) norm128 = _mm_set1_epi16(9459); - +#elif defined(__arm__) + norm128 = vdupq_n_s16(9459); +#endif for (i=0; i<12; i++) { +#if defined(__x86_64__)||defined(__i386__) ((__m128i*)idft_out0)[i] = _mm_slli_epi16(_mm_mulhi_epi16(((__m128i*)idft_out0)[i],norm128),1); ((__m128i*)idft_out1)[i] = _mm_slli_epi16(_mm_mulhi_epi16(((__m128i*)idft_out1)[i],norm128),1); ((__m128i*)idft_out2)[i] = _mm_slli_epi16(_mm_mulhi_epi16(((__m128i*)idft_out2)[i],norm128),1); +#elif defined(__arm__) + ((int16x8_t*)idft_out0)[i] = vqdmulhq_s16(((int16x8_t*)idft_out0)[i],norm128); + ((int16x8_t*)idft_out1)[i] = vqdmulhq_s16(((int16x8_t*)idft_out1)[i],norm128); + ((int16x8_t*)idft_out2)[i] = vqdmulhq_s16(((int16x8_t*)idft_out2)[i],norm128); +#endif } break; @@ -398,6 +420,7 @@ void lte_idft(LTE_DL_FRAME_PARMS *frame_parms,uint32_t *z, uint16_t Msc_PUSCH) // conjugate output for (i=0; i<(Msc_PUSCH>>2); i++) { +#if defined(__x86_64__) || defined(__i386__) ((__m128i*)z0)[i]=_mm_sign_epi16(((__m128i*)z0)[i],*(__m128i*)&conjugate2[0]); ((__m128i*)z1)[i]=_mm_sign_epi16(((__m128i*)z1)[i],*(__m128i*)&conjugate2[0]); ((__m128i*)z2)[i]=_mm_sign_epi16(((__m128i*)z2)[i],*(__m128i*)&conjugate2[0]); @@ -409,12 +432,36 @@ void lte_idft(LTE_DL_FRAME_PARMS *frame_parms,uint32_t *z, uint16_t Msc_PUSCH) ((__m128i*)z8)[i]=_mm_sign_epi16(((__m128i*)z8)[i],*(__m128i*)&conjugate2[0]); ((__m128i*)z9)[i]=_mm_sign_epi16(((__m128i*)z9)[i],*(__m128i*)&conjugate2[0]); - if (frame_parms->Ncp==0) { + if (frame_parms->Ncp==NORMAL) { ((__m128i*)z10)[i]=_mm_sign_epi16(((__m128i*)z10)[i],*(__m128i*)&conjugate2[0]); ((__m128i*)z11)[i]=_mm_sign_epi16(((__m128i*)z11)[i],*(__m128i*)&conjugate2[0]); } +#elif defined(__arm__) + *&(((int16x8_t*)z0)[i])=vmulq_s16(*&(((int16x8_t*)z0)[i]),*(int16x8_t*)&conjugate2[0]); + *&(((int16x8_t*)z1)[i])=vmulq_s16(*&(((int16x8_t*)z1)[i]),*(int16x8_t*)&conjugate2[0]); + *&(((int16x8_t*)z2)[i])=vmulq_s16(*&(((int16x8_t*)z2)[i]),*(int16x8_t*)&conjugate2[0]); + *&(((int16x8_t*)z3)[i])=vmulq_s16(*&(((int16x8_t*)z3)[i]),*(int16x8_t*)&conjugate2[0]); + *&(((int16x8_t*)z4)[i])=vmulq_s16(*&(((int16x8_t*)z4)[i]),*(int16x8_t*)&conjugate2[0]); + *&(((int16x8_t*)z5)[i])=vmulq_s16(*&(((int16x8_t*)z5)[i]),*(int16x8_t*)&conjugate2[0]); + 
*&(((int16x8_t*)z6)[i])=vmulq_s16(*&(((int16x8_t*)z6)[i]),*(int16x8_t*)&conjugate2[0]); + *&(((int16x8_t*)z7)[i])=vmulq_s16(*&(((int16x8_t*)z7)[i]),*(int16x8_t*)&conjugate2[0]); + *&(((int16x8_t*)z8)[i])=vmulq_s16(*&(((int16x8_t*)z8)[i]),*(int16x8_t*)&conjugate2[0]); + *&(((int16x8_t*)z9)[i])=vmulq_s16(*&(((int16x8_t*)z9)[i]),*(int16x8_t*)&conjugate2[0]); + + + if (frame_parms->Ncp==NORMAL) { + *&(((int16x8_t*)z10)[i])=vmulq_s16(*&(((int16x8_t*)z10)[i]),*(int16x8_t*)&conjugate2[0]); + *&(((int16x8_t*)z11)[i])=vmulq_s16(*&(((int16x8_t*)z11)[i]),*(int16x8_t*)&conjugate2[0]); + } + +#endif } +#if defined(__x86_64__) || defined(__i386__) + _mm_empty(); + _m_empty(); +#endif + } #endif @@ -429,10 +476,15 @@ int32_t ulsch_qpsk_llr(LTE_DL_FRAME_PARMS *frame_parms, uint16_t nb_rb, int16_t **llrp) { - +#if defined(__x86_64__) || defined(__i386__) __m128i *rxF=(__m128i*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; - int32_t i; __m128i **llrp128 = (__m128i **)llrp; +#elif defined(__arm__) + int16x8_t *rxF= (int16x8_t*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; + int16x8_t **llrp128 = (int16x8_t **)llrp; +#endif + + int i; // printf("qpsk llr for symbol %d (pos %d), llr offset %d\n",symbol,(symbol*frame_parms->N_RB_DL*12),llr128U-(__m128i*)ulsch_llr); @@ -443,8 +495,10 @@ int32_t ulsch_qpsk_llr(LTE_DL_FRAME_PARMS *frame_parms, (*llrp128)++; } +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); +#endif return(0); @@ -458,41 +512,64 @@ void ulsch_16qam_llr(LTE_DL_FRAME_PARMS *frame_parms, uint16_t nb_rb, int16_t **llrp) { +int i; +#if defined(__x86_64__) || defined(__i386__) __m128i *rxF=(__m128i*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; __m128i *ch_mag; __m128i mmtmpU0; __m128i **llrp128=(__m128i **)llrp; - - int32_t i; - // uint8_t symbol_mod; - - // printf("ulsch_rx.c: ulsch_16qam_llr: symbol %d\n",symbol); - - // symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? 
symbol-(7-frame_parms->Ncp) : symbol;
-
   ch_mag =(__m128i*)&ul_ch_mag[0][(symbol*frame_parms->N_RB_DL*12)];
-
+#elif defined(__arm__)
+  int16x8_t *rxF=(int16x8_t*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)];
+  int16x8_t *ch_mag;
+  int16x8_t xmm0;
+  int16_t **llrp16=(int16_t **)llrp;
+  ch_mag =(int16x8_t*)&ul_ch_mag[0][(symbol*frame_parms->N_RB_DL*12)];
+#endif

   for (i=0; i<(nb_rb*3); i++) {
-
+#if defined(__x86_64__) || defined(__i386__)
     mmtmpU0 = _mm_abs_epi16(rxF[i]); // print_shorts("tmp0",&tmp0);
     mmtmpU0 = _mm_subs_epi16(ch_mag[i],mmtmpU0);
-
     (*llrp128)[0] = _mm_unpacklo_epi32(rxF[i],mmtmpU0);
     (*llrp128)[1] = _mm_unpackhi_epi32(rxF[i],mmtmpU0);
     (*llrp128)+=2;
+#elif defined(__arm__)
+    xmm0 = vabsq_s16(rxF[i]);
+    xmm0 = vqsubq_s16(ch_mag[i],xmm0);
+    (*llrp16)[0]  = vgetq_lane_s16(rxF[i],0);
+    (*llrp16)[1]  = vgetq_lane_s16(rxF[i],1);
+    (*llrp16)[2]  = vgetq_lane_s16(xmm0,0);
+    (*llrp16)[3]  = vgetq_lane_s16(xmm0,1);
+    (*llrp16)[4]  = vgetq_lane_s16(rxF[i],2);
+    (*llrp16)[5]  = vgetq_lane_s16(rxF[i],3);
+    (*llrp16)[6]  = vgetq_lane_s16(xmm0,2);
+    (*llrp16)[7]  = vgetq_lane_s16(xmm0,3);
+    (*llrp16)[8]  = vgetq_lane_s16(rxF[i],4);
+    (*llrp16)[9]  = vgetq_lane_s16(rxF[i],5);
+    (*llrp16)[10] = vgetq_lane_s16(xmm0,4);
+    (*llrp16)[11] = vgetq_lane_s16(xmm0,5);
+    (*llrp16)[12] = vgetq_lane_s16(rxF[i],6);
+    (*llrp16)[13] = vgetq_lane_s16(rxF[i],7);
+    (*llrp16)[14] = vgetq_lane_s16(xmm0,6);
+    (*llrp16)[15] = vgetq_lane_s16(xmm0,7);
+    (*llrp16)+=16;
+#endif
+
     // print_bytes("rxF[i]",&rxF[i]);
     // print_bytes("rxF[i+1]",&rxF[i+1]);
   }

+#if defined(__x86_64__) || defined(__i386__)
   _mm_empty();
   _m_empty();
+#endif

}

@@ -505,27 +582,29 @@ void ulsch_64qam_llr(LTE_DL_FRAME_PARMS *frame_parms,
                      uint16_t nb_rb,
                      int16_t **llrp)
{
+  int i;
+  int32_t **llrp32=(int32_t **)llrp;
+#if defined(__x86_64__) || defined(__i386__)
   __m128i *rxF=(__m128i*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)];
   __m128i *ch_mag,*ch_magb;
-  int32_t i;
   __m128i mmtmpU1,mmtmpU2;
-  int32_t **llrp32=(int32_t **)llrp;
-
-  // uint8_t symbol_mod;
-
-
-
-  // symbol_mod = (symbol>=(7-frame_parms->Ncp)) ?
symbol-(7-frame_parms->Ncp) : symbol; ch_mag =(__m128i*)&ul_ch_mag[0][(symbol*frame_parms->N_RB_DL*12)]; ch_magb =(__m128i*)&ul_ch_magb[0][(symbol*frame_parms->N_RB_DL*12)]; +#elif defined(__arm__) + int16x8_t *rxF=(int16x8_t*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; + int16x8_t *ch_mag,*ch_magb; + int16x8_t mmtmpU1,mmtmpU2; + ch_mag =(int16x8_t*)&ul_ch_mag[0][(symbol*frame_parms->N_RB_DL*12)]; + ch_magb =(int16x8_t*)&ul_ch_magb[0][(symbol*frame_parms->N_RB_DL*12)]; +#endif // printf("symbol %d: mag %d, magb %d\n",symbol,_mm_extract_epi16(ch_mag[0],0),_mm_extract_epi16(ch_magb[0],0)); for (i=0; i<(nb_rb*3); i++) { - +#if defined(__x86_64__) || defined(__i386__) mmtmpU1 = _mm_abs_epi16(rxF[i]); mmtmpU1 = _mm_subs_epi16(ch_mag[i],mmtmpU1); @@ -545,12 +624,34 @@ void ulsch_64qam_llr(LTE_DL_FRAME_PARMS *frame_parms, (*llrp32)[9] = _mm_extract_epi32(rxF[i],3); (*llrp32)[10] = _mm_extract_epi32(mmtmpU1,3); (*llrp32)[11] = _mm_extract_epi32(mmtmpU2,3); +#elif defined(__arm__) + mmtmpU1 = vabsq_s16(rxF[i]); + + mmtmpU1 = vqsubq_s16(ch_mag[i],mmtmpU1); + + mmtmpU2 = vabsq_s16(mmtmpU1); + mmtmpU2 = vqsubq_s16(ch_magb[i],mmtmpU2); + + (*llrp32)[0] = vgetq_lane_s32((int32x4_t)rxF[i],0); + (*llrp32)[1] = vgetq_lane_s32((int32x4_t)mmtmpU1,0); + (*llrp32)[2] = vgetq_lane_s32((int32x4_t)mmtmpU2,0); + (*llrp32)[3] = vgetq_lane_s32((int32x4_t)rxF[i],1); + (*llrp32)[4] = vgetq_lane_s32((int32x4_t)mmtmpU1,1); + (*llrp32)[5] = vgetq_lane_s32((int32x4_t)mmtmpU2,1); + (*llrp32)[6] = vgetq_lane_s32((int32x4_t)rxF[i],2); + (*llrp32)[7] = vgetq_lane_s32((int32x4_t)mmtmpU1,2); + (*llrp32)[8] = vgetq_lane_s32((int32x4_t)mmtmpU2,2); + (*llrp32)[9] = vgetq_lane_s32((int32x4_t)rxF[i],3); + (*llrp32)[10] = vgetq_lane_s32((int32x4_t)mmtmpU1,3); + (*llrp32)[11] = vgetq_lane_s32((int32x4_t)mmtmpU2,3); + +#endif (*llrp32)+=12; } - +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); - +#endif } void ulsch_detection_mrc(LTE_DL_FRAME_PARMS *frame_parms, @@ -562,13 +663,20 @@ void ulsch_detection_mrc(LTE_DL_FRAME_PARMS *frame_parms, { +#if defined(__x86_64__) || defined(__i386__) __m128i *rxdataF_comp128_0,*ul_ch_mag128_0,*ul_ch_mag128_0b; __m128i *rxdataF_comp128_1,*ul_ch_mag128_1,*ul_ch_mag128_1b; +#elif defined(__arm__) + int16x8_t *rxdataF_comp128_0,*ul_ch_mag128_0,*ul_ch_mag128_0b; + int16x8_t *rxdataF_comp128_1,*ul_ch_mag128_1,*ul_ch_mag128_1b; + +#endif int32_t i; if (frame_parms->nb_antennas_rx>1) { +#if defined(__x86_64__) || defined(__i386__) rxdataF_comp128_0 = (__m128i *)&rxdataF_comp[0][symbol*frame_parms->N_RB_DL*12]; rxdataF_comp128_1 = (__m128i *)&rxdataF_comp[1][symbol*frame_parms->N_RB_DL*12]; ul_ch_mag128_0 = (__m128i *)&ul_ch_mag[0][symbol*frame_parms->N_RB_DL*12]; @@ -582,15 +690,31 @@ void ulsch_detection_mrc(LTE_DL_FRAME_PARMS *frame_parms, ul_ch_mag128_0[i] = _mm_adds_epi16(_mm_srai_epi16(ul_ch_mag128_0[i],1),_mm_srai_epi16(ul_ch_mag128_1[i],1)); ul_ch_mag128_0b[i] = _mm_adds_epi16(_mm_srai_epi16(ul_ch_mag128_0b[i],1),_mm_srai_epi16(ul_ch_mag128_1b[i],1)); rxdataF_comp128_0[i] = _mm_add_epi16(rxdataF_comp128_0[i],(*(__m128i*)&jitterc[0])); - } - // remove any bias (DC component after IDFT) - // ((uint32_t*)rxdataF_comp128_0)[0]=0; +#elif defined(__arm__) + rxdataF_comp128_0 = (int16x8_t *)&rxdataF_comp[0][symbol*frame_parms->N_RB_DL*12]; + rxdataF_comp128_1 = (int16x8_t *)&rxdataF_comp[1][symbol*frame_parms->N_RB_DL*12]; + ul_ch_mag128_0 = (int16x8_t *)&ul_ch_mag[0][symbol*frame_parms->N_RB_DL*12]; + ul_ch_mag128_1 = (int16x8_t *)&ul_ch_mag[1][symbol*frame_parms->N_RB_DL*12]; 
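+    // the second magnitude stream (ul_ch_magb), used for the outer 64QAM LLR
+    // thresholds, is combined below with the same halving adds as the MF output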
+    ul_ch_mag128_0b = (int16x8_t *)&ul_ch_magb[0][symbol*frame_parms->N_RB_DL*12];
+    ul_ch_mag128_1b = (int16x8_t *)&ul_ch_magb[1][symbol*frame_parms->N_RB_DL*12];
+
+    // MRC on each re of rb, both on MF output and magnitude (for 16QAM/64QAM llr computation)
+    for (i=0; i<nb_rb*3; i++) {
+      rxdataF_comp128_0[i] = vhaddq_s16(rxdataF_comp128_0[i],rxdataF_comp128_1[i]);
+      ul_ch_mag128_0[i]    = vhaddq_s16(ul_ch_mag128_0[i],ul_ch_mag128_1[i]);
+      ul_ch_mag128_0b[i]   = vhaddq_s16(ul_ch_mag128_0b[i],ul_ch_mag128_1b[i]);
+      rxdataF_comp128_0[i] = vqaddq_s16(rxdataF_comp128_0[i],(*(int16x8_t*)&jitterc[0]));
+
+#endif
+    }
  }

+#if defined(__x86_64__) || defined(__i386__)
  _mm_empty();
  _m_empty();
-
+#endif
 }

 void ulsch_extract_rbs_single(int32_t **rxdataF,
@@ -647,9 +771,6 @@ void ulsch_extract_rbs_single(int32_t **rxdataF,
     }
   }

-  _mm_empty();
-  _m_empty();
-
 }

 void ulsch_correct_ext(int32_t **rxdataF_ext,
@@ -687,42 +808,81 @@ void ulsch_channel_compensation(int32_t **rxdataF_ext,
{
  uint16_t rb;
+
+#if defined(__x86_64__) || defined(__i386__)
+
  __m128i *ul_ch128,*ul_ch_mag128,*ul_ch_mag128b,*rxdataF128,*rxdataF_comp128;
  uint8_t aarx;//,symbol_mod;
  __m128i mmtmpU0,mmtmpU1,mmtmpU2,mmtmpU3;
#ifdef OFDMA_ULSCH
  __m128i QAM_amp128U,QAM_amp128bU;
#endif
-  // symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? symbol-(7-frame_parms->Ncp) : symbol;
-  // printf("comp: symbol %d\n",symbol);
+#elif defined(__arm__)
+
+  int16x4_t *ul_ch128,*rxdataF128;
+  int16x8_t *ul_ch_mag128,*ul_ch_mag128b,*rxdataF_comp128;
+
+  uint8_t aarx;//,symbol_mod;
+  int32x4_t mmtmpU0,mmtmpU1,mmtmpU0b,mmtmpU1b;
+  int32x4x2_t xtmp;
+#ifdef OFDMA_ULSCH
+  int16x8_t mmtmpU2,mmtmpU3,mmtmpU4;
+  int16x8_t QAM_amp128U,QAM_amp128bU;
+#endif
+  int16_t conj[4]__attribute__((aligned(16))) = {1,-1,1,-1};
+  int32x4_t output_shift128 = vmovq_n_s32(-(int32_t)output_shift);
+
+
-#ifdef ULSCH_OFDMA
+#endif
+
+#ifdef OFDMA_ULSCH
+#if defined(__x86_64__) || defined(__i386__)
  if (Qm == 4)
    QAM_amp128U = _mm_set1_epi16(QAM16_n1);
  else if (Qm == 6) {
    QAM_amp128U = _mm_set1_epi16(QAM64_n1);
    QAM_amp128bU = _mm_set1_epi16(QAM64_n2);
  }
+#elif defined(__arm__)
+  if (Qm == 4)
+    QAM_amp128U = vdupq_n_s16(QAM16_n1);
+  else if (Qm == 6) {
+    QAM_amp128U = vdupq_n_s16(QAM64_n1);
+    QAM_amp128bU = vdupq_n_s16(QAM64_n2);
+  }
+#endif
#endif

  for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) {

+#if defined(__x86_64__) || defined(__i386__)
+
    ul_ch128 = (__m128i *)&ul_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*12];
    ul_ch_mag128 = (__m128i *)&ul_ch_mag[aarx][symbol*frame_parms->N_RB_DL*12];
    ul_ch_mag128b = (__m128i *)&ul_ch_magb[aarx][symbol*frame_parms->N_RB_DL*12];
    rxdataF128 = (__m128i *)&rxdataF_ext[aarx][symbol*frame_parms->N_RB_DL*12];
    rxdataF_comp128 = (__m128i *)&rxdataF_comp[aarx][symbol*frame_parms->N_RB_DL*12];
+#elif defined(__arm__)
+
+    ul_ch128 = (int16x4_t *)&ul_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*12];
+    ul_ch_mag128 = (int16x8_t *)&ul_ch_mag[aarx][symbol*frame_parms->N_RB_DL*12];
+    ul_ch_mag128b = (int16x8_t *)&ul_ch_magb[aarx][symbol*frame_parms->N_RB_DL*12];
+    rxdataF128 = (int16x4_t *)&rxdataF_ext[aarx][symbol*frame_parms->N_RB_DL*12];
+    rxdataF_comp128 = (int16x8_t *)&rxdataF_comp[aarx][symbol*frame_parms->N_RB_DL*12];
+
+#endif

    for (rb=0; rb<nb_rb; rb++) {
      // printf("comp: symbol %d rb %d\n",symbol,rb);
#ifdef OFDMA_ULSCH
      if (Qm>2) {
        // get channel amplitude if not QPSK
+#if defined(__x86_64__) || defined(__i386__)
        mmtmpU0 = _mm_madd_epi16(ul_ch128[0],ul_ch128[0]);
        mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift);
@@ -761,10 +921,36 @@ void ulsch_channel_compensation(int32_t **rxdataF_ext,
        ul_ch_mag128b[2] = _mm_mulhi_epi16(ul_ch_mag128b[2],QAM_amp128bU);
        ul_ch_mag128b[2] = _mm_slli_epi16(ul_ch_mag128b[2],2);// 2 to compensate the scale channel estimate
+
+#elif defined(__arm__)
+        mmtmpU0 = vmull_s16(ul_ch128[0], ul_ch128[0]);
+        mmtmpU0 = vqshlq_s32(vqaddq_s32(mmtmpU0,vrev64q_s32(mmtmpU0)),output_shift128);
+        mmtmpU1 = vmull_s16(ul_ch128[1], ul_ch128[1]);
+        mmtmpU1 = vqshlq_s32(vqaddq_s32(mmtmpU1,vrev64q_s32(mmtmpU1)),output_shift128);
+        mmtmpU2 = vcombine_s16(vmovn_s32(mmtmpU0),vmovn_s32(mmtmpU1));
+        mmtmpU0 = vmull_s16(ul_ch128[2], ul_ch128[2]);
+        mmtmpU0 = vqshlq_s32(vqaddq_s32(mmtmpU0,vrev64q_s32(mmtmpU0)),output_shift128);
+        mmtmpU1 = vmull_s16(ul_ch128[3], ul_ch128[3]);
+        mmtmpU1 = vqshlq_s32(vqaddq_s32(mmtmpU1,vrev64q_s32(mmtmpU1)),output_shift128);
+        mmtmpU3 = vcombine_s16(vmovn_s32(mmtmpU0),vmovn_s32(mmtmpU1));
+        mmtmpU0 = vmull_s16(ul_ch128[4], ul_ch128[4]);
+        mmtmpU0 = vqshlq_s32(vqaddq_s32(mmtmpU0,vrev64q_s32(mmtmpU0)),output_shift128);
+        mmtmpU1 = vmull_s16(ul_ch128[5], ul_ch128[5]);
+        mmtmpU1 = vqshlq_s32(vqaddq_s32(mmtmpU1,vrev64q_s32(mmtmpU1)),output_shift128);
+        mmtmpU4 = vcombine_s16(vmovn_s32(mmtmpU0),vmovn_s32(mmtmpU1));
+
+        ul_ch_mag128b[0] = vqdmulhq_s16(mmtmpU2,QAM_amp128bU);
+        ul_ch_mag128b[1] = vqdmulhq_s16(mmtmpU3,QAM_amp128bU);
+        ul_ch_mag128[0]  = vqdmulhq_s16(mmtmpU2,QAM_amp128U);
+        ul_ch_mag128[1]  = vqdmulhq_s16(mmtmpU3,QAM_amp128U);
+        ul_ch_mag128b[2] = vqdmulhq_s16(mmtmpU4,QAM_amp128bU);
+        ul_ch_mag128[2]  = vqdmulhq_s16(mmtmpU4,QAM_amp128U);
+#endif
      }

-#else
+#else // SC-FDMA
+// just compute the channel magnitude without scaling; for SC-FDMA the scaling is applied after equalization
+#if defined(__x86_64__) || defined(__i386__)
      mmtmpU0 = _mm_madd_epi16(ul_ch128[0],ul_ch128[0]);
      mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift);
@@ -784,8 +970,29 @@ void ulsch_channel_compensation(int32_t **rxdataF_ext,
      ul_ch_mag128[2] = _mm_unpacklo_epi16(mmtmpU1,mmtmpU1);
      // printf("comp: symbol %d rb %d => %d,%d,%d (output_shift %d)\n",symbol,rb,*((int16_t*)&ul_ch_mag128[0]),*((int16_t*)&ul_ch_mag128[1]),*((int16_t*)&ul_ch_mag128[2]),output_shift);
+
+
+#elif defined(__arm__)
+      mmtmpU0 = vmull_s16(ul_ch128[0], ul_ch128[0]);
+      mmtmpU0 = vqshlq_s32(vqaddq_s32(mmtmpU0,vrev64q_s32(mmtmpU0)),output_shift128);
+      mmtmpU1 = vmull_s16(ul_ch128[1], ul_ch128[1]);
+      mmtmpU1 = vqshlq_s32(vqaddq_s32(mmtmpU1,vrev64q_s32(mmtmpU1)),output_shift128);
+      ul_ch_mag128[0] = vcombine_s16(vmovn_s32(mmtmpU0),vmovn_s32(mmtmpU1));
+      mmtmpU0 = vmull_s16(ul_ch128[2], ul_ch128[2]);
+      mmtmpU0 = vqshlq_s32(vqaddq_s32(mmtmpU0,vrev64q_s32(mmtmpU0)),output_shift128);
+      mmtmpU1 = vmull_s16(ul_ch128[3], ul_ch128[3]);
+      mmtmpU1 = vqshlq_s32(vqaddq_s32(mmtmpU1,vrev64q_s32(mmtmpU1)),output_shift128);
+      ul_ch_mag128[1] = vcombine_s16(vmovn_s32(mmtmpU0),vmovn_s32(mmtmpU1));
+      mmtmpU0 = vmull_s16(ul_ch128[4], ul_ch128[4]);
+      mmtmpU0 = vqshlq_s32(vqaddq_s32(mmtmpU0,vrev64q_s32(mmtmpU0)),output_shift128);
+      mmtmpU1 = vmull_s16(ul_ch128[5], ul_ch128[5]);
+      mmtmpU1 = vqshlq_s32(vqaddq_s32(mmtmpU1,vrev64q_s32(mmtmpU1)),output_shift128);
+      ul_ch_mag128[2] = vcombine_s16(vmovn_s32(mmtmpU0),vmovn_s32(mmtmpU1));
+
+#endif
#endif

+#if defined(__x86_64__) || defined(__i386__)
      // multiply by conjugated channel
      mmtmpU0 = _mm_madd_epi16(ul_ch128[0],rxdataF128[0]);
      // print_ints("re",&mmtmpU0);
@@ -857,21 +1064,81 @@ void ulsch_channel_compensation(int32_t **rxdataF_ext,
      ul_ch_mag128b+=3;
      rxdataF128+=3;
      rxdataF_comp128+=3;
-
+#elif defined(__arm__)
+      mmtmpU0 = vmull_s16(ul_ch128[0], rxdataF128[0]);
+      //mmtmpU0 = [Re(ch[0])Re(rx[0]) Im(ch[0])Im(rx[0]) Re(ch[1])Re(rx[1]) Im(ch[1])Im(rx[1])]
+      mmtmpU1 = vmull_s16(ul_ch128[1], rxdataF128[1]);
+      //mmtmpU1 = [Re(ch[2])Re(rx[2]) Im(ch[2])Im(rx[2]) Re(ch[3])Re(rx[3]) Im(ch[3])Im(rx[3])]
+      mmtmpU0 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpU0),vget_high_s32(mmtmpU0)),
+                             vpadd_s32(vget_low_s32(mmtmpU1),vget_high_s32(mmtmpU1)));
+      //mmtmpU0 = [Re(ch[0])Re(rx[0])+Im(ch[0])Im(rx[0]) ... Re(ch[3])Re(rx[3])+Im(ch[3])Im(rx[3])]
+
+      mmtmpU0b = vmull_s16(vrev32_s16(vmul_s16(ul_ch128[0],*(int16x4_t*)conj)), rxdataF128[0]);
+      //mmtmpU0b = [-Im(ch[0])Re(rx[0]) Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1]) Re(ch[1])Im(rx[1])]
+      mmtmpU1b = vmull_s16(vrev32_s16(vmul_s16(ul_ch128[1],*(int16x4_t*)conj)), rxdataF128[1]);
+      //mmtmpU1b = [-Im(ch[2])Re(rx[2]) Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3]) Re(ch[3])Im(rx[3])]
+      mmtmpU1 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpU0b),vget_high_s32(mmtmpU0b)),
+                             vpadd_s32(vget_low_s32(mmtmpU1b),vget_high_s32(mmtmpU1b)));
+      //mmtmpU1 = [-Im(ch[0])Re(rx[0])+Re(ch[0])Im(rx[0]) ... -Im(ch[3])Re(rx[3])+Re(ch[3])Im(rx[3])]
+
+      mmtmpU0 = vqshlq_s32(mmtmpU0,output_shift128);
+      mmtmpU1 = vqshlq_s32(mmtmpU1,output_shift128);
+      xtmp = vzipq_s32(mmtmpU0,mmtmpU1);
+      rxdataF_comp128[0] = vcombine_s16(vmovn_s32(xtmp.val[0]),vmovn_s32(xtmp.val[1]));
+
+      mmtmpU0 = vmull_s16(ul_ch128[2], rxdataF128[2]);
+      mmtmpU1 = vmull_s16(ul_ch128[3], rxdataF128[3]);
+      mmtmpU0 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpU0),vget_high_s32(mmtmpU0)),
+                             vpadd_s32(vget_low_s32(mmtmpU1),vget_high_s32(mmtmpU1)));
+      mmtmpU0b = vmull_s16(vrev32_s16(vmul_s16(ul_ch128[2],*(int16x4_t*)conj)), rxdataF128[2]);
+      mmtmpU1b = vmull_s16(vrev32_s16(vmul_s16(ul_ch128[3],*(int16x4_t*)conj)), rxdataF128[3]);
+      mmtmpU1 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpU0b),vget_high_s32(mmtmpU0b)),
+                             vpadd_s32(vget_low_s32(mmtmpU1b),vget_high_s32(mmtmpU1b)));
+      mmtmpU0 = vqshlq_s32(mmtmpU0,output_shift128);
+      mmtmpU1 = vqshlq_s32(mmtmpU1,output_shift128);
+      xtmp = vzipq_s32(mmtmpU0,mmtmpU1);
+      rxdataF_comp128[1] = vcombine_s16(vmovn_s32(xtmp.val[0]),vmovn_s32(xtmp.val[1]));
+
+      mmtmpU0 = vmull_s16(ul_ch128[4], rxdataF128[4]);
+      mmtmpU1 = vmull_s16(ul_ch128[5], rxdataF128[5]);
+      mmtmpU0 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpU0),vget_high_s32(mmtmpU0)),
+                             vpadd_s32(vget_low_s32(mmtmpU1),vget_high_s32(mmtmpU1)));
+
+      mmtmpU0b = vmull_s16(vrev32_s16(vmul_s16(ul_ch128[4],*(int16x4_t*)conj)), rxdataF128[4]);
+      mmtmpU1b = vmull_s16(vrev32_s16(vmul_s16(ul_ch128[5],*(int16x4_t*)conj)), rxdataF128[5]);
+      mmtmpU1 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpU0b),vget_high_s32(mmtmpU0b)),
+                             vpadd_s32(vget_low_s32(mmtmpU1b),vget_high_s32(mmtmpU1b)));
+
+      mmtmpU0 = vqshlq_s32(mmtmpU0,output_shift128);
+      mmtmpU1 = vqshlq_s32(mmtmpU1,output_shift128);
+      xtmp = vzipq_s32(mmtmpU0,mmtmpU1);
+      rxdataF_comp128[2] = vcombine_s16(vmovn_s32(xtmp.val[0]),vmovn_s32(xtmp.val[1]));
+
+      // Add a jitter to compensate for the saturation in "packs" resulting in a bias on the DC after IDFT
+      rxdataF_comp128[0] = vqaddq_s16(rxdataF_comp128[0],(*(int16x8_t*)&jitter[0]));
+      rxdataF_comp128[1] = vqaddq_s16(rxdataF_comp128[1],(*(int16x8_t*)&jitter[0]));
+      rxdataF_comp128[2] = vqaddq_s16(rxdataF_comp128[2],(*(int16x8_t*)&jitter[0]));
+
+
+      ul_ch128+=6;
+      ul_ch_mag128+=3;
+      ul_ch_mag128b+=3;
+      rxdataF128+=6;
+      rxdataF_comp128+=3;
+
+#endif
    }
  }

+#if defined(__x86_64__) || defined(__i386__)
  _mm_empty();
  _m_empty();
-
+#endif
}

-
+#if defined(__x86_64__) || defined(__i386__)
__m128i
QAM_amp128U_0,QAM_amp128bU_0,QAM_amp128U_1,QAM_amp128bU_1; +#endif void ulsch_channel_compensation_alamouti(int32_t **rxdataF_ext, // For Distributed Alamouti Combining int32_t **ul_ch_estimates_ext_0, @@ -888,7 +1155,7 @@ void ulsch_channel_compensation_alamouti(int32_t **rxdataF_ext, uint16_t nb_rb, uint8_t output_shift) { - +#if defined(__x86_64__) || defined(__i386__) uint16_t rb; __m128i *ul_ch128_0,*ul_ch128_1,*ul_ch_mag128_0,*ul_ch_mag128_1,*ul_ch_mag128b_0,*ul_ch_mag128b_1,*rxdataF128,*rxdataF_comp128_0,*rxdataF_comp128_1; uint8_t aarx;//,symbol_mod; @@ -1156,7 +1423,7 @@ void ulsch_channel_compensation_alamouti(int32_t **rxdataF_ext, _mm_empty(); _m_empty(); - +#endif } @@ -1176,6 +1443,7 @@ void ulsch_alamouti(LTE_DL_FRAME_PARMS *frame_parms,// For Distributed Alamouti uint16_t nb_rb) { +#if defined(__x86_64__) || defined(__i386__) int16_t *rxF,*rxF0,*rxF1; __m128i *ch_mag,*ch_magb,*ch_mag0,*ch_mag1,*ch_mag0b,*ch_mag1b; uint8_t rb,re,aarx; @@ -1231,13 +1499,18 @@ void ulsch_alamouti(LTE_DL_FRAME_PARMS *frame_parms,// For Distributed Alamouti _mm_empty(); _m_empty(); +#endif } +#if defined(__x86_64__) || defined(__i386__) __m128i avg128U; +#elif defined(__arm__) +int32x4_t avg128U; +#endif void ulsch_channel_level(int32_t **drs_ch_estimates_ext, LTE_DL_FRAME_PARMS *frame_parms, @@ -1247,11 +1520,14 @@ void ulsch_channel_level(int32_t **drs_ch_estimates_ext, int16_t rb; uint8_t aarx; +#if defined(__x86_64__) || defined(__i386__) __m128i *ul_ch128; - - +#elif defined(__arm__) + int16x4_t *ul_ch128; +#endif for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) { //clear average level +#if defined(__x86_64__) || defined(__i386__) avg128U = _mm_setzero_si128(); ul_ch128=(__m128i *)drs_ch_estimates_ext[aarx]; @@ -1263,34 +1539,44 @@ void ulsch_channel_level(int32_t **drs_ch_estimates_ext, ul_ch128+=3; - if (rb==0) { - // print_shorts("ul_ch128",&ul_ch128[0]); - // print_shorts("ul_ch128",&ul_ch128[1]); - // print_shorts("ul_ch128",&ul_ch128[2]); - } } +#elif defined(__arm__) + avg128U = vdupq_n_s32(0); + ul_ch128=(int16x4_t *)drs_ch_estimates_ext[aarx]; + + for (rb=0; rb<nb_rb; rb++) { + + avg128U = vqaddq_s32(avg128U,vmull_s16(ul_ch128[0],ul_ch128[0])); + avg128U = vqaddq_s32(avg128U,vmull_s16(ul_ch128[1],ul_ch128[1])); + avg128U = vqaddq_s32(avg128U,vmull_s16(ul_ch128[2],ul_ch128[2])); + avg128U = vqaddq_s32(avg128U,vmull_s16(ul_ch128[3],ul_ch128[3])); + avg128U = vqaddq_s32(avg128U,vmull_s16(ul_ch128[4],ul_ch128[4])); + avg128U = vqaddq_s32(avg128U,vmull_s16(ul_ch128[5],ul_ch128[5])); + ul_ch128+=6; + + + } + +#endif + DevAssert( nb_rb ); avg[aarx] = (((int*)&avg128U)[0] + ((int*)&avg128U)[1] + ((int*)&avg128U)[2] + ((int*)&avg128U)[3])/(nb_rb*12); - // printf("Channel level : %d\n",avg[aarx]); } +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); - +#endif } int32_t avgU[2]; int32_t avgU_0[2],avgU_1[2]; // For the Distributed Alamouti Scheme -/* --> moved to LTE_eNB_PUSCH structure -int32_t ulsch_power[2]; -int32_t ulsch_power_0[2],ulsch_power_1[2];// For the distributed Alamouti Scheme -*/ void rx_ulsch(PHY_VARS_eNB *phy_vars_eNB, uint32_t sched_subframe, diff --git a/openair1/PHY/LTE_TRANSPORT/ulsch_modulation.c b/openair1/PHY/LTE_TRANSPORT/ulsch_modulation.c index 6ecca59ae4..46a49fb793 100644 --- a/openair1/PHY/LTE_TRANSPORT/ulsch_modulation.c +++ b/openair1/PHY/LTE_TRANSPORT/ulsch_modulation.c @@ -49,12 +49,15 @@ //#define DEBUG_ULSCH_MODULATION -__m128i dft_in128[4][1200],dft_in128[4][1200],dft_out128[4][1200],dft_out128[4][1200]; - #ifndef OFDMA_ULSCH void 
dft_lte(mod_sym_t *z,mod_sym_t *d, int32_t Msc_PUSCH, uint8_t Nsymb) { +#if defined(__x86_64__) || defined(__i386__) + __m128i dft_in128[4][1200],dft_out128[4][1200]; +#elif defined(__arm__) + int16x8_t dft_in128[4][1200],dft_out128[4][1200]; +#endif uint32_t *dft_in0=(uint32_t*)dft_in128[0],*dft_out0=(uint32_t*)dft_out128[0]; uint32_t *dft_in1=(uint32_t*)dft_in128[1],*dft_out1=(uint32_t*)dft_out128[1]; uint32_t *dft_in2=(uint32_t*)dft_in128[2],*dft_out2=(uint32_t*)dft_out128[2]; @@ -64,8 +67,11 @@ void dft_lte(mod_sym_t *z,mod_sym_t *d, int32_t Msc_PUSCH, uint8_t Nsymb) uint32_t *z0,*z1,*z2,*z3,*z4,*z5,*z6,*z7,*z8,*z9,*z10,*z11; uint32_t i,ip; +#if defined(__x86_64__) || defined(__i386__) __m128i norm128; - +#elif defined(__arm__) + int16x8_t norm128; +#endif // msg("Doing lte_dft for Msc_PUSCH %d\n",Msc_PUSCH); d0 = (uint32_t *)d; @@ -119,12 +125,21 @@ void dft_lte(mod_sym_t *z,mod_sym_t *d, int32_t Msc_PUSCH, uint8_t Nsymb) dft12f(&((__m128i *)dft_in2)[0],&((__m128i *)dft_in2)[1],&((__m128i *)dft_in2)[2],&((__m128i *)dft_in2)[3],&((__m128i *)dft_in2)[4],&((__m128i *)dft_in2)[5],&((__m128i *)dft_in2)[6],&((__m128i *)dft_in2)[7],&((__m128i *)dft_in2)[8],&((__m128i *)dft_in2)[9],&((__m128i *)dft_in2)[10],&((__m128i *)dft_in2)[11], &((__m128i *)dft_out2)[0],&((__m128i *)dft_out2)[1],&((__m128i *)dft_out2)[2],&((__m128i *)dft_out2)[3],&((__m128i *)dft_out2)[4],&((__m128i *)dft_out2)[5],&((__m128i *)dft_out2)[6],&((__m128i *)dft_out2)[7],&((__m128i *)dft_out2)[8],&((__m128i *)dft_out2)[9],&((__m128i *)dft_out2)[10],&((__m128i *)dft_out2)[11]); */ +#if defined(__x86_64__) || defined(__i386__) norm128 = _mm_set1_epi16(9459); - +#elif defined(__arm__) + norm128 = vdupq_n_s16(9459); +#endif for (i=0; i<12; i++) { +#if defined(__x86_64__) || defined(__i386__) ((__m128i*)dft_out0)[i] = _mm_slli_epi16(_mm_mulhi_epi16(((__m128i*)dft_out0)[i],norm128),1); ((__m128i*)dft_out1)[i] = _mm_slli_epi16(_mm_mulhi_epi16(((__m128i*)dft_out1)[i],norm128),1); ((__m128i*)dft_out2)[i] = _mm_slli_epi16(_mm_mulhi_epi16(((__m128i*)dft_out2)[i],norm128),1); +#elif defined(__arm__) + ((int16x8_t*)dft_out0)[i] = vqdmulhq_s16(((int16x8_t*)dft_out0)[i],norm128); + ((int16x8_t*)dft_out1)[i] = vqdmulhq_s16(((int16x8_t*)dft_out1)[i],norm128); + ((int16x8_t*)dft_out2)[i] = vqdmulhq_s16(((int16x8_t*)dft_out2)[i],norm128); +#endif } break; diff --git a/openair1/PHY/MODULATION/slot_fep.c b/openair1/PHY/MODULATION/slot_fep.c index da313c9b32..c591c7b528 100644 --- a/openair1/PHY/MODULATION/slot_fep.c +++ b/openair1/PHY/MODULATION/slot_fep.c @@ -35,12 +35,19 @@ void rescale(int16_t *input,int length) { - +#if defined(__x86_64__) || defined(__i386__) __m128i *input128 = (__m128i *)input; +#elif defined(__arm__) + int16x8_t *input128 = (int16x8_t *)input; +#endif int i; for (i=0; i<length>>2; i++) { +#if defined(__x86_64__) || defined(__i386__) input128[i] = _mm_srai_epi16(input128[i],4); +#elif defined(__arm__) + input128[i] = vshrq_n_s16(input128[i],4); +#endif } } diff --git a/openair1/PHY/MODULATION/ul_7_5_kHz.c b/openair1/PHY/MODULATION/ul_7_5_kHz.c index 34e62e0f98..0e7c1785fa 100755 --- a/openair1/PHY/MODULATION/ul_7_5_kHz.c +++ b/openair1/PHY/MODULATION/ul_7_5_kHz.c @@ -48,7 +48,13 @@ void apply_7_5_kHz(PHY_VARS_UE *phy_vars_ue,int32_t*txdata,uint8_t slot) uint16_t len; uint32_t *kHz7_5ptr; +#if defined(__x86_64__) || defined(__i386__) __m128i *txptr128,*kHz7_5ptr128,mmtmp_re,mmtmp_im,mmtmp_re2,mmtmp_im2; +#elif defined(__arm__) + int16x8_t *txptr128,*kHz7_5ptr128; + int32x4_t mmtmp_re,mmtmp_im; + int32x4_t mmtmp0,mmtmp1; 
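+  // NEON note: the 7.5 kHz offset is applied as a complex multiply split into two
+  // vmull_s16 passes, one producing the real parts and one (operating on a
+  // sign-flipped, element-swapped copy of the input) producing the imaginary parts;
+  // the per-lane layouts are spelled out in the comments inside the loop below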
+#endif uint32_t slot_offset; // uint8_t aa; uint32_t i; @@ -90,13 +96,17 @@ void apply_7_5_kHz(PHY_VARS_UE *phy_vars_ue,int32_t*txdata,uint8_t slot) // slot_offset += (len/4); len = phy_vars_ue->lte_frame_parms.samples_per_tti/2; - //for (aa=0;aa<phy_vars_ue->lte_frame_parms.nb_antennas_tx;aa++) { +#if defined(__x86_64__) || defined(__i386__) txptr128 = (__m128i *)&txdata[slot_offset]; kHz7_5ptr128 = (__m128i *)kHz7_5ptr; +#elif defined(__arm__) + txptr128 = (int16x8_t*)&txdata[slot_offset]; + kHz7_5ptr128 = (int16x8_t*)kHz7_5ptr; +#endif // apply 7.5 kHz - // if (((slot>>1)&1) == 0) { // apply the sinusoid from the table directly for (i=0; i<(len>>2); i++) { +#if defined(__x86_64__) || defined(__i386__) mmtmp_re = _mm_madd_epi16(*txptr128,*kHz7_5ptr128); // Real part of complex multiplication (note: 7_5kHz signal is conjugated for this to work) mmtmp_im = _mm_shufflelo_epi16(*kHz7_5ptr128,_MM_SHUFFLE(2,3,0,1)); @@ -107,39 +117,32 @@ void apply_7_5_kHz(PHY_VARS_UE *phy_vars_ue,int32_t*txdata,uint8_t slot) mmtmp_im = _mm_srai_epi32(mmtmp_im,15); mmtmp_re2 = _mm_unpacklo_epi32(mmtmp_re,mmtmp_im); mmtmp_im2 = _mm_unpackhi_epi32(mmtmp_re,mmtmp_im); - /* - printf("%d: (%d,%d) (%d,%d) (%d,%d) (%d,%d) x (%d,%d) (%d,%d) (%d,%d) (%d,%d) => ", - i, - ((short*)txptr128)[0], - ((short*)txptr128)[1], - ((short*)txptr128)[2], - ((short*)txptr128)[3], - ((short*)txptr128)[4], - ((short*)txptr128)[5], - ((short*)txptr128)[6], - ((short*)txptr128)[7], - ((short*)kHz7_5ptr128)[0], - ((short*)kHz7_5ptr128)[1], - ((short*)kHz7_5ptr128)[2], - ((short*)kHz7_5ptr128)[3], - ((short*)kHz7_5ptr128)[4], - ((short*)kHz7_5ptr128)[5], - ((short*)kHz7_5ptr128)[6], - ((short*)kHz7_5ptr128)[7]);*/ txptr128[0] = _mm_packs_epi32(mmtmp_re2,mmtmp_im2); - /* printf("%(%d,%d) (%d,%d) (%d,%d) (%d,%d)\n", - ((short*)txptr128)[0], - ((short*)txptr128)[1], - ((short*)txptr128)[2], - ((short*)txptr128)[3], - ((short*)txptr128)[4], - ((short*)txptr128)[5], - ((short*)txptr128)[6], - ((short*)txptr128)[7]);*/ - + txptr128++; + kHz7_5ptr128++; +#elif defined(__arm__) + + mmtmp0 = vmull_s16(((int16x4_t*)txptr128)[0],((int16x4_t*)kHz7_5ptr128)[0]); + //mmtmp0 = [Re(ch[0])Re(rx[0]) Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1]) Im(ch[1])Im(ch[1])] + mmtmp1 = vmull_s16(((int16x4_t*)txptr128)[1],((int16x4_t*)kHz7_5ptr128)[1]); + //mmtmp1 = [Re(ch[2])Re(rx[2]) Im(ch[2])Im(ch[2]) Re(ch[3])Re(rx[3]) Im(ch[3])Im(ch[3])] + mmtmp_re = vcombine_s32(vpadd_s32(vget_low_s32(mmtmp0),vget_high_s32(mmtmp0)), + vpadd_s32(vget_low_s32(mmtmp1),vget_high_s32(mmtmp1))); + //mmtmp_re = [Re(ch[0])Re(rx[0])+Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1])+Im(ch[1])Im(ch[1]) Re(ch[2])Re(rx[2])+Im(ch[2])Im(ch[2]) Re(ch[3])Re(rx[3])+Im(ch[3])Im(ch[3])] + + mmtmp0 = vmull_s16(vrev32_s16(vmul_s16(((int16x4_t*)txptr128)[0],*(int16x4_t*)conjugate75_2)),((int16x4_t*)kHz7_5ptr128)[0]); + //mmtmp0 = [-Im(ch[0])Re(rx[0]) Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1]) Re(ch[1])Im(rx[1])] + mmtmp1 = vmull_s16(vrev32_s16(vmul_s16(((int16x4_t*)txptr128)[1],*(int16x4_t*)conjugate75_2)), ((int16x4_t*)kHz7_5ptr128)[1]); + //mmtmp0 = [-Im(ch[2])Re(rx[2]) Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3]) Re(ch[3])Im(rx[3])] + mmtmp_im = vcombine_s32(vpadd_s32(vget_low_s32(mmtmp0),vget_high_s32(mmtmp0)), + vpadd_s32(vget_low_s32(mmtmp1),vget_high_s32(mmtmp1))); + //mmtmp_im = [-Im(ch[0])Re(rx[0])+Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1])+Re(ch[1])Im(rx[1]) -Im(ch[2])Re(rx[2])+Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3])+Re(ch[3])Im(rx[3])] + + txptr128[0] = vcombine_s16(vmovn_s32(mmtmp_re),vmovn_s32(mmtmp_im)); txptr128++; 
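+      // advance the 7.5 kHz reference table pointer in step with the tx pointer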
kHz7_5ptr128++; +#endif } //} @@ -154,7 +157,14 @@ void remove_7_5_kHz(PHY_VARS_eNB *phy_vars_eNB,uint8_t slot) int32_t **rxdata_7_5kHz=phy_vars_eNB->lte_eNB_common_vars.rxdata_7_5kHz[0]; uint16_t len; uint32_t *kHz7_5ptr; +#if defined(__x86_64__) || defined(__i386__) __m128i *rxptr128,*rxptr128_7_5kHz,*kHz7_5ptr128,kHz7_5_2,mmtmp_re,mmtmp_im,mmtmp_re2,mmtmp_im2; +#elif defined(__arm__) + int16x8_t *rxptr128,*kHz7_5ptr128,*rxptr128_7_5kHz; + int32x4_t mmtmp_re,mmtmp_im; + int32x4_t mmtmp0,mmtmp1; + +#endif uint32_t slot_offset,slot_offset2; uint8_t aa; uint32_t i; @@ -199,14 +209,21 @@ void remove_7_5_kHz(PHY_VARS_eNB *phy_vars_eNB,uint8_t slot) for (aa=0; aa<phy_vars_eNB->lte_frame_parms.nb_antennas_rx; aa++) { +#if defined(__x86_64__) || defined(__i386__) rxptr128 = (__m128i *)&rxdata[aa][slot_offset]; rxptr128_7_5kHz = (__m128i *)&rxdata_7_5kHz[aa][slot_offset2]; kHz7_5ptr128 = (__m128i *)kHz7_5ptr; - +#elif defined(__arm__) + rxptr128 = (int16x8_t *)&rxdata[aa][slot_offset]; + rxptr128_7_5kHz = (int16x8_t *)&rxdata_7_5kHz[aa][slot_offset2]; + kHz7_5ptr128 = (int16x8_t *)kHz7_5ptr; +#endif // apply 7.5 kHz // if (((slot>>1)&1) == 0) { // apply the sinusoid from the table directly for (i=0; i<(len>>2); i++) { + +#if defined(__x86_64__) || defined(__i386__) kHz7_5_2 = _mm_sign_epi16(*kHz7_5ptr128,*(__m128i*)&conjugate75_2[0]); mmtmp_re = _mm_madd_epi16(*rxptr128,kHz7_5_2); // Real part of complex multiplication (note: 7_5kHz signal is conjugated for this to work) @@ -223,350 +240,33 @@ void remove_7_5_kHz(PHY_VARS_eNB *phy_vars_eNB,uint8_t slot) rxptr128++; rxptr128_7_5kHz++; kHz7_5ptr128++; - } - } -} - - - -void apply_625_Hz(PHY_VARS_UE *phy_vars_ue,int16_t *prach) -{ - - uint32_t *Hz625ptr; - __m128i *txptr128,*Hz625ptr128,mmtmp_re,mmtmp_im,mmtmp_re2,mmtmp_im2; - uint8_t aa; - uint32_t Ncp,len; - uint32_t i; - LTE_DL_FRAME_PARMS *frame_parms=&phy_vars_ue->lte_frame_parms; - uint8_t frame_type = phy_vars_ue->lte_frame_parms.frame_type; - uint8_t prach_ConfigIndex = phy_vars_ue->lte_frame_parms.prach_config_common.prach_ConfigInfo.prach_ConfigIndex; - uint8_t prach_fmt = get_prach_fmt(prach_ConfigIndex,frame_type); - - switch (prach_fmt) { - case 0: - Ncp = 3168; - break; - - case 1: - case 3: - Ncp = 21024; - break; - - case 2: - Ncp = 6240; - break; - - case 4: - Ncp = 448; - break; - - default: - Ncp = 3168; - break; - } - - switch (frame_parms->N_RB_UL) { - - case 6: - Hz625ptr = (uint32_t*)sig625_1_25MHz; - len = 1536 + (Ncp>>4); - break; - - case 15: - Hz625ptr = (uint32_t*)sig625_2_5MHz; - len = 3072 + (Ncp>>3); - break; - - case 25: - Hz625ptr = (uint32_t*)sig625_5MHz; - len = 6144+(Ncp>>2); - break; - - case 50: - Hz625ptr = (uint32_t*)sig625_10MHz; - len = 12288+(Ncp>>1); - break; - - case 75: - Hz625ptr = (uint32_t*)sig625_15MHz; - len = 18432+((2*Ncp)/3); - break; - - case 100: - Hz625ptr = (uint32_t*)sig625_20MHz; - len = 24576+Ncp; - break; - - default: - Hz625ptr = (uint32_t*)sig625_5MHz; - len = 6144+(Ncp>>2); - break; - } - - for (aa=0; aa<phy_vars_ue->lte_frame_parms.nb_antennas_tx; aa++) { - txptr128 = (__m128i *)prach; - Hz625ptr128 = (__m128i *)Hz625ptr; - // apply 7.5 kHz - - // if (((slot>>1)&1) == 0) { // apply the sinusoid from the table directly - for (i=0; i<(len>>2); i++) { - mmtmp_re = _mm_madd_epi16(*txptr128,*Hz625ptr128); - // Real part of complex multiplication (note: 7_5kHz signal is conjugated for this to work) - mmtmp_im = _mm_shufflelo_epi16(*Hz625ptr128,_MM_SHUFFLE(2,3,0,1)); - mmtmp_im = _mm_shufflehi_epi16(mmtmp_im,_MM_SHUFFLE(2,3,0,1)); - mmtmp_im 
= _mm_sign_epi16(mmtmp_im,*(__m128i*)&conjugate75[0]); - mmtmp_im = _mm_madd_epi16(mmtmp_im,txptr128[0]); - mmtmp_re = _mm_srai_epi32(mmtmp_re,15); - mmtmp_im = _mm_srai_epi32(mmtmp_im,15); - mmtmp_re2 = _mm_unpacklo_epi32(mmtmp_re,mmtmp_im); - mmtmp_im2 = _mm_unpackhi_epi32(mmtmp_re,mmtmp_im); - /* - printf("%d: (%d,%d) (%d,%d) (%d,%d) (%d,%d) x (%d,%d) (%d,%d) (%d,%d) (%d,%d) => ", - i, - ((short*)txptr128)[0], - ((short*)txptr128)[1], - ((short*)txptr128)[2], - ((short*)txptr128)[3], - ((short*)txptr128)[4], - ((short*)txptr128)[5], - ((short*)txptr128)[6], - ((short*)txptr128)[7], - ((short*)Hz625ptr128)[0], - ((short*)Hz625ptr128)[1], - ((short*)Hz625ptr128)[2], - ((short*)Hz625ptr128)[3], - ((short*)Hz625ptr128)[4], - ((short*)Hz625ptr128)[5], - ((short*)Hz625ptr128)[6], - ((short*)Hz625ptr128)[7]);*/ - - txptr128[0] = _mm_packs_epi32(mmtmp_re2,mmtmp_im2); - /* printf("%(%d,%d) (%d,%d) (%d,%d) (%d,%d)\n", - ((short*)txptr128)[0], - ((short*)txptr128)[1], - ((short*)txptr128)[2], - ((short*)txptr128)[3], - ((short*)txptr128)[4], - ((short*)txptr128)[5], - ((short*)txptr128)[6], - ((short*)txptr128)[7]);*/ - - txptr128++; - Hz625ptr128++; - } - } -} - -void remove_625_Hz(PHY_VARS_eNB *phy_vars_eNB,int16_t *prach) -{ - - uint32_t *Hz625ptr; - __m128i *txptr128,*Hz625ptr128,Hz625_2,mmtmp_re,mmtmp_im,mmtmp_re2,mmtmp_im2; - uint8_t aa; - uint32_t i,Ncp,len; - LTE_DL_FRAME_PARMS *frame_parms=&phy_vars_eNB->lte_frame_parms; - uint8_t frame_type = frame_parms->frame_type; - uint8_t prach_ConfigIndex = frame_parms->prach_config_common.prach_ConfigInfo.prach_ConfigIndex; - uint8_t prach_fmt = get_prach_fmt(prach_ConfigIndex,frame_type); - - switch (prach_fmt) { - case 0: - Ncp = 3168; - break; - - case 1: - case 3: - Ncp = 21024; - break; - - case 2: - Ncp = 6240; - break; - - case 4: - Ncp = 448; - break; - default: - Ncp = 3168; - break; - } - - switch (frame_parms->N_RB_UL) { - - case 6: - Hz625ptr = (uint32_t*)sig625_1_25MHz; - len = 1536 + (Ncp>>4); - break; - - case 15: - Hz625ptr = (uint32_t*)sig625_2_5MHz; - len = 3072 + (Ncp>>3) ; - break; - - case 25: - Hz625ptr = (uint32_t*)sig625_5MHz; - len = 6144+(Ncp>>2); - break; - - case 50: - Hz625ptr = (uint32_t*)sig625_10MHz; - len = 12288+(Ncp>>1); - break; - - case 75: - Hz625ptr = (uint32_t*)sig625_15MHz; - len = 18432+((2*Ncp)/3); - break; - - case 100: - Hz625ptr = (uint32_t*)sig625_20MHz; - len = 24576+Ncp; - break; - - default: - Hz625ptr = (uint32_t*)sig625_5MHz; - len = 11400; - break; - } +#elif defined(__arm__) + + kHz7_5ptr128[0] = vmulq_s16(kHz7_5ptr128[0],((int16x8_t*)conjugate75_2)[0]); + mmtmp0 = vmull_s16(((int16x4_t*)rxptr128)[0],((int16x4_t*)kHz7_5ptr128)[0]); + //mmtmp0 = [Re(ch[0])Re(rx[0]) Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1]) Im(ch[1])Im(ch[1])] + mmtmp1 = vmull_s16(((int16x4_t*)rxptr128)[1],((int16x4_t*)kHz7_5ptr128)[1]); + //mmtmp1 = [Re(ch[2])Re(rx[2]) Im(ch[2])Im(ch[2]) Re(ch[3])Re(rx[3]) Im(ch[3])Im(ch[3])] + mmtmp_re = vcombine_s32(vpadd_s32(vget_low_s32(mmtmp0),vget_high_s32(mmtmp0)), + vpadd_s32(vget_low_s32(mmtmp1),vget_high_s32(mmtmp1))); + //mmtmp_re = [Re(ch[0])Re(rx[0])+Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1])+Im(ch[1])Im(ch[1]) Re(ch[2])Re(rx[2])+Im(ch[2])Im(ch[2]) Re(ch[3])Re(rx[3])+Im(ch[3])Im(ch[3])] + + mmtmp0 = vmull_s16(vrev32_s16(vmul_s16(((int16x4_t*)rxptr128)[0],*(int16x4_t*)conjugate75_2)), ((int16x4_t*)kHz7_5ptr128)[0]); + //mmtmp0 = [-Im(ch[0])Re(rx[0]) Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1]) Re(ch[1])Im(rx[1])] + mmtmp1 = 
vmull_s16(vrev32_s16(vmul_s16(((int16x4_t*)rxptr128)[1],*(int16x4_t*)conjugate75_2)), ((int16x4_t*)kHz7_5ptr128)[1]); + //mmtmp1 = [-Im(ch[2])Re(rx[2]) Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3]) Re(ch[3])Im(rx[3])] + mmtmp_im = vcombine_s32(vpadd_s32(vget_low_s32(mmtmp0),vget_high_s32(mmtmp0)), + vpadd_s32(vget_low_s32(mmtmp1),vget_high_s32(mmtmp1))); + //mmtmp_im = [-Im(ch[0])Re(rx[0])+Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1])+Re(ch[1])Im(rx[1]) -Im(ch[2])Re(rx[2])+Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3])+Re(ch[3])Im(rx[3])] + + rxptr128_7_5kHz[0] = vcombine_s16(vmovn_s32(mmtmp_re),vmovn_s32(mmtmp_im)); + rxptr128_7_5kHz++; + rxptr128++; + kHz7_5ptr128++; - for (aa=0; aa<phy_vars_eNB->lte_frame_parms.nb_antennas_tx; aa++) { - txptr128 = (__m128i *)prach; - Hz625ptr128 = (__m128i *)Hz625ptr; - // apply 7.5 kHz - // if (((slot>>1)&1) == 0) { // apply the sinusoid from the table directly - for (i=0; i<(len>>2); i++) { - Hz625_2 = _mm_sign_epi16(*Hz625ptr128,*(__m128i*)&conjugate75_2[0]); - mmtmp_re = _mm_madd_epi16(*txptr128,Hz625_2); - // Real part of complex multiplication (note: 7_5kHz signal is conjugated for this to work) - mmtmp_im = _mm_shufflelo_epi16(Hz625_2,_MM_SHUFFLE(2,3,0,1)); - mmtmp_im = _mm_shufflehi_epi16(mmtmp_im,_MM_SHUFFLE(2,3,0,1)); - mmtmp_im = _mm_sign_epi16(mmtmp_im,*(__m128i*)&conjugate75[0]); - mmtmp_im = _mm_madd_epi16(mmtmp_im,txptr128[0]); - mmtmp_re = _mm_srai_epi32(mmtmp_re,15); - mmtmp_im = _mm_srai_epi32(mmtmp_im,15); - mmtmp_re2 = _mm_unpacklo_epi32(mmtmp_re,mmtmp_im); - mmtmp_im2 = _mm_unpackhi_epi32(mmtmp_re,mmtmp_im); - /* - printf("%d: (%d,%d) (%d,%d) (%d,%d) (%d,%d) x (%d,%d) (%d,%d) (%d,%d) (%d,%d) => ", - i, - ((short*)txptr128)[0], - ((short*)txptr128)[1], - ((short*)txptr128)[2], - ((short*)txptr128)[3], - ((short*)txptr128)[4], - ((short*)txptr128)[5], - ((short*)txptr128)[6], - ((short*)txptr128)[7], - ((short*)Hz625ptr128)[0], - ((short*)Hz625ptr128)[1], - ((short*)Hz625ptr128)[2], - ((short*)Hz625ptr128)[3], - ((short*)Hz625ptr128)[4], - ((short*)Hz625ptr128)[5], - ((short*)Hz625ptr128)[6], - ((short*)Hz625ptr128)[7]);*/ - - txptr128[0] = _mm_packs_epi32(mmtmp_re2,mmtmp_im2); - /* printf("%(%d,%d) (%d,%d) (%d,%d) (%d,%d)\n", - ((short*)txptr128)[0], - ((short*)txptr128)[1], - ((short*)txptr128)[2], - ((short*)txptr128)[3], - ((short*)txptr128)[4], - ((short*)txptr128)[5], - ((short*)txptr128)[6], - ((short*)txptr128)[7]);*/ - - txptr128++; - Hz625ptr128++; +#endif } } } - - -void init_prach625(LTE_DL_FRAME_PARMS *frame_parms) -{ - - uint32_t len,i,Ncp; - double fs; - int16_t *Hz625ptr; - uint8_t frame_type = frame_parms->frame_type; - uint8_t prach_ConfigIndex = frame_parms->prach_config_common.prach_ConfigInfo.prach_ConfigIndex; - uint8_t prach_fmt = get_prach_fmt(prach_ConfigIndex,frame_type); - - switch (prach_fmt) { - case 0: - Ncp = 3168; - break; - - case 1: - case 3: - Ncp = 21024; - break; - - case 2: - Ncp = 6240; - break; - - case 4: - Ncp = 448; - break; - - default: - Ncp = 3168; - break; - } - - switch (frame_parms->N_RB_UL) { - case 6: - len = 1536 + (Ncp>>4); - fs = 1920000.0; - Hz625ptr = sig625_1_25MHz; - break; - - case 15: - len = 3072 + (Ncp>>3) ; - fs = 3840000.0; - Hz625ptr = sig625_2_5MHz; - break; - - case 25: - len = 6144+(Ncp>>2); - fs = 7680000.0; - Hz625ptr = sig625_5MHz; - break; - - case 50: - len = 12288+(Ncp>>1); - fs = 15360000.0; - Hz625ptr = sig625_10MHz; - break; - - case 75: - len = 18432+((2*Ncp)/3); - fs = 23040000.0; - Hz625ptr = sig625_15MHz; - break; - - case 100: - len = 24576+Ncp; - fs = 30720000.0; - Hz625ptr 
= sig625_20MHz;
-    break;
-
-  default:
-    len = 6144+(Ncp>>2);
-    fs = 7680000.0;
-    Hz625ptr = sig625_5MHz;
-    break;
-  }
-
-  for (i=0; i<len; i++) {
-    Hz625ptr[i<<1] = (int16_t)floor(32767.0*cos(2*M_PI*625*i/fs));
-    Hz625ptr[1+(i<<1)] = (int16_t)floor(32767.0*sin(2*M_PI*625*i/fs));
-    // printf("prach625 %d: (%d,%d)\n",i,Hz625ptr[i<<1],Hz625ptr[1+(i<<1)]);
-  }
-
-}
diff --git a/openair1/PHY/TOOLS/cdot_prod.c b/openair1/PHY/TOOLS/cdot_prod.c
index b6a4095ffe..ee8425fd1a 100644
--- a/openair1/PHY/TOOLS/cdot_prod.c
+++ b/openair1/PHY/TOOLS/cdot_prod.c
@@ -43,10 +43,12 @@ int32_t dot_product(int16_t *x,
                     uint8_t output_shift)
{
+  uint32_t n;
+
+#if defined(__x86_64__) || defined(__i386__)
  __m128i *x128,*y128,mmtmp1,mmtmp2,mmtmp3,mmcumul,mmcumul_re,mmcumul_im;
  __m64 mmtmp7;
  __m128i minus_i = _mm_set_epi16(-1,1,-1,1,-1,1,-1,1);
-  uint32_t n;
  int32_t result;

  x128 = (__m128i*) x;
@@ -113,11 +115,54 @@ int32_t dot_product(int16_t *x,
  // convert back to integer
  result = _mm_cvtsi64_si32(mmtmp7);

  _mm_empty();
  _m_empty();

+  return(result);
+
+#elif defined(__arm__)
+  int16x4_t *x_128=(int16x4_t*)x;
+  int16x4_t *y_128=(int16x4_t*)y;
+  int32x4_t tmp_re,tmp_im;
+  int32x4_t tmp_re1,tmp_im1;
+  int32x4_t re_cumul,im_cumul;
+  int32x2_t re_cumul2,im_cumul2;
+  int32x4_t shift = vdupq_n_s32(-output_shift);
+  int32x2x2_t result2;
+  int16_t conjug[4]__attribute__((aligned(16))) = {-1,1,-1,1};
-  return(result);
+
+  re_cumul = vdupq_n_s32(0);
+  im_cumul = vdupq_n_s32(0);
+
+  for (n=0; n<(N>>2); n++) {
+
+    tmp_re = vmull_s16(x_128[0], y_128[0]);
+    //tmp_re = [Re(x[0])Re(y[0]) Im(x[0])Im(y[0]) Re(x[1])Re(y[1]) Im(x[1])Im(y[1])]
+    tmp_re1 = vmull_s16(x_128[1], y_128[1]);
+    //tmp_re1 = [Re(x[2])Re(y[2]) Im(x[2])Im(y[2]) Re(x[3])Re(y[3]) Im(x[3])Im(y[3])]
+    tmp_re = vcombine_s32(vpadd_s32(vget_low_s32(tmp_re),vget_high_s32(tmp_re)),
+                          vpadd_s32(vget_low_s32(tmp_re1),vget_high_s32(tmp_re1)));
+    //tmp_re = [Re(x[0])Re(y[0])+Im(x[0])Im(y[0]) ... Re(x[3])Re(y[3])+Im(x[3])Im(y[3])]
+
+    tmp_im = vmull_s16(vrev32_s16(vmul_s16(x_128[0],*(int16x4_t*)conjug)),y_128[0]);
+    //tmp_im = [-Im(x[0])Re(y[0]) Re(x[0])Im(y[0]) -Im(x[1])Re(y[1]) Re(x[1])Im(y[1])]
+    tmp_im1 = vmull_s16(vrev32_s16(vmul_s16(x_128[1],*(int16x4_t*)conjug)),y_128[1]);
+    //tmp_im1 = [-Im(x[2])Re(y[2]) Re(x[2])Im(y[2]) -Im(x[3])Re(y[3]) Re(x[3])Im(y[3])]
+    tmp_im = vcombine_s32(vpadd_s32(vget_low_s32(tmp_im),vget_high_s32(tmp_im)),
+                          vpadd_s32(vget_low_s32(tmp_im1),vget_high_s32(tmp_im1)));
+    //tmp_im = [-Im(x[0])Re(y[0])+Re(x[0])Im(y[0]) ... -Im(x[3])Re(y[3])+Re(x[3])Im(y[3])]
+
+    x_128+=2;
+    y_128+=2;
+    re_cumul = vqaddq_s32(re_cumul,vqshlq_s32(tmp_re,shift));
+    im_cumul = vqaddq_s32(im_cumul,vqshlq_s32(tmp_im,shift));
+  }
+
+  re_cumul2 = vpadd_s32(vget_low_s32(re_cumul),vget_high_s32(re_cumul));
+  im_cumul2 = vpadd_s32(vget_low_s32(im_cumul),vget_high_s32(im_cumul));
+  re_cumul2 = vpadd_s32(re_cumul2,re_cumul2);
+  im_cumul2 = vpadd_s32(im_cumul2,im_cumul2);
+  result2 = vzip_s32(re_cumul2,im_cumul2);
+  return(vget_lane_s32(result2.val[0],0));
+#endif
}
diff --git a/openair1/PHY/TOOLS/cmult_sv.c b/openair1/PHY/TOOLS/cmult_sv.c
index 964ae43fa9..c9d3a8a50e 100644
--- a/openair1/PHY/TOOLS/cmult_sv.c
+++ b/openair1/PHY/TOOLS/cmult_sv.c
@@ -29,9 +29,27 @@
 #include "PHY/sse_intrin.h"
 #include "defs.h"

-#ifndef EXPRESSMIMO_TARGET
-static __m128i alpha_128 __attribute__ ((aligned(16)));
-static __m128i shift
__attribute__ ((aligned(16))); +#if defined(__x86_64__) || defined(__i386__) +#define simd_q15_t __m128i +#define simdshort_q15_t __m64 +#define shiftright_int16(a,shift) _mm_srai_epi16(a,shift) +#define set1_int16(a) _mm_set1_epi16(a) +#define mulhi_int16(a,b) _mm_slli_epi16(_mm_mulhi_epi16(a,b),1) +#define mulhi_s1_int16(a,b) _mm_slli_epi16(_mm_mulhi_epi16(a,b),2) +#define adds_int16(a,b) _mm_adds_epi16(a,b) +#define mullo_int16(a,b) _mm_mullo_epi16(a,b) +#elif defined(__arm__) +#define simd_q15_t int16x8_t +#define simdshort_q15_t int16x4_t +#define shiftright_int16(a,shift) vshrq_n_s16(a,shift) +#define set1_int16(a) vdupq_n_s16(a) +#define mulhi_int16(a,b) vqdmulhq_s16(a,b) +#define mulhi_s1_int16(a,b) vshlq_n_s16(vqdmulhq_s16(a,b),1) +#define adds_int16(a,b) vqaddq_s16(a,b) +#define mullo_int16(a,b) vmulq_s16(a,b) +#define _mm_empty() +#define _m_empty() +#endif void multadd_complex_vector_real_scalar(int16_t *x, @@ -41,19 +59,19 @@ void multadd_complex_vector_real_scalar(int16_t *x, uint32_t N) { - __m128i alpha_128,*x_128=(__m128i*)x,*y_128=(__m128i*)y; + simd_q15_t alpha_128,*x_128=(simd_q15_t *)x,*y_128=(simd_q15_t*)y; int n; - alpha_128 = _mm_set1_epi16(alpha); + alpha_128 = set1_int16(alpha); if (zero_flag == 1) for (n=0; n<N>>2; n++) { - y_128[n] = _mm_slli_epi16(_mm_mulhi_epi16(x_128[n],alpha_128),1); + y_128[n] = mulhi_int16(x_128[n],alpha_128); } else for (n=0; n<N>>2; n++) { - y_128[n] = _mm_adds_epi16(y_128[n],_mm_slli_epi16(_mm_mulhi_epi16(x_128[n],alpha_128),1)); + y_128[n] = adds_int16(y_128[n],mulhi_int16(x_128[n],alpha_128)); } _mm_empty(); @@ -69,32 +87,33 @@ void multadd_real_vector_complex_scalar(int16_t *x, uint32_t i; // do 8 multiplications at a time - __m128i alpha_r_128,alpha_i_128,yr,yi,*x_128=(__m128i*)x,*y_128=(__m128i*)y; + simd_q15_t alpha_r_128,alpha_i_128,yr,yi,*x_128=(simd_q15_t*)x,*y_128=(simd_q15_t*)y; int j; - // printf("alpha = %d,%d\n",alpha[0],alpha[1]); - alpha_r_128 = _mm_set_epi16(alpha[0],alpha[0],alpha[0],alpha[0],alpha[0],alpha[0],alpha[0],alpha[0]); - alpha_i_128 = _mm_set_epi16(alpha[1],alpha[1],alpha[1],alpha[1],alpha[1],alpha[1],alpha[1],alpha[1]); - + alpha_r_128 = set1_int16(alpha[0]); + alpha_i_128 = set1_int16(alpha[1]); j=0; for (i=0; i<N>>3; i++) { - yr = _mm_slli_epi16(_mm_mulhi_epi16(alpha_r_128,x_128[i]),2); - yi = _mm_slli_epi16(_mm_mulhi_epi16(alpha_i_128,x_128[i]),2); - - // print_shorts("yr",&yr); - // print_shorts("yi",&yi); - + yr = mulhi_s1_int16(alpha_r_128,x_128[i]); + yi = mulhi_s1_int16(alpha_i_128,x_128[i]); +#if defined(__x86_64__) || defined(__i386__) y_128[j] = _mm_adds_epi16(y_128[j],_mm_unpacklo_epi16(yr,yi)); - // print_shorts("y",&y_128[j]); j++; y_128[j] = _mm_adds_epi16(y_128[j],_mm_unpackhi_epi16(yr,yi)); - // print_shorts("y",&y_128[j]); j++; - +#elif defined(__arm__) + int16x8x2_t yint; + yint = vzipq_s16(yr,yi); + y_128[j] = adds_int16(y_128[j],yint.val[0]); + j++; + y_128[j] = adds_int16(y_128[j],yint.val[1]); + + j++; +#endif } _mm_empty(); @@ -102,6 +121,7 @@ void multadd_real_vector_complex_scalar(int16_t *x, } +/* int rotate_cpx_vector(int16_t *x, int16_t *alpha, int16_t *y, @@ -127,23 +147,10 @@ int rotate_cpx_vector(int16_t *x, register __m128i m0,m1; - // short *temps; - // int *tempd; - __m128i *x_128; __m128i *y_128; - // __m128i temp; - - /* - msg("rotate_cpx_vector: %x,%x,%x,%d,%d\n", - x, - alpha, - y, - N, - output_shift); - */ shift = _mm_cvtsi32_si128(output_shift); @@ -177,111 +184,44 @@ int rotate_cpx_vector(int16_t *x, // we compute 4 cpx multiply for each loop for(i=0; i<(N>>3); 
i++) { - // printf("i=%d\n",i); - /* - temps = (short *)x_128; - printf("x1 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - temps = (int16_t *)&alpha_128; - printf("alpha : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - */ - m0 = _mm_madd_epi16(x_128[0],alpha_128); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - - /* - temp = m0; - - tempd = &temp; - printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - - */ m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - /* - temp = m0; - - tempd = (int *)&temp; - printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - */ - m1=m0; m0 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im] y_128[0] = _mm_unpacklo_epi32(m0,m0); // 1- pack in a 128 bit register [re im re im] - - // temps = (int16_t *)&y_128[0]; - // printf("y0 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - - - m0 = _mm_madd_epi16(x_128[1],alpha_128); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - - - m1 = m0; m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im] - y_128[1] = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im] - - - m0 = _mm_madd_epi16(x_128[2],alpha_128); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - - - m1 = m0; m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im] - y_128[2] = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im] - - - m0 = _mm_madd_epi16(x_128[3],alpha_128); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - - - m1 = m0; m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im] - y_128[3] = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im] - - if (format==1) { // Put output in proper format (Re,-Im,Im,Re), shuffle = (0,1,3,2) = 0x1e - // print_shorts(y_128[0],"y_128[0]="); y_128[0] = _mm_shufflelo_epi16(y_128[0],0x1e); y_128[0] = _mm_shufflehi_epi16(y_128[0],0x1e); ((int16_t*)&y_128[0])[1] = -((int16_t*)&y_128[0])[1]; ((int16_t*)&y_128[0])[5] = -((int16_t*)&y_128[0])[5]; - // print_shorts(y_128[0],"y_128[0]="); - - // print_shorts(y_128[1],"y_128[1]="); y_128[1] = _mm_shufflelo_epi16(y_128[1],0x1e); y_128[1] = _mm_shufflehi_epi16(y_128[1],0x1e); ((int16_t*)&y_128[1])[1] = -((int16_t*)&y_128[1])[1]; ((int16_t*)&y_128[1])[5] = -((int16_t*)&y_128[1])[5]; - // print_shorts(y_128[1],"y_128[1]="); - - // print_shorts(y_128[2],"y_128[2]="); y_128[2] = _mm_shufflelo_epi16(y_128[2],0x1e); y_128[2] = _mm_shufflehi_epi16(y_128[2],0x1e); ((int16_t*)&y_128[2])[1] = -((int16_t*)&y_128[2])[1]; ((int16_t*)&y_128[2])[5] = -((int16_t*)&y_128[2])[5]; - // print_shorts(y_128[2],"y_128[2]="); - - // print_shorts(y_128[3],"y_128[3]="); y_128[3] = _mm_shufflelo_epi16(y_128[3],0x1e); y_128[3] = _mm_shufflehi_epi16(y_128[3],0x1e); ((int16_t*)&y_128[3])[1] = -((int16_t*)&y_128[3])[1]; ((int16_t*)&y_128[3])[5] = -((int16_t*)&y_128[3])[5]; - // print_shorts(y_128[3],"y_128[3]="); - } @@ -326,16 +266,6 @@ int rotate_cpx_vector2(int16_t *x, __m128i *y_128; - 
/* - printf("rotate_cpx_vector2: %x,%x,%x,%d,%d\n", - x, - alpha, - y, - N, - output_shift); - */ - - shift = _mm_cvtsi32_si128(output_shift); x_128 = (__m128i *)&x[0]; @@ -361,51 +291,22 @@ int rotate_cpx_vector2(int16_t *x, y_128 = (__m128i *)&y[0]; - // _mm_empty(); - // return(0); - // we compute 4 cpx multiply for each loop for(i=0; i<(N>>1); i++) { - // temps = (short *)&x_128[i]; - // printf("x1 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - // temps = (short *)&alpha_128; - // printf("x2 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - m0 = _mm_madd_epi16(x_128[i],alpha_128); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - - // temp = m0; - - // tempd = &temp; - // printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - // temp = m0; - - // tempd = (int *)&temp; - // printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - m1=m0; m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im] - - - y_128[i] = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im] - - if (format==1) { // Put output in proper format (Re,-Im,Im,Re), shuffle = (0,1,3,2) = 0x1e - // print_shorts(y_128[0],"y_128[0]="); y_128[i] = _mm_shufflelo_epi16(y_128[i],0x1e); y_128[i] = _mm_shufflehi_epi16(y_128[i],0x1e); ((int16_t*)&y_128[i])[1] = -((int16_t*)&y_128[i])[1]; ((int16_t*)&y_128[i])[5] = -((int16_t*)&y_128[i])[5]; - // print_shorts(y_128[0],"y_128[0]="); - } - } @@ -415,12 +316,13 @@ int rotate_cpx_vector2(int16_t *x, return(0); } +*/ -int rotate_cpx_vector_norep(int16_t *x, - int16_t *alpha, - int16_t *y, - uint32_t N, - uint16_t output_shift) +int rotate_cpx_vector(int16_t *x, + int16_t *alpha, + int16_t *y, + uint32_t N, + uint16_t output_shift) { // Multiply elementwise two complex vectors of N elements // x - input 1 in the format |Re0 Im0 |,......,|Re(N-1) Im(N-1)| @@ -438,20 +340,13 @@ int rotate_cpx_vector_norep(int16_t *x, uint32_t i; // loop counter - register __m128i m0,m1,m2,m3; - - // int16_t *temps; - // int *tempd; - int *xd; - //__m128i *x_128; - __m128i *y_128; - // __m128i temp; + simd_q15_t *y_128,alpha_128; + int32_t *xd=(int32_t *)x; - - shift = _mm_cvtsi32_si128(output_shift); - xd = (int *) x; - y_128 = (__m128i *) y; +#if defined(__x86_64__) || defined(__i386__) + __m128i shift = _mm_cvtsi32_si128(output_shift); + register simd_q15_t m0,m1,m2,m3; ((int16_t *)&alpha_128)[0] = alpha[0]; ((int16_t *)&alpha_128)[1] = -alpha[1]; @@ -461,43 +356,55 @@ int rotate_cpx_vector_norep(int16_t *x, ((int16_t *)&alpha_128)[5] = -alpha[1]; ((int16_t *)&alpha_128)[6] = alpha[1]; ((int16_t *)&alpha_128)[7] = alpha[0]; +#elif defined(__arm__) + int32x4_t shift; + int32x4_t ab_re0,ab_re1,ab_im0,ab_im1,re32,im32; + int16_t reflip[8] __attribute__((aligned(16))) = {1,-1,1,-1,1,-1,1,-1}; + int32x4x2_t xtmp; - // _mm_empty(); - // return(0); + ((int16_t *)&alpha_128)[0] = alpha[0]; + ((int16_t *)&alpha_128)[1] = alpha[1]; + ((int16_t *)&alpha_128)[2] = alpha[0]; + ((int16_t *)&alpha_128)[3] = alpha[1]; + ((int16_t *)&alpha_128)[4] = alpha[0]; + ((int16_t *)&alpha_128)[5] = alpha[1]; + ((int16_t *)&alpha_128)[6] = alpha[0]; + ((int16_t *)&alpha_128)[7] = alpha[1]; + int16x8_t bflip = vrev32q_s16(alpha_128); + int16x8_t bconj = vmulq_s16(alpha_128,*(int16x8_t *)reflip); + shift = vdupq_n_s32(-output_shift); +#endif + y_128 = 
(simd_q15_t *) y; - for(i=0; i<N>>2; i++) { + for(i=0; i<N>>2; i++) { +#if defined(__x86_64__) || defined(__i386__) m0 = _mm_setr_epi32(xd[0],xd[0],xd[1],xd[1]); m1 = _mm_setr_epi32(xd[2],xd[2],xd[3],xd[3]); - - // printf("i=%d\n",i); - // temps = (short *)x1_128; - // printf("x1 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - // temps = (short *)x2_128; - // printf("x2 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - m2 = _mm_madd_epi16(m0,alpha_128); //complex multiply. result is 32bit [Re Im Re Im] m3 = _mm_madd_epi16(m1,alpha_128); //complex multiply. result is 32bit [Re Im Re Im] - - // temp = m0; - - // tempd = &temp; - // printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - m2 = _mm_sra_epi32(m2,shift); // shift right by shift in order to compensate for the input amplitude m3 = _mm_sra_epi32(m3,shift); // shift right by shift in order to compensate for the input amplitude - // temp = m0; - - // tempd = (int *)&temp; - // printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - - y_128[0] = _mm_packs_epi32(m2,m3); // pack in 16bit integers with saturation [re im re im re im re im] +#elif defined(__arm__) + + ab_re0 = vmull_s16(((int16x4_t*)xd)[0],((int16x4_t*)&bconj)[0]); + ab_re1 = vmull_s16(((int16x4_t*)xd)[1],((int16x4_t*)&bconj)[1]); + ab_im0 = vmull_s16(((int16x4_t*)xd)[0],((int16x4_t*)&bflip)[0]); + ab_im1 = vmull_s16(((int16x4_t*)xd)[1],((int16x4_t*)&bflip)[1]); + re32 = vshlq_s32(vcombine_s32(vpadd_s32(((int32x2_t*)&ab_re0)[0],((int32x2_t*)&ab_re0)[1]), + vpadd_s32(((int32x2_t*)&ab_re1)[0],((int32x2_t*)&ab_re1)[1])), + shift); + im32 = vshlq_s32(vcombine_s32(vpadd_s32(((int32x2_t*)&ab_im0)[0],((int32x2_t*)&ab_im0)[1]), + vpadd_s32(((int32x2_t*)&ab_im1)[0],((int32x2_t*)&ab_im1)[1])), + shift); - // temps = (short *)&y_128[0]; - // printf("y0 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); + xtmp = vzipq_s32(re32,im32); + + y_128[0] = vcombine_s16(vmovn_s32(xtmp.val[0]),vmovn_s32(xtmp.val[1])); +#endif xd+=4; y_128+=1; @@ -510,7 +417,7 @@ int rotate_cpx_vector_norep(int16_t *x, return(0); } - +/* int mult_vector32_scalar(int16_t *x1, int x2, int16_t *y, @@ -530,16 +437,6 @@ int mult_vector32_scalar(int16_t *x1, uint32_t i; // loop counter - /* - #ifdef USER_MODE - char *tempc; - short *temps; - int *tempd; - long long *templ; - __m128i temp; - #endif - */ - __m128i *x1_128; __m128i x2_128; __m128i *y_128; @@ -553,20 +450,6 @@ int mult_vector32_scalar(int16_t *x1, // we compute 4 cpx multiply for each loop for(i=0; i<(N>>3); i++) { y_128[0] = _mm_mul_epu32(x1_128[0],x2_128); - - /* - #ifdef USER_MODE - printf("i=%d\n",i); - tempd = (int *)x1_128; - printf("x1 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - tempd = (int *)&x2_128; - printf("x2 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - // tempd = (int *)y_128; - // printf("y : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - templ = (long long *)y_128; - printf("y : %lld,%lld\n",templ[0],templ[1]); - #endif - */ y_128[1] = _mm_mul_epu32(x1_128[1],x2_128); y_128[2] = _mm_mul_epu32(x1_128[2],x2_128); y_128[3] = _mm_mul_epu32(x1_128[3],x2_128); @@ -582,7 +465,7 @@ int mult_vector32_scalar(int16_t *x1, return(0); } - +*/ int complex_conjugate(int16_t *x1, int16_t *y, @@ -591,46 +474,20 @@ int complex_conjugate(int16_t *x1, { uint32_t i; // loop counter - /* - #ifdef USER_MODE - char *tempc; - short *temps; - int 
*tempd; - long long *templ; - __m128i temp; - #endif - */ - - __m128i *x1_128; - __m128i x2_128; - __m128i *y_128; - - - x1_128 = (__m128i *)&x1[0]; - x2_128 = _mm_set_epi16(-1,1,-1,1,-1,1,-1,1); - y_128 = (__m128i *)&y[0]; + simd_q15_t *x1_128; + simd_q15_t *y_128; + int16_t x2[8] __attribute__((aligned(16))) = {1,-1,1,-1,1,-1,1,-1}; + simd_q15_t *x2_128 = (simd_q15_t*)&x2[0]; + x1_128 = (simd_q15_t *)&x1[0]; + y_128 = (simd_q15_t *)&y[0]; // we compute 4 cpx multiply for each loop for(i=0; i<(N>>3); i++) { - y_128[0] = _mm_mullo_epi16(x1_128[0],x2_128); - - /* - #ifdef USER_MODE - printf("i=%d\n",i); - tempd = (int *)x1_128; - printf("x1 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - tempd = (int *)&x2_128; - printf("x2 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - // tempd = (int *)y_128; - // printf("y : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - templ = (long long *)y_128; - printf("y : %lld,%lld\n",templ[0],templ[1]); - #endif - */ - y_128[1] = _mm_mullo_epi16(x1_128[1],x2_128); - y_128[2] = _mm_mullo_epi16(x1_128[2],x2_128); - y_128[3] = _mm_mullo_epi16(x1_128[3],x2_128); + y_128[0] = mullo_int16(x1_128[0],*x2_128); + y_128[1] = mullo_int16(x1_128[1],*x2_128); + y_128[2] = mullo_int16(x1_128[2],*x2_128); + y_128[3] = mullo_int16(x1_128[3],*x2_128); x1_128+=4; @@ -706,15 +563,3 @@ main () #endif //MAIN -#else //EXPRESSMIMO_TARGET - -int rotate_cpx_vector(int16_t *x, - int16_t *alpha, - int16_t *y, - uint32_t N, - uint16_t output_shift, - uint8_t format) -{ - -} -#endif //EXPRESSMIMO_TARGET diff --git a/openair1/PHY/TOOLS/cmult_vv.c b/openair1/PHY/TOOLS/cmult_vv.c index aa94458943..900d661341 100755 --- a/openair1/PHY/TOOLS/cmult_vv.c +++ b/openair1/PHY/TOOLS/cmult_vv.c @@ -32,485 +32,96 @@ #include <stdio.h> #endif -#ifndef EXPRESSMIMO_TARGET -static __m128i shift __attribute__ ((aligned(16))); -static __m128i m0,m1,m2,m4 __attribute__ ((aligned(16))); - -//#define DEBUG_CMULT - -int mult_cpx_vector(int16_t *x1, - int16_t *x2, - int16_t *y, - uint32_t N, - int output_shift) -{ - // Multiply elementwise two complex vectors of N elements with repeated formatted output - // x1 - input 1 in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)| - // We assume x1 with a dinamic of 15 bit maximum - // - // x2 - input 2 in the format |Re0 -Im0 Im0 Re0|,......,|Re(N-1) -Im(N-1) Im(N-1) Re(N-1)| - // We assume x2 with a dinamic of 14 bit maximum - /// - // y - output in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)| - // - // N - the size f the vectors (this function does N cpx mpy. WARNING: N>=4; - // - // log2_amp - increase the output amplitude by a factor 2^log2_amp (default is 0) - // WARNING: log2_amp>0 can cause overflow!! 
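/* Editor's note -- a hedged, illustrative sketch, not part of the patch: the
 * legacy kernels deleted in this hunk all realize one idea, a fixed-point
 * complex multiply via a paired multiply-add (_mm_madd_epi16 / pmaddwd). With
 * x1 repeated per sample (|Re Im Re Im|) and x2 pre-shuffled to
 * |Re -Im Im Re|, each 32-bit lane of the multiply-add is directly Re or Im
 * of the product. Scalar reference, all names illustrative: */
#include <stdint.h>

static void mult_cpx_scalar_ref(const int16_t *x1, const int16_t *x2,
                                int16_t *y, uint32_t N, int output_shift)
{
  for (uint32_t i = 0; i < N; i++) {
    /* lane 0: Re1*Re2 + Im1*(-Im2) = Re(x1[i]*x2[i]) */
    int32_t re = (x1[4*i]*x2[4*i] + x1[4*i+1]*x2[4*i+1]) >> output_shift;
    /* lane 1: Re1*Im2 + Im1*Re2 = Im(x1[i]*x2[i]) */
    int32_t im = (x1[4*i+2]*x2[4*i+2] + x1[4*i+3]*x2[4*i+3]) >> output_shift;
    y[4*i]   = y[4*i+2] = (int16_t)re;  /* repeated output; saturation omitted */
    y[4*i+1] = y[4*i+3] = (int16_t)im;
  }
}
/* The pre-shuffled x2 layout is what lets one pmaddwd produce both halves of
 * the complex product, which is why the old functions required it. */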
- - uint32_t i; // loop counter - - /* - #ifdef USER_MODE - int16_t *temps; - int *tempd; - #endif - */ - __m128i *x1_128; - __m128i *x2_128; - __m128i *y_128; - - // __m128i temp; - - - shift = _mm_cvtsi32_si128(output_shift); - x1_128 = (__m128i *)&x1[0]; - x2_128 = (__m128i *)&x2[0]; - y_128 = (__m128i *)&y[0]; - - - // we compute 4 cpx multiply for each loop - for(i=0; i<(N>>3); i++) { - - //msg("mult_cpx_vector: iteration %d, x1=%p, x2=%p, y=%p\n",i,x1_128,x2_128,y_128); - /* - #ifdef USER_MODE - printf("i=%d\n",i); - temps = (int16_t *)x1_128; - printf("x1 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - temps = (int16_t *)x2_128; - printf("x2 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - #endif - */ - - m0 = _mm_madd_epi16(x1_128[0],x2_128[0]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - - // temp = m0; - - // tempd = &temp; - // printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - // temp = m0; - - // tempd = (int *)&temp; - // printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - - m1 = m0; - m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im] - y_128[0] = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im] - - /* - #ifdef USER_MODE - temps = (int16_t *)&y_128[0]; - printf("y0 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - #endif USER_MODE - */ - - m0 = _mm_madd_epi16(x1_128[1],x2_128[1]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - - - - m1 = m0; - m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im] - - y_128[1] = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im] - - - - m0 = _mm_madd_epi16(x1_128[2],x2_128[2]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - - - - m1 = m0; - m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im] - - y_128[2] = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im] - - - - m0 = _mm_madd_epi16(x1_128[3],x2_128[3]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - - - - m1 = m0; - m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im] - - y_128[3] = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im] - - - - - - x1_128+=4; - x2_128+=4; - y_128 +=4; - } - - - _mm_empty(); - _m_empty(); - - return(0); -} +#if defined(__x86_64__) || defined(__i386__) +int16_t conjug[8]__attribute__((aligned(16))) = {-1,1,-1,1,-1,1,-1,1} ; +#define simd_q15_t __m128i +#define simdshort_q15_t __m64 +#elif defined(__arm__) +int16_t conjug[4]__attribute__((aligned(16))) = {-1,1,-1,1} ; +#define simd_q15_t int16x8_t +#define simdshort_q15_t int16x4_t +#define _mm_empty() +#define _m_empty() +#endif -int mult_cpx_vector_unprepared(int16_t *x1, - int16_t *x2, - int16_t *y, - uint32_t N, - int output_shift) +int mult_cpx_conj_vector(int16_t *x1, + int16_t *x2, + int16_t *y, + uint32_t N, + int output_shift) { // Multiply elementwise two complex 
vectors of N elements, taking the complex conjugate of x1, with normal formatted output
- // x1 - input 1 in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)|
+ // x1 - input 1 in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)|
 // We assume x1 with a dynamic of 15 bit maximum
 //
- // x2 - input 2 in the format |Re0 Im0 Re0 Re0|,......,|Re(N-1) -Im(N-1) Im(N-1) Re(N-1)|
+ // x2 - input 2 in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)|
 // We assume x2 with a dynamic of 14 bit maximum
 //
- // y - output in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)|
+ // y - output in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)|
 //
 // N - the size of the vectors (this function does N cpx mpy). WARNING: N>=4
 //
- // log2_amp - increase the output amplitude by a factor 2^log2_amp (default is 0)
- // WARNING: log2_amp>0 can cause overflow!!
+ // output_shift - shift to be applied to generate the output

 uint32_t i; // loop counter

-#ifdef DEBUG_CMULT
- int16_t *temps;
- int *tempd;
+ simd_q15_t *x1_128;
+ simd_q15_t *x2_128;
+ simd_q15_t *y_128;
+#if defined(__x86_64__) || defined(__i386__)
+ simd_q15_t tmp_re,tmp_im;
+ simd_q15_t tmpy0,tmpy1;
+#elif defined(__arm__)
+ int32x4_t tmp_re,tmp_im;
+ int32x4_t tmp_re1,tmp_im1;
+ int16x4x2_t tmpy;
+ int32x4_t shift = vdupq_n_s32(-output_shift);
#endif

- __m128i *x1_128;
- __m128i *x2_128;
- __m128i *y_128;
-
- __m128i shuf_x2;
-
-
- shift = _mm_cvtsi32_si128(output_shift);
- x1_128 = (__m128i *)&x1[0];
- x2_128 = (__m128i *)&x2[0];
- y_128 = (__m128i *)&y[0];
-
-
- // we compute 4 cpx multiply for each loop
- for(i=0; i<(N>>3); i++) {
-
- //msg("mult_cpx_vector: iteration %d, x1=%p, x2=%p, y=%p\n",i,x1_128,x2_128,y_128);
- /*
- #ifdef USER_MODE
- printf("i=%d\n",i);
- temps = (int16_t *)x1_128;
- printf("x1 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]);
- temps = (int16_t *)x2_128;
- printf("x2 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]);
- #endif
- */
-
- shuf_x2 = _mm_shufflelo_epi16(x2_128[0],_MM_SHUFFLE(2,3,0,1));
- shuf_x2 = _mm_shufflehi_epi16(shuf_x2,_MM_SHUFFLE(2,3,0,1));
-
-#ifdef DEBUG_CMULT
-
- tempd = &shuf_x2;
- printf("shuf_x2 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]);
-#endif //DEBUG_CMULT
-
- m0 = _mm_madd_epi16(x1_128[0],x2_128[0]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0]
-
- // temp = m0;
-
- // tempd = &temp;
- // printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]);
-
- m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude
-
- // temp = m0;
-
- // tempd = (int *)&temp;
- // printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]);
-
- m1 = m0;
- m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im]
- y_128[0] = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im]
-
- /*
- #ifdef USER_MODE
- temps = (int16_t *)&y_128[0];
- printf("y0 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]);
- #endif USER_MODE
- */
-
- m0 = _mm_madd_epi16(x1_128[1],x2_128[1]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0]
-
- m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude
-
- m1 = m0;
- m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im]
-
- y_128[1] = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register
[re im re im] - - - - m0 = _mm_madd_epi16(x1_128[2],x2_128[2]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - - - - m1 = m0; - m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im] - - y_128[2] = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im] - - - - m0 = _mm_madd_epi16(x1_128[3],x2_128[3]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - - - - m1 = m0; - m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im] - - y_128[3] = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im] - - - - - - x1_128+=4; - x2_128+=4; - y_128 +=4; - } - - - _mm_empty(); - _m_empty(); - - return(0); -} - -//__attribute__ ((force_align_arg_pointer)) -int mult_cpx_vector_norep(int16_t *x1, - int16_t *x2, - int16_t *y, - uint32_t N, - int output_shift) -{ - // Multiply elementwise two complex vectors of N elements with normal formatted output - // x1 - input 1 in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)| - // We assume x1 with a dinamic of 15 bit maximum - // - // x2 - input 2 in the format |Re0 -Im0 Im0 Re0|,......,|Re(N-1) -Im(N-1) Im(N-1) Re(N-1)| - // We assume x2 with a dinamic of 14 bit maximum - /// - // y - output in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)| - // - // N - the size f the vectors (this function does N cpx mpy). WARNING: N>=4; - // - // log2_amp - increase the output amplitude by a factor 2^log2_amp (default is 0) - // WARNING: log2_amp>0 can cause overflow!! - - uint32_t i; // loop counter - - //register __m128i m0,m1,m2,m3; - -#ifdef DEBUG_CMULT - __m128i temp; - int *tempd; - int16_t *temps; -#endif //DEBUG_CMULT + x1_128 = (simd_q15_t *)&x1[0]; + x2_128 = (simd_q15_t *)&x2[0]; + y_128 = (simd_q15_t *)&y[0]; - __m128i *x1_128; - __m128i *x2_128; - __m128i *y_128; - - //__m128i temp; - - - shift = _mm_cvtsi32_si128(output_shift); - x1_128 = (__m128i *)&x1[0]; - x2_128 = (__m128i *)&x2[0]; - y_128 = (__m128i *)&y[0]; - -#ifndef USER_MODE - //debug_msg("mult_cpx_vector_norep: x1 %p, x2 %p, y %p, shift %d\n",x1,x2,y,output_shift); -#endif - // we compute 4 cpx multiply for each loop - for(i=0; i<(N>>3); i++) { - - -#ifdef DEBUG_CMULT - printf("i=%d\n",i); - temps = (int16_t *)x1_128; - printf("x1 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - temps = (int16_t *)x2_128; - printf("x2 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); -#endif - - - m0 = _mm_madd_epi16(x1_128[0],x2_128[0]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - - -#ifdef DEBUG_CMULT - temp = m0; - - tempd = &temp; - printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); -#endif //DEBUG_CMULT - - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - -#ifdef DEBUG_CMULT - temp = m0; - - tempd = (int *)&temp; - printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); -#endif //DEBUG_CMULT - - - m1 = m0; - - - - m0 = _mm_madd_epi16(x1_128[1],x2_128[1]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - -#ifdef DEBUG_CMULT - printf("i=%d\n",i); - temps = (int16_t *)&x1_128[1]; - printf("x1 : 
%d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]);
- temps = (int16_t *)&x2_128[1];
- printf("x2 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]);
+ for(i=0; i<(N>>2); i++) {
+#if defined(__x86_64__) || defined(__i386__)
+ tmp_re = _mm_madd_epi16(*x1_128,*x2_128); // Re(x1)Re(x2)+Im(x1)Im(x2) = Re(conj(x1)*x2)
+ tmp_im = _mm_shufflelo_epi16(*x1_128,_MM_SHUFFLE(2,3,0,1));
+ tmp_im = _mm_shufflehi_epi16(tmp_im,_MM_SHUFFLE(2,3,0,1));
+ tmp_im = _mm_sign_epi16(tmp_im,*(__m128i*)&conjug[0]); // [-Im(x1) Re(x1) ...]
+ tmp_im = _mm_madd_epi16(tmp_im,*x2_128); // Re(x1)Im(x2)-Im(x1)Re(x2) = Im(conj(x1)*x2)
+ tmp_re = _mm_srai_epi32(tmp_re,output_shift);
+ tmp_im = _mm_srai_epi32(tmp_im,output_shift);
+ tmpy0 = _mm_unpacklo_epi32(tmp_re,tmp_im);
+ tmpy1 = _mm_unpackhi_epi32(tmp_re,tmp_im);
+ *y_128 = _mm_packs_epi32(tmpy0,tmpy1);
+#elif defined(__arm__)
+
+ tmp_re = vmull_s16(((simdshort_q15_t *)x1_128)[0], ((simdshort_q15_t*)x2_128)[0]);
+ //tmp_re = [Re(x1[0])Re(x2[0]) Im(x1[0])Im(x2[0]) Re(x1[1])Re(x2[1]) Im(x1[1])Im(x2[1])]
+ tmp_re1 = vmull_s16(((simdshort_q15_t *)x1_128)[1], ((simdshort_q15_t*)x2_128)[1]);
+ //tmp_re1 = [Re(x1[2])Re(x2[2]) Im(x1[2])Im(x2[2]) Re(x1[3])Re(x2[3]) Im(x1[3])Im(x2[3])]
+ tmp_re = vcombine_s32(vpadd_s32(vget_low_s32(tmp_re),vget_high_s32(tmp_re)),
+ vpadd_s32(vget_low_s32(tmp_re1),vget_high_s32(tmp_re1)));
+ //tmp_re = [Re(x1[0])Re(x2[0])+Im(x1[0])Im(x2[0]) ... Re(x1[3])Re(x2[3])+Im(x1[3])Im(x2[3])] = Re(conj(x1)*x2)
+
+ tmp_im = vmull_s16(vrev32_s16(vmul_s16(((simdshort_q15_t*)x2_128)[0],*(simdshort_q15_t*)conjug)), ((simdshort_q15_t*)x1_128)[0]);
+ //tmp_im = [Re(x1[0])Im(x2[0]) -Im(x1[0])Re(x2[0]) Re(x1[1])Im(x2[1]) -Im(x1[1])Re(x2[1])]
+ tmp_im1 = vmull_s16(vrev32_s16(vmul_s16(((simdshort_q15_t*)x2_128)[1],*(simdshort_q15_t*)conjug)), ((simdshort_q15_t*)x1_128)[1]);
+ //tmp_im1 = [Re(x1[2])Im(x2[2]) -Im(x1[2])Re(x2[2]) Re(x1[3])Im(x2[3]) -Im(x1[3])Re(x2[3])]
+ tmp_im = vcombine_s32(vpadd_s32(vget_low_s32(tmp_im),vget_high_s32(tmp_im)),
+ vpadd_s32(vget_low_s32(tmp_im1),vget_high_s32(tmp_im1)));
+ //tmp_im = [Re(x1[0])Im(x2[0])-Im(x1[0])Re(x2[0]) ... Re(x1[3])Im(x2[3])-Im(x1[3])Re(x2[3])] = Im(conj(x1)*x2)
+
+ tmp_re = vqshlq_s32(tmp_re,shift);
+ tmp_im = vqshlq_s32(tmp_im,shift);
+ tmpy = vzip_s16(vmovn_s32(tmp_re),vmovn_s32(tmp_im));
+ *y_128 = vcombine_s16(tmpy.val[0],tmpy.val[1]);
#endif
//DEBUG_CMULT - -#ifdef DEBUG_CMULT - temp = m0; - tempd = (int *)&temp; - printf("m0[2] : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); -#endif //DEBUG_CMULT - - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - -#ifdef DEBUG_CMULT - temp = m0; - tempd = (int *)&temp; - printf("m0[2] : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); -#endif //DEBUG_CMULT - - - m1 = m0; - // m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im] - - m0 = _mm_madd_epi16(x1_128[3],x2_128[3]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - -#ifdef DEBUG_CMULT - printf("i=%d\n",i); - temps = (int16_t *)&x1_128[3]; - printf("x1 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - temps = (int16_t *)&x2_128[3]; - printf("x2 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - - temp = m0; - tempd = (int *)&temp; - printf("m0[3] : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); -#endif //DEBUG_CMULT - - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - -#ifdef DEBUG_CMULT - temp = m0; - tempd = (int *)&temp; - printf("m0[3] : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); -#endif //DEBUG_CMULT - - - m2 = m0; - // m2 = _mm_packs_epi32(m2,m0); // 1- pack in a 128 bit register [re im re im] - - y_128[1] = _mm_packs_epi32(m1,m2); // 1- pack in a 128 bit register [re im re im] - - - - - - x1_128+=4; - x2_128+=4; - y_128 +=2; + x1_128++; + x2_128++; + y_128++; } @@ -519,1259 +130,3 @@ int mult_cpx_vector_norep(int16_t *x1, return(0); } - - -int mult_cpx_vector_norep_unprepared_conjx2(int16_t *x1, - int16_t *x2, - int16_t *y, - uint32_t N, - int output_shift) -{ - // Multiply elementwise two complex vectors of N elements with normal formatted output, conjugate x1 - // x1 - input 1 in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)| - // We assume x1 with a dinamic of 15 bit maximum - // - // x2 - input 2 in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) -Im(N-1) Im(N-1) Re(N-1)| - // We assume x2 with a dinamic of 14 bit maximum - /// - // y - output in the format |Re0 Im0 Re0 Im0|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)| - // - // N - the size f the vectors (this function does N cpx mpy). WARNING: N>=4; - // - // log2_amp - increase the output amplitude by a factor 2^log2_amp (default is 0) - // WARNING: log2_amp>0 can cause overflow!! 
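/* Editor's note -- illustrative sketch, not part of the patch: the conjugate
 * variants dropped in this hunk appear to be subsumed by the single
 * mult_cpx_conj_vector() introduced above, which computes y = conj(x1)*x2 per
 * sample in the plain |Re Im| layout. A scalar model, including the int16
 * saturation performed by _mm_packs_epi32 on the SSE side (names
 * illustrative): */
#include <stdint.h>

static int16_t sat_q15(int32_t v)
{
  /* clamp to the int16 range, as the saturating pack does */
  return (int16_t)(v > 32767 ? 32767 : (v < -32768 ? -32768 : v));
}

static void mult_cpx_conj_scalar_ref(const int16_t *x1, const int16_t *x2,
                                     int16_t *y, uint32_t N, int output_shift)
{
  for (uint32_t i = 0; i < N; i++) {
    int32_t re = x1[2*i]*x2[2*i]   + x1[2*i+1]*x2[2*i+1]; /* Re1Re2+Im1Im2 */
    int32_t im = x1[2*i]*x2[2*i+1] - x1[2*i+1]*x2[2*i];   /* Re1Im2-Im1Re2 */
    y[2*i]   = sat_q15(re >> output_shift);
    y[2*i+1] = sat_q15(im >> output_shift);
  }
}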
- - uint32_t i; // loop counter - - //register __m128i m0,m1,m2,m3; - - short conj_x2s[8] __attribute__((aligned(16))) = {1,1,-1,1,1,1,-1,1}; - __m128i *conj_x2 = (__m128i *)&conj_x2s[0]; - -#ifdef DEBUG_CMULT - __m128i temp; - int *tempd; - int16_t *temps; -#endif //DEBUG_CMULT - - - __m128i *x1_128; - __m128i *x2_128; - __m128i *y_128; - - __m128i shuf_x2; - - - shift = _mm_cvtsi32_si128(output_shift); - x1_128 = (__m128i *)&x1[0]; - x2_128 = (__m128i *)&x2[0]; - y_128 = (__m128i *)&y[0]; - - // we compute 4 cpx multiply for each loop - for(i=0; i<(N>>3); i++) { - - -#ifdef DEBUG_CMULT - printf("**i=%d\n",i); - temps = (int16_t *)x1_128; - printf("x1 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - temps = (int16_t *)x2_128; - printf("x2 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); -#endif - - - shuf_x2 = _mm_shufflelo_epi16(x2_128[0],_MM_SHUFFLE(2,3,1,0)); - shuf_x2 = _mm_shufflehi_epi16(shuf_x2,_MM_SHUFFLE(2,3,1,0)); - shuf_x2 = _mm_sign_epi16(shuf_x2,*conj_x2); -#ifdef DEBUG_CMULT - - temps = &shuf_x2; - printf("shuf_x2 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); -#endif //DEBUG_CMULT - - m0 = _mm_madd_epi16(x1_128[0],shuf_x2); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - - -#ifdef DEBUG_CMULT - temp = m0; - - tempd = &temp; - printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); -#endif //DEBUG_CMULT - - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - -#ifdef DEBUG_CMULT - temp = m0; - - tempd = (int *)&temp; - printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); -#endif //DEBUG_CMULT - - - m1 = m0; - - - - shuf_x2 = _mm_shufflelo_epi16(x2_128[1],_MM_SHUFFLE(2,3,1,0)); - shuf_x2 = _mm_shufflehi_epi16(shuf_x2,_MM_SHUFFLE(2,3,1,0)); - shuf_x2 = _mm_sign_epi16(shuf_x2,*conj_x2); - m0 = _mm_madd_epi16(x1_128[1],shuf_x2); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - -#ifdef DEBUG_CMULT - printf("i=%d\n",i); - temps = (int16_t *)&x1_128[1]; - printf("x1 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - temps = (int16_t *)&x2_128[1]; - printf("x2 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); -#endif - -#ifdef DEBUG_CMULT - temp = m0; - tempd = (int *)&temp; - printf("m0[1] : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); -#endif //DEBUG_CMULT - - - - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - -#ifdef DEBUG_CMULT - temp = m0; - tempd = (int *)&temp; - printf("m0[1] : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); -#endif //DEBUG_CMULT - - - - m2 = m0; - // m2 = _mm_packs_epi32(m2,m0); // 1- pack in a 128 bit register [re im re im] - - // print_shorts(m2,"m2"); - - y_128[0] = _mm_packs_epi32(m1,m2); // 1- pack in a 128 bit register [re im re im] - - - shuf_x2 = _mm_shufflelo_epi16(x2_128[2],_MM_SHUFFLE(2,3,1,0)); - shuf_x2 = _mm_shufflehi_epi16(shuf_x2,_MM_SHUFFLE(2,3,1,0)); - shuf_x2 = _mm_sign_epi16(shuf_x2,*conj_x2); - m0 = _mm_madd_epi16(x1_128[2],shuf_x2); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - -#ifdef DEBUG_CMULT - printf("i=%d\n",i); - temps = (int16_t *)&x1_128[2]; - printf("x1 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - temps = (int16_t *)&x2_128[2]; - 
printf("x2 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); -#endif //DEBUG_CMULT - -#ifdef DEBUG_CMULT - temp = m0; - tempd = (int *)&temp; - printf("m0[2] : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); -#endif //DEBUG_CMULT - - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - -#ifdef DEBUG_CMULT - temp = m0; - tempd = (int *)&temp; - printf("m0[2] : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); -#endif //DEBUG_CMULT - - - m1 = m0; - // m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im] - shuf_x2 = _mm_shufflelo_epi16(x2_128[3],_MM_SHUFFLE(2,3,1,0)); - shuf_x2 = _mm_shufflehi_epi16(shuf_x2,_MM_SHUFFLE(2,3,1,0)); - shuf_x2 = _mm_sign_epi16(shuf_x2,*conj_x2); - - m0 = _mm_madd_epi16(x1_128[3],shuf_x2); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - -#ifdef DEBUG_CMULT - printf("i=%d\n",i); - temps = (int16_t *)&x1_128[3]; - printf("x1 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - temps = (int16_t *)&x2_128[3]; - printf("x2 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - - temp = m0; - tempd = (int *)&temp; - printf("m0[3] : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); -#endif //DEBUG_CMULT - - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - -#ifdef DEBUG_CMULT - temp = m0; - tempd = (int *)&temp; - printf("m0[3] : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); -#endif //DEBUG_CMULT - - - m2 = m0; - // m2 = _mm_packs_epi32(m2,m0); // 1- pack in a 128 bit register [re im re im] - - y_128[1] = _mm_packs_epi32(m1,m2); // 1- pack in a 128 bit register [re im re im] - - - - - - x1_128+=4; - x2_128+=4; - y_128 +=2; - } - - - _mm_empty(); - _m_empty(); - - return(0); -} - -static __m128i norep_tmp32 __attribute__ ((aligned(16))); - -//__attribute__ ((force_align_arg_pointer)) -int mult_cpx_vector_norep2(int16_t *x1, - int16_t *x2, - int16_t *y, - uint32_t N, - int output_shift) -{ - // Multiply elementwise two complex vectors of N elements with normal formatted output and no loop unrollin - // x1 - input 1 in the format |Re0 Im0 Re0 Im0 Re1 Im1 Re1 Im1|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)| - // We assume x1 with a dinamic of 15 bit maximum - // - // x2 - input 2 in the format |Re0 -Im0 Im0 Re0|,......,|Re(N-1) -Im(N-1) Im(N-1) Re(N-1)| - // We assume x2 with a dinamic of 14 bit maximum - /// - // y - output in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)| - // - // N - the size f the vectors (this function does N cpx mpy). WARNING: N>=2; - // - // log2_amp - increase the output amplitude by a factor 2^log2_amp (default is 0) - // WARNING: log2_amp>0 can cause overflow!! 
- - uint32_t i; // loop counter - - //register __m128i m0,m1,m2,m3; - - /* - #ifdef USER_MODE - __m128i temp; - int *tempd; - int16_t *temps; - #endif - */ - - __m128i *x1_128; - __m128i *x2_128; - int *y_32 = (int*)y; - - // __m128i temp; - - - shift = _mm_cvtsi32_si128(output_shift); - x1_128 = (__m128i *)&x1[0]; - x2_128 = (__m128i *)&x2[0]; - -#ifndef USER_MODE - //debug_msg("mult_cpx_vector_norep2: x1 %p, x2 %p, y %p, shift %d, N %d\n",x1,x2,y,output_shift,N); -#endif - - // we compute 2 cpx multiply for each loop - for(i=0; i<(N>>1); i++) { - /* - #ifdef USER_MODE - printf("i=%d\n",i); - temps = (int16_t *)x1_128; - printf("x1 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - temps = (int16_t *)x2_128; - printf("x2 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - #endif - */ - - m0 = _mm_madd_epi16(x1_128[0],x2_128[0]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - - /* - temp = m0; - - tempd = &temp; - printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - */ - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - /* - temp = m0; - - tempd = (int *)&temp; - printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - */ - - norep_tmp32 = _mm_packs_epi32(m0,m0); // Re0 Im0 Re1 Im1 Re0 Im0 Re1 Im1 - - /* - #ifdef USER_MODE - printf("tmp : %d,%d,%d,%d\n",((int16_t *)&tmp32)[0],((int16_t *)&tmp32)[1],((int16_t *)&tmp32)[2],((int16_t *)&tmp32)[3]); - #endif - */ - - y_32[0] = ((int *)&norep_tmp32)[0]; // 1- pack in a 128 bit register [re im re im] - y_32[1] = ((int *)&norep_tmp32)[1]; // 1- pack in a 128 bit register [re im re im] - - x1_128+=1; - x2_128+=1; - y_32 +=2; - } - - - _mm_empty(); - _m_empty(); - - return(0); -} - - -int mult_cpx_vector_norep_conj(int16_t *x1, - int16_t *x2, - int16_t *y, - uint32_t N, - int output_shift) -{ - // Multiply elementwise two complex vectors of N elements after conjugating and shuffling x1 - // x1 - input 1 in the format |Re0 -Im0 Im0 Re0|,......,|Re(N-1) -Im(N-1) Im(N-1) Re(N-1)| - // We assume x1 with a dynamic of 15 bit maximum - // - // x2 - input 2 in the format |Re0 -Im0 Im0 Re0|,......,|Re(N-1) -Im(N-1) Im(N-1) Re(N-1)| - // We assume x2 with a dynamic of 14 bit maximum - /// - // y - output in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)| - // - // N - the size f the vectors (this function does N cpx mpy). WARNING: N>=4; - // - // log2_amp - increase the output amplitude by a factor 2^log2_amp (default is 0) - // WARNING: log2_amp>0 can cause overflow!! 
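/* Editor's note -- a worked example, not part of the patch: in every kernel
 * here output_shift undoes fixed-point growth. A Q1.15 x Q1.15 product is
 * Q2.30, so an arithmetic right shift by 15 returns to Q1.15 (values
 * illustrative): */
#include <stdint.h>
#include <assert.h>

static void q15_shift_check(void)
{
  int16_t a = 16384, b = 16384;   /* both 0.5 in Q1.15 */
  int32_t p = (int32_t)a * b;     /* 0.25 in Q2.30 (= 268435456) */
  assert((p >> 15) == 8192);      /* 0.25 back in Q1.15 */
}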
- - uint32_t i; // loop counter - - //register __m128i m0,m1,m2,m4; - - /* - #ifdef USER_MODE - int16_t *temps; - int *tempw; - #endif - */ - - __m128i *x1_128; - __m128i *x2_128; - __m128i *y_128; - - // __m128i temp; - - - shift = _mm_cvtsi32_si128(output_shift); - x1_128 = (__m128i *)&x1[0]; - x2_128 = (__m128i *)&x2[0]; - y_128 = (__m128i *)&y[0]; - - // printf("mult_cpx_vector_norep: shift %d\n",output_shift); - - // we compute 4 cpx multiply for each loop - for(i=0; i<(N>>3); i++) { - - /* - #ifdef USER_MODE - printf("i=%d\n",i); - temps = (int16_t *)x1_128; - printf("x1 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - temps = (int16_t *)x2_128; - printf("x2 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - #endif - */ - - m4 = _mm_shufflelo_epi16(x1_128[0],_MM_SHUFFLE(1,0,1,0)); - m4 = _mm_shufflehi_epi16(m4,_MM_SHUFFLE(1,0,1,0)); - /* - temps = (int16_t *)&m4; - printf("x2 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - */ - m0 = _mm_madd_epi16(m4,x2_128[0]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - - // temp = m0; - - // tempd = &temp; - // printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - /* - tempw = (int *)&m0; - printf("m0[0] : %d,%d,%d,%d\n",tempw[0],tempw[1],tempw[2],tempw[3]); - */ - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - /* - tempw = (int *)&m0; - printf("m0[0] : %d,%d,%d,%d\n",tempw[0],tempw[1],tempw[2],tempw[3]); - */ - // temp = m0; - - // tempd = (int *)&temp; - // printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - - m1 = m0; - - - - m4 = _mm_shufflelo_epi16(x1_128[1],_MM_SHUFFLE(1,0,1,0)); - m4 = _mm_shufflehi_epi16(m4,_MM_SHUFFLE(1,0,1,0)); - m0 = _mm_madd_epi16(m4,x2_128[1]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - - /* - tempw = (int *)&m0; - printf("m0[1] : %d,%d,%d,%d\n",tempw[0],tempw[1],tempw[2],tempw[3]); - */ - - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - /* - tempw = (int *)&m0; - printf("m0[1] : %d,%d,%d,%d\n",tempw[0],tempw[1],tempw[2],tempw[3]); - */ - - - m2 = m0; - // m2 = _mm_packs_epi32(m2,m0); // 1- pack in a 128 bit register [re im re im] - - // print_shorts(m2,"m2"); - - y_128[0] = _mm_packs_epi32(m1,m2); // 1- pack in a 128 bit register [re im re im] - - - - m4 = _mm_shufflelo_epi16(x1_128[2],_MM_SHUFFLE(1,0,1,0)); - m4 = _mm_shufflehi_epi16(m4,_MM_SHUFFLE(1,0,1,0)); - m0 = _mm_madd_epi16(m4,x2_128[2]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - /* - tempw = (int *)&m0; - printf("m0[2] : %d,%d,%d,%d\n",tempw[0],tempw[1],tempw[2],tempw[3]); - */ - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - /* - tempw = (int *)&m0; - printf("m0[2] : %d,%d,%d,%d\n",tempw[0],tempw[1],tempw[2],tempw[3]); - */ - - m1 = m0; - // m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im] - - - m4 = _mm_shufflelo_epi16(x1_128[3],_MM_SHUFFLE(1,0,1,0)); - m4 = _mm_shufflehi_epi16(m4,_MM_SHUFFLE(1,0,1,0)); - m0 = _mm_madd_epi16(m4,x2_128[3]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - /* - tempw = (int *)&m0; - printf("m0[3] : %d,%d,%d,%d\n",tempw[0],tempw[1],tempw[2],tempw[3]); - */ - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - /* - tempw = (int *)&m0; 
- printf("m0[3] : %d,%d,%d,%d\n",tempw[0],tempw[1],tempw[2],tempw[3]); - */ - - m2 = m0; - // m2 = _mm_packs_epi32(m2,m0); // 1- pack in a 128 bit register [re im re im] - - y_128[1] = _mm_packs_epi32(m1,m2); // 1- pack in a 128 bit register [re im re im] - - - - - - x1_128+=4; - x2_128+=4; - y_128 +=2; - } - - - _mm_empty(); - _m_empty(); - - return(0); -} - - -int mult_cpx_vector_norep_conj2(int16_t *x1, - int16_t *x2, - int16_t *y, - uint32_t N, - int output_shift) -{ - // Multiply elementwise two complex vectors of N elements after conjugating and shuffling x1 - // x1 - input 1 in the format |Re0 -Im0 Im0 Re0|,......,|Re(N-1) -Im(N-1) Im(N-1) Re(N-1)| - // We assume x1 with a dynamic of 15 bit maximum - // - // x2 - input 2 in the format |Re0 -Im0 Im0 Re0|,......,|Re(N-1) -Im(N-1) Im(N-1) Re(N-1)| - // We assume x2 with a dynamic of 14 bit maximum - /// - // y - output in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)| - // - // N - the size f the vectors (this function does N cpx mpy). WARNING: N>=2; - // - // log2_amp - increase the output amplitude by a factor 2^log2_amp (default is 0) - // WARNING: log2_amp>0 can cause overflow!! - - uint32_t i; // loop counter - - //register __m128i m0,m1,m2,m4; - __m128i tmp32; - - - - - - - - __m128i *x1_128; - __m128i *x2_128; - int *y_32 = (int *)&y[0]; - - // __m128i temp,*tempd; - - - shift = _mm_cvtsi32_si128(output_shift); - x1_128 = (__m128i *)&x1[0]; - x2_128 = (__m128i *)&x2[0]; - - - // printf("mult_cpx_vector_norep: shift %d\n",output_shift); - - // we compute 4 cpx multiply for each loop - for(i=0; i<(N>>1); i++) { - - /* - //#ifdef USER_MODE - printf("i=%d\n",i); - temps = (int16_t *)x1_128; - printf("x1 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - temps = (int16_t *)x2_128; - printf("x2 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - //#endif - */ - - // conjuaget and shuffle x1 - m4 = _mm_shufflelo_epi16(x1_128[0],_MM_SHUFFLE(1,0,1,0)); - m4 = _mm_shufflehi_epi16(m4,_MM_SHUFFLE(1,0,1,0)); - /* - temps = (int16_t *)&m4; - printf("x2 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - */ - m0 = _mm_madd_epi16(m4,x2_128[0]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - - // tempw = (int *)&m0; - // printf("m0[1] : %d,%d,%d,%d\n",tempw[0],tempw[1],tempw[2],tempw[3]); - - - - - tmp32 = _mm_packs_epi32(m0,m0); // Re0 Im0 Re1 Im1 Re0 Im0 Re1 Im1 - - - - // printf("tmp : %d,%d,%d,%d\n",((int16_t *)&tmp32)[0],((int16_t *)&tmp32)[1],((int16_t *)&tmp32)[2],((int16_t *)&tmp32)[3]); - - y_32[0] = ((int *)&tmp32)[0]; // 1- pack in a 128 bit register [re im re im] - y_32[1] = ((int *)&tmp32)[1]; // 1- pack in a 128 bit register [re im re im] - - - - x1_128+=1; - x2_128+=1; - y_32 +=2; - - - } - - - _mm_empty(); - _m_empty(); - - return(0); -} - - -int mult_cpx_vector2(int16_t *x1, - int16_t *x2, - int16_t *y, - uint32_t N, - int output_shift) -{ - // Multiply elementwise two complex vectors of N elements - // x1 - input 1 in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)| - // We assume x1 with a dinamic of 15 bit maximum - // - // x2 - input 2 in the format |Re0 -Im0 Im0 Re0|,......,|Re(N-1) -Im(N-1) Im(N-1) Re(N-1)| - // We assume x2 with a dinamic of 14 bit maximum - // - // y - output in the format |Re0 Im0 Re0 
Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)| - // - // N - the size f the vectors (this function does N cpx mpy. WARNING: N must be a multiple of 2; - // - // log2_amp - increase the output amplitude by a factor 2^log2_amp (default is 0) - // WARNING: log2_amp>0 can cause overflow!! - - uint32_t i; // loop counter - - //register __m128i m0,m1; - - /* - #ifdef USER_MODE - int16_t *temps; - int *tempd; - #endif - */ - - __m128i *x1_128; - __m128i *x2_128; - __m128i *y_128; - - // __m128i temp; - - - shift = _mm_cvtsi32_si128(output_shift); - x1_128 = (__m128i *)&x1[0]; - x2_128 = (__m128i *)&x2[0]; - y_128 = (__m128i *)&y[0]; - - - - for(i=0; i<(N>>1); i++) { - - /* #ifdef USER_MODE */ - /* printf("i=%d\n",i); */ - /* temps = (int16_t *)x1_128; */ - /* printf("x1 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); */ - /* temps = (int16_t *)x2_128; */ - /* printf("x2 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); */ - /* #endif */ - - m0 = _mm_madd_epi16(x1_128[i],x2_128[i]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - - // temp = m0; - - // tempd = &temp; - // printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - // temp = m0; - - // tempd = (int *)&temp; - // printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - - m1 = m0; - m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im] - y_128[i] = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im] - - } - - - _mm_empty(); - _m_empty(); - - return(0); -} - -int mult_cpx_vector_add(int16_t *x1, - int16_t *x2, - int16_t *y, - uint32_t N, - int output_shift) -{ - // Multiply elementwise two complex vectors of N elements and add it to y - // x1 - input 1 in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)| - // We assume x1 with a dinamic of 15 bit maximum - // - // x2 - input 2 in the format |Re0 -Im0 Im0 Re0|,......,|Re(N-1) -Im(N-1) Im(N-1) Re(N-1)| - // We assume x2 with a dinamic of 14 bit maximum - // - // y - output in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)| - // - // N - the size f the vectors (this function does N cpx mpy. WARNING: N>=4; - // - // log2_amp - increase the output amplitude by a factor 2^log2_amp (default is 0) - // WARNING: log2_amp>0 can cause overflow!! 
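/* Editor's note -- illustrative sketch, not part of the patch: the
 * multiply-accumulate pattern being removed here survives in the cmac()
 * helpers that this patch adds to lte_dfts.c for both SSE and NEON. A scalar
 * model of an elementwise complex MAC in the plain |Re Im| layout, with
 * 32-bit accumulators as in mult_cpx_vector_add32 (names illustrative): */
#include <stdint.h>

static void cmac_scalar_ref(const int16_t *a, const int16_t *b,
                            int32_t *y_re, int32_t *y_im, uint32_t N)
{
  for (uint32_t i = 0; i < N; i++) {
    y_re[i] += a[2*i]*b[2*i]   - a[2*i+1]*b[2*i+1]; /* y[i] += Re(a[i]*b[i]) */
    y_im[i] += a[2*i]*b[2*i+1] + a[2*i+1]*b[2*i];   /* y[i] += Im(a[i]*b[i]) */
  }
}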
- - uint32_t i; // loop counter - - //register __m128i m0,m1; - - /* - #ifdef USER_MODE - int16_t *temps; - int *tempd; - __m128i temp; - #endif - */ - - __m128i *x1_128; - __m128i *x2_128; - __m128i *y_128; - - - shift = _mm_cvtsi32_si128(output_shift); - x1_128 = (__m128i *)&x1[0]; - x2_128 = (__m128i *)&x2[0]; - y_128 = (__m128i *)&y[0]; - - - // we compute 4 cpx multiply for each loop - for(i=0; i<(N>>3); i++) { - //unroll 0 - - /* - #ifdef USER_MODE - printf("i=%d\n",i); - temps = (int16_t *)x1_128; - printf("x1 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - temps = (int16_t *)x2_128; - printf("x2 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - #endif - */ - - m0 = _mm_madd_epi16(x1_128[0],x2_128[0]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - - // temp = m0; - - // tempd = &temp; - // printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - // temp = m0; - - // tempd = (int *)&temp; - // printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - - - m0 = _mm_packs_epi32(m0,m0); // 1- pack in a 128 bit register [re im re im] - m0 = _mm_unpacklo_epi32(m0,m0); // 1- pack in a 128 bit register [re im re im] - - y_128[0] = _mm_add_epi16(m0,y_128[0]); - - - // temps = (int16_t *)&y_128[0]; - // printf("y0 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - - - //unroll 1 - m0 = _mm_madd_epi16(x1_128[1],x2_128[1]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - - - - m1 = m0; - m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im] - m1 = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im] - - y_128[1] = _mm_add_epi16(m1,y_128[1]); - - - //unroll 2 - m0 = _mm_madd_epi16(x1_128[2],x2_128[2]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - - - - m1 = m0; - m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im] - m1 = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im] - - y_128[2] = _mm_add_epi16(m1,y_128[2]); - - - - //unroll 3 - m0 = _mm_madd_epi16(x1_128[3],x2_128[3]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - - - - m1 = m0; - m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im] - m1 = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im] - - y_128[3] = _mm_add_epi16(m1,y_128[3]); - - - x1_128+=4; - x2_128+=4; - y_128 +=4; - } - - - _mm_empty(); - _m_empty(); - - return(0); -} - -int mult_cpx_vector_add32(int16_t *x1, - int16_t *x2, - int16_t *y, - uint32_t N) - -{ - // Multiply elementwise two complex vectors of N elements and add it to y - // x1 - input 1 in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)| - // We assume x1 with a dinamic of 15 bit maximum - // - // x2 - input 2 in the format |Re0 -Im0 Im0 Re0|,......,|Re(N-1) -Im(N-1) Im(N-1) Re(N-1)| - // We assume x2 with a dinamic of 14 bit maximum - // - // y - output in the format |Re0 (32bit) Im0 (32bit) |,......,|Re(N-1) (32bit) Im(N-1) 
(32bit)| - // - // N - the size f the vectors (this function does N cpx mpy. WARNING: N>=4; - // - - uint32_t i; // loop counter - //register __m128i m0; - - /* - #ifdef USER_MODE - int16_t *temps; - int *tempd; - __m128i temp; - #endif - */ - - __m128i *x1_128; - __m128i *x2_128; - __m128i *y_128; - - - x1_128 = (__m128i *)&x1[0]; - x2_128 = (__m128i *)&x2[0]; - y_128 = (__m128i *)&y[0]; - - - // we compute 4 cpx multiply for each loop - for(i=0; i<(N>>3); i++) { - //unroll 0 - - - m0 = _mm_madd_epi16(x1_128[0],x2_128[0]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0 - y_128[0] = _mm_add_epi32(y_128[0],m0); - - /* - #ifdef USER_MODE - printf("i=%d\n",i); - temps = (int16_t *)x1_128; - printf("x1 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - temps = (int16_t *)x2_128; - printf("x2 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - tempd = (int *)y_128; - printf("y : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - #endif - */ - - - m0 = _mm_madd_epi16(x1_128[1],x2_128[1]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - y_128[1] = _mm_add_epi32(y_128[1],m0); - - m0 = _mm_madd_epi16(x1_128[2],x2_128[2]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - y_128[2] = _mm_add_epi32(y_128[2],m0); - - m0 = _mm_madd_epi16(x1_128[3],x2_128[3]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - y_128[3] = _mm_add_epi32(y_128[3],m0); - - - x1_128+=4; - x2_128+=4; - y_128 +=4; - } - - - _mm_empty(); - _m_empty(); - - return(0); -} - -int mult_vector32(int16_t *x1, - int16_t *x2, - int16_t *y, - uint32_t N) - -{ - // Multiply elementwise two real vectors of N elements y = real(x1).*real(x2) - // x1 - input 1 in the format |Re(0) xxx Re(1) xxx|,......,|Re(N-2) xxx Re(N-1) xxx| - // We assume x1 with a dinamic of 31 bit maximum - // - // x1 - input 2 in the format |Re(0) xxx Re(2) xxx|,......,|Re(N-2) xxx Re(N-1) xxx| - // We assume x2 with a dinamic of 31 bit maximum - // - // y - output in the format |Re0 (64bit) |,......,|Re(N-1) (64bit)| - // - // N - the size f the vectors (this function does N cpx mpy. WARNING: N>=4; - // - - uint32_t i; // loop counter - - __m128i *x1_128; - __m128i *x2_128; - __m128i *y_128; - - - x1_128 = (__m128i *)&x1[0]; - x2_128 = (__m128i *)&x2[0]; - y_128 = (__m128i *)&y[0]; - - - // we compute 4 cpx multiply for each loop - for(i=0; i<(N>>3); i++) { - y_128[0] = _mm_mul_epu32(x1_128[0],x2_128[0]); - - /* - #ifdef USER_MODE - printf("i=%d\n",i); - tempd = (int *)x1_128; - printf("x1 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - tempd = (int *)x2_128; - printf("x2 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - // tempd = (int *)y_128; - // printf("y : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - templ = (long long *)y_128; - printf("y : %lld,%lld\n",templ[0],templ[1]); - #endif - */ - - y_128[1] = _mm_mul_epu32(x1_128[1],x2_128[1]); - y_128[2] = _mm_mul_epu32(x1_128[2],x2_128[2]); - y_128[3] = _mm_mul_epu32(x1_128[3],x2_128[3]); - - - x1_128+=4; - x2_128+=4; - y_128 +=4; - } - - - _mm_empty(); - _m_empty(); - - return(0); -} - - -/* -The following code does not work, because there is no signed 32bit multiplication intrinsic. 
It only works for unsigned values - -int mult_cpx_vector32_conj(int16_t *x, - int16_t *y, - uint32_t N) - -{ - // elementwise multiplication of two complex vectors of N elements such that y = x * conj(x) = real(x)*real(x)+imag(x)*imag(x) - // x - input in the format |Re(0) Im(0) Re(1) Im(1) |,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)| - // We assume x with a dinamic of 31 bit maximum - // - // y - output in the format |Re0 (64bit) |,......,|Re(N-1) (64bit)| - // - // N - the size f the vectors (this function does N cpx mpy. WARNING: N>=4; - // - - uint32_t i; // loop counter - -#ifdef USER_MODE - char *tempc; - int16_t *temps; - int *tempd; - long long *templ; - __m128i temp; -#endif - - __m128i *x_128; - __m128i *y_128; - - __m128i m0,m1,m2,m3; - - x_128 = (__m128i *)&x[0]; - y_128 = (__m128i *)&y[0]; - - // we compute 4 cpx multiply for each loop - for(i=0;i<(N>>3);i++) - { - // Re(a)*Re(b) - m0 = _mm_mul_epu32(x_128[0],x_128[0]); - // Im(a)*Im(b) - m1 = _mm_shuffle_epi32(x_128[0],_MM_SHUFFLE(2,3,0,1)); - m3 = _mm_mul_epu32(m1,m1); - // Re(a)*Re(b)+Im(a)*Im(b) - y_128[0] = _mm_add_epi64(m0,m3); - -#ifdef USER_MODE - printf("i=%d\n",i); - tempd = (int *)x_128; - printf("x : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - templ = (long long *)&m0; - printf("m0 : %lld,%lld\n",templ[0],templ[1]); - tempd = (int *)&m1; - printf("m1 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - templ = (long long *)&m3; - printf("m3 : %lld,%lld\n",templ[0],templ[1]); - // tempd = (int *)y_128; - // printf("y : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - templ = (long long *)y_128; - printf("y : %lld,%lld\n",templ[0],templ[1]); -#endif - - // Re(a)*Re(b) - m0 = _mm_mul_epu32(x_128[1],x_128[1]); - // Im(a)*Im(b) - m1 = _mm_shuffle_epi32(x_128[1],_MM_SHUFFLE(1,0,3,2)); - m3 = _mm_mul_epu32(m1,m1); - // Re(a)*Re(b)+Im(a)*Im(b) - y_128[1] = _mm_add_epi64(m0,m3); - - - // Re(a)*Re(b) - m0 = _mm_mul_epu32(x_128[2],x_128[2]); - // Im(a)*Im(b) - m1 = _mm_shuffle_epi32(x_128[2],_MM_SHUFFLE(1,0,3,2)); - m3 = _mm_mul_epu32(m1,m1); - // Re(a)*Re(b)+Im(a)*Im(b) - y_128[2] = _mm_add_epi64(m0,m3); - - - // Re(a)*Re(b) - m0 = _mm_mul_epu32(x_128[3],x_128[3]); - // Im(a)*Im(b) - m1 = _mm_shuffle_epi32(x_128[3],_MM_SHUFFLE(1,0,3,2)); - m3 = _mm_mul_epu32(m1,m1); - // Re(a)*Re(b)+Im(a)*Im(b) - y_128[3] = _mm_add_epi64(m0,m3); - - - x_128+=4; - y_128 +=4; - } - - - _mm_empty(); - _m_empty(); - - return(0); -} -*/ - -int mult_cpx_vector32_conj(int16_t *x, - int16_t *y, - uint32_t N) - -{ - // Elementwise multiplication of two complex vectors of N elements such that y = x * conj(x) = real(x)*real(x)+imag(x)*imag(x) - // x - input in the format |Re(0) Im(0) Re(1) Im(1) |,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)| - // We assume x1 with a dinamic of 31 bit maximum - // - // y - output in the format |Re0 (64bit) |,......,|Re(N-1) (64bit)| - // - // N - the size f the vectors (this function does N cpx mpy. 
WARNING: N>=4; - // - - uint32_t i; // loop counter - - int *xi = (int*) x; - long long int *yl = (long long int*) y; - - long long int temp1,temp2; - - - for(i=0; i<N/2; i++) { - // Re(a)*Re(b) - temp1 = ((long long int) xi[0])* ((long long int) xi[0]); - // Im(a)*Im(b) - temp2 = ((long long int) xi[1])* ((long long int) xi[1]); - yl[0] = temp1+temp2; - - temp1 = ((long long int) xi[2])* ((long long int) xi[2]); - temp2 = ((long long int) xi[3])* ((long long int) xi[3]); - yl[1] = temp1+temp2; - - /* - #ifdef USER_MODE - printf("i=%d\n",i); - printf("x1 : %d,%d,%d,%d\n",x1i[0],x1i[1],x1i[2],x1i[3]); - printf("x2 : %d,%d,%d,%d\n",x2i[0],x2i[1],x2i[2],x2i[3]); - printf("temp : %lld,%lld\n",temp1,temp2); - printf("y : %lld,%lld\n",yl[0],yl[1]); - #endif - */ - - xi+=4; - yl +=2; - } - - return(0); -} - - -int shift_and_pack(int16_t *y, - uint32_t N, - int output_shift) -{ - uint32_t i; // loop counter - - //register __m128i m0,m1; - - /* - #ifdef USER_MODE - int16_t *temps; - int *tempd; - __m128i temp; - #endif - */ - - __m128i *y_128; - - - shift = _mm_cvtsi32_si128(output_shift); - y_128 = (__m128i *)&y[0]; - - - // we compute 4 cpx multiply for each loop - for(i=0; i<(N>>3); i++) { - /* - #ifdef USER_MODE - printf("i=%d\n",i); - temps = (int16_t *)x1_128; - printf("x1 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - temps = (int16_t *)x2_128; - printf("x2 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - #endif - */ - - //unroll 0 - m0 = _mm_sra_epi32(y_128[0],shift); // 1- shift right by shift in order to compensate for the input amplitude - m0 = _mm_packs_epi32(m0,m0); // 1- pack in a 128 bit register [re im re im] - y_128[0] = _mm_unpacklo_epi32(m0,m0); // 1- pack in a 128 bit register [re im re im] - - // temps = (int16_t *)&y_128[0]; - // printf("y0 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - - //unroll 1 - m1 = _mm_sra_epi32(y_128[1],shift); // 1- shift right by shift in order to compensate for the input amplitude - m1 = _mm_packs_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im] - y_128[1] = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im] - - //unroll 2 - m1 = _mm_sra_epi32(y_128[2],shift); // 1- shift right by shift in order to compensate for the input amplitude - m1 = _mm_packs_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im] - y_128[2] = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im] - - //unroll 3 - m1 = _mm_sra_epi32(y_128[3],shift); // 1- shift right by shift in order to compensate for the input amplitude - m1 = _mm_packs_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im] - y_128[3] = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im] - - y_128 +=4; - } - - _mm_empty(); - _m_empty(); - - return(0); -} - - -#ifdef MAIN -#define L 16 - -main () -{ - - int16_t input[256] __attribute__((aligned(16))); - int16_t input2[256] __attribute__((aligned(16))); - int16_t output[256] __attribute__((aligned(16))); - - int i; - - input[0] = 100; - input[1] = 200; - input[2] = -200; - input[3] = 100; - input[4] = 1000; - input[5] = 2000; - input[6] = -2000; - input[7] = 1000; - input[8] = 100; - input[9] = 200; - input[10] = -200; - input[11] = 100; - input[12] = 1000; - input[13] = 2000; - input[14] = -2000; - input[15] = 1000; - - input2[0] = 1; - input2[1] = 2; - input2[2] = 1; - input2[3] = 2; - input2[4] = 10; 
- input2[5] = 20; - input2[6] = 10; - input2[7] = 20; - input2[8] = 1; - input2[9] = 2; - input2[10] = 1; - input2[11] = 2; - input2[12] = 1000; - input2[13] = 2000; - input2[14] = 1000; - input2[15] = 2000; - - - mult_cpx_vector32_conj(input,output,8); - - -} - -#endif //MAIN - - -#else //EXPRESSMIMO_TARGET - -/* -int mult_cpx_vector(int16_t *x1, - int16_t *x2, - int16_t *y, - uint32_t N, - uint16_t output_shift) -{ - -} -*/ - -#endif //EXPRESSMIMO_TARGET diff --git a/openair1/PHY/TOOLS/defs.h b/openair1/PHY/TOOLS/defs.h index d9783041e4..7d8942dd72 100644 --- a/openair1/PHY/TOOLS/defs.h +++ b/openair1/PHY/TOOLS/defs.h @@ -339,14 +339,13 @@ void dft3072(int16_t *sigF,int16_t *sig); void dft24576(int16_t *sigF,int16_t *sig); -/*!\fn int rotate_cpx_vector(int16_t *x,int16_t *alpha,int16_t *y,uint32_t N,uint16_t output_shift, uint8_t format) +/*!\fn int32_t rotate_cpx_vector(int16_t *x,int16_t *alpha,int16_t *y,uint32_t N,uint16_t output_shift) This function performs componentwise multiplication of a vector with a complex scalar. -@param x Vector input (Q1.15) in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)| +@param x Vector input (Q1.15) in the format |Re0 Im0|,......,|Re(N-1) Im(N-1)| @param alpha Scalar input (Q1.15) in the format |Re0 Im0| -@param y Output (Q1.15) in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)| +@param y Output (Q1.15) in the format |Re0 Im0|,......,|Re(N-1) Im(N-1)| @param N Length of x WARNING: N>=4 @param output_shift Number of bits to shift output down to Q1.15 (should be 15 for Q1.15 inputs) WARNING: log2_amp>0 can cause overflow!! -@param format Format 0 indicates that alpha is in shuffled format during multiply (Re -Im Im Re), whereas 1 indicates that input is in this format (i.e. a matched filter) The function implemented is : \f$\mathbf{y} = \alpha\mathbf{x}\f$ */ @@ -354,49 +353,15 @@ int32_t rotate_cpx_vector(int16_t *x, int16_t *alpha, int16_t *y, uint32_t N, - uint16_t output_shift, - uint8_t format); - -/*!\fn int32_t rotate_cpx_vector2(int16_t *x,int16_t *alpha,int16_t *y,uint32_t N,uint16_t output_shift,uint8_t format) -This function performs componentwise multiplication of a vector with a complex scalar. -@param x Vector input (Q1.15) in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)| -@param alpha Scalar input (Q1.15) in the format |Re0 Im0| -@param y Output (Q1.15) in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)| -@param N Length of x WARNING: N must be multiple of 2 (the routine performs two complex multiplies per cycle) -@param output_shift Number of bits to shift output down to Q1.15 (should be 15 for Q1.15 inputs) WARNING: log2_amp>0 can cause overflow!! -@param format Format 0 indicates that alpha is in shuffled format during multiply (Re -Im Im Re), whereas 1 indicates that input is in this format (i.e. a matched filter) -The function implemented is : \f$\mathbf{y} = \alpha\mathbf{x}\f$ -*/ -int32_t rotate_cpx_vector2(int16_t *x, - int16_t *alpha, - int16_t *y, - uint32_t N, - uint16_t output_shift, - uint8_t format); - -/*!\fn int32_t rotate_cpx_vector_norep(int16_t *x,int16_t *alpha,int16_t *y,uint32_t N,uint16_t output_shift) -This function performs componentwise multiplication of a vector with a complex scalar. 
-@param x Vector input (Q1.15) in the format |Re0 Im0|,......,|Re(N-1) Im(N-1)| -@param alpha Scalar input (Q1.15) in the format |Re0 Im0| -@param y Output (Q1.15) in the format |Re0 Im0|,......,|Re(N-1) Im(N-1)| -@param N Length of x WARNING: N>=4 -@param output_shift Number of bits to shift output down to Q1.15 (should be 15 for Q1.15 inputs) WARNING: log2_amp>0 can cause overflow!! - -The function implemented is : \f$\mathbf{y} = \alpha\mathbf{x}\f$ -*/ -int32_t rotate_cpx_vector_norep(int16_t *x, - int16_t *alpha, - int16_t *y, - uint32_t N, - uint16_t output_shift); + uint16_t output_shift); /*!\fn int32_t add_cpx_vector(int16_t *x,int16_t *alpha,int16_t *y,uint32_t N) This function performs componentwise addition of a vector with a complex scalar. -@param x Vector input (Q1.15) in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)| +@param x Vector input (Q1.15) in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)| @param alpha Scalar input (Q1.15) in the format |Re0 Im0| -@param y Output (Q1.15) in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)| +@param y Output (Q1.15) in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)| @param N Length of x WARNING: N>=4 The function implemented is : \f$\mathbf{y} = \alpha + \mathbf{x}\f$ diff --git a/openair1/PHY/TOOLS/lte_dfts.c b/openair1/PHY/TOOLS/lte_dfts.c index 6a79a9bd3a..f2eb3f0e65 100644 --- a/openair1/PHY/TOOLS/lte_dfts.c +++ b/openair1/PHY/TOOLS/lte_dfts.c @@ -40,6 +40,7 @@ #include "defs.h" #else #include "time_meas.h" +#include <math.h> #define debug_msg #define ONE_OVER_SQRT2_Q15 23170 @@ -49,12 +50,15 @@ #include "PHY/sse_intrin.h" +#define print_shorts(s,x) printf("%s %d,%d,%d,%d,%d,%d,%d,%d\n",s,(x)[0],(x)[1],(x)[2],(x)[3],(x)[4],(x)[5],(x)[6],(x)[7]) +#define print_ints(s,x) printf("%s %d %d %d %d\n",s,(x)[0],(x)[1],(x)[2],(x)[3]) static int16_t conjugatedft[8] __attribute__((aligned(16))) = {-1,1,-1,1,-1,1,-1,1} ; static short reflip[8] __attribute__((aligned(16))) = {1,-1,1,-1,1,-1,1,-1}; +#if defined(__x86_64__) || defined(__i386__) static inline void cmac(__m128i a,__m128i b, __m128i *re32, __m128i *im32) __attribute__((always_inline)); static inline void cmac(__m128i a,__m128i b, __m128i *re32, __m128i *im32) { @@ -122,8 +126,6 @@ static inline void cmultc(__m128i a,__m128i b, __m128i *re32, __m128i *im32) *re32 = _mm_madd_epi16(a,b); mmtmpb = _mm_sign_epi16(b,*(__m128i*)reflip); - // mmtmpb = _mm_shufflelo_epi16(mmtmpb,_MM_SHUFFLE(2,3,0,1)); - // mmtmpb = _mm_shufflehi_epi16(mmtmpb,_MM_SHUFFLE(2,3,0,1)); mmtmpb = _mm_shuffle_epi8(mmtmpb,_mm_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2)); *im32 = _mm_madd_epi16(a,mmtmpb); @@ -178,34 +180,166 @@ static inline __m128i packed_cmult2(__m128i a,__m128i b,__m128i b2) cre = _mm_madd_epi16(a,b); cim = _mm_madd_epi16(a,b2); - /* - mmtmpb = _mm_sign_epi16(b,*(__m128i*)reflip); - cre = _mm_madd_epi16(a,mmtmpb); - mmtmpb = _mm_shufflelo_epi16(b,_MM_SHUFFLE(2,3,0,1)); - mmtmpb = _mm_shufflehi_epi16(mmtmpb,_MM_SHUFFLE(2,3,0,1)); - cim = _mm_madd_epi16(a,mmtmpb); - */ - /* - __m128i cre,cim; - cmult(a,b,&cre,&cim); - */ return(cpack(cre,cim)); } -/* -static inline __m128i packed_cmultc2(__m128i a,__m128i b,__m128i b2) __attribute__((always_inline)); +#elif defined (__arm__) +static inline void cmac(int16x8_t a,int16x8_t b, int32x4_t *re32, int32x4_t *im32) __attribute__((always_inline)); +static inline void cmac(int16x8_t a,int16x8_t b, int32x4_t *re32, int32x4_t *im32) +{ -static inline __m128i 
packed_cmultc2(__m128i a,__m128i b,__m128i b2) { + + int32x4_t ab_re0,ab_re1,ab_im0,ab_im1; + int16x8_t bflip = vrev32q_s16(b); + int16x8_t bconj = vmulq_s16(b,*(int16x8_t *)reflip); + + ab_re0 = vmull_s16(((int16x4_t*)&a)[0],((int16x4_t*)&bconj)[0]); + ab_re1 = vmull_s16(((int16x4_t*)&a)[1],((int16x4_t*)&bconj)[1]); + ab_im0 = vmull_s16(((int16x4_t*)&a)[0],((int16x4_t*)&bflip)[0]); + ab_im1 = vmull_s16(((int16x4_t*)&a)[1],((int16x4_t*)&bflip)[1]); + *re32 = vqaddq_s32(*re32,vcombine_s32(vpadd_s32(((int32x2_t*)&ab_re0)[0],((int32x2_t*)&ab_re0)[1]), + vpadd_s32(((int32x2_t*)&ab_re1)[0],((int32x2_t*)&ab_re1)[1]))); + *im32 = vqaddq_s32(*im32,vcombine_s32(vpadd_s32(((int32x2_t*)&ab_im0)[0],((int32x2_t*)&ab_im0)[1]), + vpadd_s32(((int32x2_t*)&ab_im1)[0],((int32x2_t*)&ab_im1)[1]))); +} - __m128i cre,cim; +static inline void cmacc(int16x8_t a,int16x8_t b, int32x4_t *re32, int32x4_t *im32) __attribute__((always_inline)); +static inline void cmacc(int16x8_t a,int16x8_t b, int32x4_t *re32, int32x4_t *im32) +{ + int32x4_t ab_re0,ab_re1,ab_im0,ab_im1; + int16x8_t bconj = vmulq_s16(b,*(int16x8_t *)reflip); + int16x8_t bflip = vrev32q_s16(bconj); + + ab_re0 = vmull_s16(((int16x4_t*)&a)[0],((int16x4_t*)&b)[0]); + ab_re1 = vmull_s16(((int16x4_t*)&a)[1],((int16x4_t*)&b)[1]); + ab_im0 = vmull_s16(((int16x4_t*)&a)[0],((int16x4_t*)&bflip)[0]); + ab_im1 = vmull_s16(((int16x4_t*)&a)[1],((int16x4_t*)&bflip)[1]); + *re32 = vqaddq_s32(*re32,vcombine_s32(vpadd_s32(((int32x2_t*)&ab_re0)[0],((int32x2_t*)&ab_re0)[1]), + vpadd_s32(((int32x2_t*)&ab_re1)[0],((int32x2_t*)&ab_re1)[1]))); + *im32 = vqaddq_s32(*im32,vcombine_s32(vpadd_s32(((int32x2_t*)&ab_im0)[0],((int32x2_t*)&ab_im0)[1]), + vpadd_s32(((int32x2_t*)&ab_im1)[0],((int32x2_t*)&ab_im1)[1]))); + +} + +static inline void cmult(int16x8_t a,int16x8_t b, int32x4_t *re32, int32x4_t *im32) __attribute__((always_inline)); +static inline void cmult(int16x8_t a,int16x8_t b, int32x4_t *re32, int32x4_t *im32) +{ + int32x4_t ab_re0,ab_re1,ab_im0,ab_im1; + int16x8_t bflip = vrev32q_s16(b); + int16x8_t bconj = vmulq_s16(b,*(int16x8_t *)reflip); + int16x4_t al,ah,bcl,bch,bfl,bfh; + int32x2_t abr0l,abr0h,abr1l,abr1h,abi0l,abi0h,abi1l,abi1h; + + al = vget_low_s16(a); ah = vget_high_s16(a); + bcl = vget_low_s16(bconj); bch = vget_high_s16(bconj); + bfl = vget_low_s16(bflip); bfh = vget_high_s16(bflip); + + ab_re0 = vmull_s16(al,bcl); + ab_re1 = vmull_s16(ah,bch); + ab_im0 = vmull_s16(al,bfl); + ab_im1 = vmull_s16(ah,bfh); + abr0l = vget_low_s32(ab_re0); abr0h = vget_high_s32(ab_re0); + abr1l = vget_low_s32(ab_re1); abr1h = vget_high_s32(ab_re1); + abi0l = vget_low_s32(ab_im0); abi0h = vget_high_s32(ab_im0); + abi1l = vget_low_s32(ab_im1); abi1h = vget_high_s32(ab_im1); + + *re32 = vcombine_s32(vpadd_s32(abr0l,abr0h), + vpadd_s32(abr1l,abr1h)); + *im32 = vcombine_s32(vpadd_s32(abi0l,abi0h), + vpadd_s32(abi1l,abi1h)); +} + +static inline void cmultc(int16x8_t a,int16x8_t b, int32x4_t *re32, int32x4_t *im32) __attribute__((always_inline)); + +static inline void cmultc(int16x8_t a,int16x8_t b, int32x4_t *re32, int32x4_t *im32) +{ + int32x4_t ab_re0,ab_re1,ab_im0,ab_im1; + int16x8_t bconj = vmulq_s16(b,*(int16x8_t *)reflip); + int16x8_t bflip = vrev32q_s16(bconj); + int16x4_t al,ah,bl,bh,bfl,bfh; + int32x2_t abr0l,abr0h,abr1l,abr1h,abi0l,abi0h,abi1l,abi1h; + al = vget_low_s16(a); ah = vget_high_s16(a); + bl = vget_low_s16(b); bh = vget_high_s16(b); + bfl = vget_low_s16(bflip); bfh = vget_high_s16(bflip); + + ab_re0 = vmull_s16(al,bl); + ab_re1 = vmull_s16(ah,bh); + ab_im0 = 
vmull_s16(al,bfl); + ab_im1 = vmull_s16(ah,bfh); + + abr0l = vget_low_s32(ab_re0); abr0h = vget_high_s32(ab_re0); + abr1l = vget_low_s32(ab_re1); abr1h = vget_high_s32(ab_re1); + abi0l = vget_low_s32(ab_im0); abi0h = vget_high_s32(ab_im0); + abi1l = vget_low_s32(ab_im1); abi1h = vget_high_s32(ab_im1); + + *re32 = vcombine_s32(vpadd_s32(abr0l,abr0h), + vpadd_s32(abr1l,abr1h)); + *im32 = vcombine_s32(vpadd_s32(abi0l,abi0h), + vpadd_s32(abi1l,abi1h)); + +} + + +static inline int16x8_t cpack(int32x4_t xre,int32x4_t xim) __attribute__((always_inline)); + +static inline int16x8_t cpack(int32x4_t xre,int32x4_t xim) +{ + int32x4x2_t xtmp; + + xtmp = vzipq_s32(xre,xim); + return(vcombine_s16(vqshrn_n_s32(xtmp.val[0],15),vqshrn_n_s32(xtmp.val[1],15))); + +} + + +static inline void packed_cmult(int16x8_t a,int16x8_t b, int16x8_t *c) __attribute__((always_inline)); + +static inline void packed_cmult(int16x8_t a,int16x8_t b, int16x8_t *c) +{ + + int32x4_t cre,cim; + cmult(a,b,&cre,&cim); + *c = cpack(cre,cim); + +} + + +static inline void packed_cmultc(int16x8_t a,int16x8_t b, int16x8_t *c) __attribute__((always_inline)); + +static inline void packed_cmultc(int16x8_t a,int16x8_t b, int16x8_t *c) +{ + + int32x4_t cre,cim; cmultc(a,b,&cre,&cim); + *c = cpack(cre,cim); + +} + +static inline int16x8_t packed_cmult2(int16x8_t a,int16x8_t b, int16x8_t b2) __attribute__((always_inline)); + +static inline int16x8_t packed_cmult2(int16x8_t a,int16x8_t b, int16x8_t b2) +{ + + + + int32x4_t ab_re0,ab_re1,ab_im0,ab_im1,cre,cim; + + ab_re0 = vmull_s16(((int16x4_t*)&a)[0],((int16x4_t*)&b)[0]); + ab_re1 = vmull_s16(((int16x4_t*)&a)[1],((int16x4_t*)&b)[1]); + ab_im0 = vmull_s16(((int16x4_t*)&a)[0],((int16x4_t*)&b2)[0]); + ab_im1 = vmull_s16(((int16x4_t*)&a)[1],((int16x4_t*)&b2)[1]); + cre = vcombine_s32(vpadd_s32(((int32x2_t*)&ab_re0)[0],((int32x2_t*)&ab_re0)[1]), + vpadd_s32(((int32x2_t*)&ab_re1)[0],((int32x2_t*)&ab_re1)[1])); + cim = vcombine_s32(vpadd_s32(((int32x2_t*)&ab_im0)[0],((int32x2_t*)&ab_im0)[1]), + vpadd_s32(((int32x2_t*)&ab_im1)[0],((int32x2_t*)&ab_im1)[1])); return(cpack(cre,cim)); } -*/ + +#endif static int16_t W0s[8]__attribute__((aligned(16))) = {32767,0,32767,0,32767,0,32767,0}; @@ -217,6 +351,7 @@ static int16_t W25s[8]__attribute__((aligned(16))) = {-26509,-19260,-26509,-1926 static int16_t W35s[8]__attribute__((aligned(16))) = {-26510,19260,-26510,19260,-26510,19260,-26510,19260}; static int16_t W45s[8]__attribute__((aligned(16))) = {10126,31163,10126,31163,10126,31163,10126,31163}; +#if defined(__x86_64__) || defined(__i386__) __m128i *W0 = (__m128i *)W0s; __m128i *W13 = (__m128i *)W13s; __m128i *W23 = (__m128i *)W23s; @@ -224,7 +359,15 @@ __m128i *W15 = (__m128i *)W15s; __m128i *W25 = (__m128i *)W25s; __m128i *W35 = (__m128i *)W35s; __m128i *W45 = (__m128i *)W45s; - +#elif defined(__arm__) +int16x8_t *W0 = (int16x8_t *)W0s; +int16x8_t *W13 = (int16x8_t *)W13s; +int16x8_t *W23 = (int16x8_t *)W23s; +int16x8_t *W15 = (int16x8_t *)W15s; +int16x8_t *W25 = (int16x8_t *)W25s; +int16x8_t *W35 = (int16x8_t *)W35s; +int16x8_t *W45 = (int16x8_t *)W45s; +#endif static int16_t dft_norm_table[16] = {9459, //12 6689,//24 5461,//36 @@ -244,6 +387,7 @@ static int16_t dft_norm_table[16] = {9459, //12 }; //sqrt(5) //300 +#if defined(__x86_64__) || defined(__i386__) static inline void bfly2(__m128i *x0, __m128i *x1,__m128i *y0, __m128i *y1,__m128i *tw)__attribute__((always_inline)); static inline void bfly2(__m128i *x0, __m128i *x1,__m128i *y0, __m128i *y1,__m128i *tw) @@ -270,6 +414,31 @@ static inline void 
bfly2(__m128i *x0, __m128i *x1,__m128i *y0, __m128i *y1,__m12 *y1 = _mm_packs_epi32(bfly2_tmp1,bfly2_tmp2); } +#elif defined(__arm__) + +static inline void bfly2(int16x8_t *x0, int16x8_t *x1,int16x8_t *y0, int16x8_t *y1,int16x8_t *tw)__attribute__((always_inline)); + +static inline void bfly2(int16x8_t *x0, int16x8_t *x1,int16x8_t *y0, int16x8_t *y1,int16x8_t *tw) +{ + + int32x4_t x0r_2,x0i_2,x1r_2,x1i_2,dy0r,dy1r,dy0i,dy1i; + + cmult(*(x0),*(W0),&x0r_2,&x0i_2); + cmult(*(x1),*(tw),&x1r_2,&x1i_2); + + dy0r = vqaddq_s32(x0r_2,x1r_2); + dy1r = vqsubq_s32(x0r_2,x1r_2); + dy0i = vqaddq_s32(x0i_2,x1i_2); + dy1i = vqsubq_s32(x0i_2,x1i_2); + + *y0 = cpack(dy0r,dy0i); + *y1 = cpack(dy1r,dy1i); +} + + +#endif + +#if defined(__x86_64__) || defined(__i386__) static inline void bfly2_tw1(__m128i *x0, __m128i *x1, __m128i *y0, __m128i *y1)__attribute__((always_inline)); static inline void bfly2_tw1(__m128i *x0, __m128i *x1, __m128i *y0, __m128i *y1) @@ -280,6 +449,20 @@ static inline void bfly2_tw1(__m128i *x0, __m128i *x1, __m128i *y0, __m128i *y1) } +#elif defined(__arm__) + +static inline void bfly2_tw1(int16x8_t *x0, int16x8_t *x1, int16x8_t *y0, int16x8_t *y1)__attribute__((always_inline)); + +static inline void bfly2_tw1(int16x8_t *x0, int16x8_t *x1, int16x8_t *y0, int16x8_t *y1) +{ + + *y0 = vqaddq_s16(*x0,*x1); + *y1 = vqsubq_s16(*x0,*x1); + +} +#endif + +#if defined(__x86_64__) || defined(__i386__) static inline void bfly2_16(__m128i *x0, __m128i *x1, __m128i *y0, __m128i *y1, __m128i *tw, __m128i *twb)__attribute__((always_inline)); static inline void bfly2_16(__m128i *x0, __m128i *x1, __m128i *y0, __m128i *y1, __m128i *tw, __m128i *twb) @@ -295,6 +478,25 @@ static inline void bfly2_16(__m128i *x0, __m128i *x1, __m128i *y0, __m128i *y1, } +#elif defined(__arm__) + +static inline void bfly2_16(int16x8_t *x0, int16x8_t *x1, int16x8_t *y0, int16x8_t *y1, int16x8_t *tw, int16x8_t *twb)__attribute__((always_inline)); + +static inline void bfly2_16(int16x8_t *x0, int16x8_t *x1, int16x8_t *y0, int16x8_t *y1, int16x8_t *tw, int16x8_t *twb) +{ + + register int16x8_t x1t; + + x1t = packed_cmult2(*(x1),*(tw),*(twb)); + + *y0 = vqaddq_s16(*x0,x1t); + *y1 = vqsubq_s16(*x0,x1t); + +} + +#endif + +#if defined(__x86_64__) || defined(__i386__) static inline void ibfly2(__m128i *x0, __m128i *x1,__m128i *y0, __m128i *y1,__m128i *tw)__attribute__((always_inline)); static inline void ibfly2(__m128i *x0, __m128i *x1,__m128i *y0, __m128i *y1,__m128i *tw) @@ -320,10 +522,34 @@ static inline void ibfly2(__m128i *x0, __m128i *x1,__m128i *y0, __m128i *y1,__m1 bfly2_tmp2 = _mm_unpackhi_epi32(dy1r,dy1i); *y1 = _mm_packs_epi32(bfly2_tmp1,bfly2_tmp2); } +#elif defined(__arm__) +static inline void ibfly2(int16x8_t *x0, int16x8_t *x1,int16x8_t *y0, int16x8_t *y1,int16x8_t *tw) +{ + + int32x4_t x0r_2,x0i_2,x1r_2,x1i_2,dy0r,dy1r,dy0i,dy1i; + + cmultc(*(x0),*(W0),&x0r_2,&x0i_2); + cmultc(*(x1),*(tw),&x1r_2,&x1i_2); + + dy0r = vqaddq_s32(x0r_2,x1r_2); + dy1r = vqsubq_s32(x0r_2,x1r_2); + dy0i = vqaddq_s32(x0i_2,x1i_2); + dy1i = vqsubq_s32(x0i_2,x1i_2); + + *y0 = cpack(dy0r,dy0i); + *y1 = cpack(dy1r,dy1i); + +} + +#endif + // This is the radix-3 butterfly (fft) + +#if defined(__x86_64__) || defined(__i386__) + static inline void bfly3(__m128i *x0,__m128i *x1,__m128i *x2, __m128i *y0,__m128i *y1,__m128i *y2, __m128i *tw1,__m128i *tw2) __attribute__((always_inline)); @@ -348,6 +574,35 @@ static inline void bfly3(__m128i *x0,__m128i *x1,__m128i *x2, *(y2) = _mm_adds_epi16(*(x0),*(y2)); } +#elif defined(__arm__) +static inline void 
bfly3(int16x8_t *x0,int16x8_t *x1,int16x8_t *x2, + int16x8_t *y0,int16x8_t *y1,int16x8_t *y2, + int16x8_t *tw1,int16x8_t *tw2) __attribute__((always_inline)); + +static inline void bfly3(int16x8_t *x0,int16x8_t *x1,int16x8_t *x2, + int16x8_t *y0,int16x8_t *y1,int16x8_t *y2, + int16x8_t *tw1,int16x8_t *tw2) +{ + + int32x4_t tmpre,tmpim; + int16x8_t x1_2,x2_2; + + packed_cmult(*(x1),*(tw1),&x1_2); + packed_cmult(*(x2),*(tw2),&x2_2); + *(y0) = vqaddq_s16(*(x0),vqaddq_s16(x1_2,x2_2)); + cmult(x1_2,*(W13),&tmpre,&tmpim); + cmac(x2_2,*(W23),&tmpre,&tmpim); + *(y1) = cpack(tmpre,tmpim); + *(y1) = vqaddq_s16(*(x0),*(y1)); + cmult(x1_2,*(W23),&tmpre,&tmpim); + cmac(x2_2,*(W13),&tmpre,&tmpim); + *(y2) = cpack(tmpre,tmpim); + *(y2) = vqaddq_s16(*(x0),*(y2)); +} + +#endif + +#if defined(__x86_64__) || defined(__i386__) static inline void ibfly3(__m128i *x0,__m128i *x1,__m128i *x2, __m128i *y0,__m128i *y1,__m128i *y2, __m128i *tw1,__m128i *tw2) __attribute__((always_inline)); @@ -372,6 +627,34 @@ static inline void ibfly3(__m128i *x0,__m128i *x1,__m128i *x2, *(y2) = _mm_adds_epi16(*(x0),*(y2)); } +#elif defined(__arm__) +static inline void ibfly3(int16x8_t *x0,int16x8_t *x1,int16x8_t *x2, + int16x8_t *y0,int16x8_t *y1,int16x8_t *y2, + int16x8_t *tw1,int16x8_t *tw2) __attribute__((always_inline)); + +static inline void ibfly3(int16x8_t *x0,int16x8_t *x1,int16x8_t *x2, + int16x8_t *y0,int16x8_t *y1,int16x8_t *y2, + int16x8_t *tw1,int16x8_t *tw2) +{ + + int32x4_t tmpre,tmpim; + int16x8_t x1_2,x2_2; + + packed_cmultc(*(x1),*(tw1),&x1_2); + packed_cmultc(*(x2),*(tw2),&x2_2); + *(y0) = vqaddq_s16(*(x0),vqaddq_s16(x1_2,x2_2)); + cmultc(x1_2,*(W13),&tmpre,&tmpim); + cmacc(x2_2,*(W23),&tmpre,&tmpim); + *(y1) = cpack(tmpre,tmpim); + *(y1) = vqaddq_s16(*(x0),*(y1)); + cmultc(x1_2,*(W23),&tmpre,&tmpim); + cmacc(x2_2,*(W13),&tmpre,&tmpim); + *(y2) = cpack(tmpre,tmpim); + *(y2) = vqaddq_s16(*(x0),*(y2)); +} +#endif + +#if defined(__x86_64__) || defined(__i386__) static inline void bfly3_tw1(__m128i *x0,__m128i *x1,__m128i *x2, __m128i *y0,__m128i *y1,__m128i *y2) __attribute__((always_inline)); @@ -391,8 +674,31 @@ static inline void bfly3_tw1(__m128i *x0,__m128i *x1,__m128i *x2, *(y2) = cpack(tmpre,tmpim); *(y2) = _mm_adds_epi16(*(x0),*(y2)); } +#elif defined(__arm__) +static inline void bfly3_tw1(int16x8_t *x0,int16x8_t *x1,int16x8_t *x2, + int16x8_t *y0,int16x8_t *y1,int16x8_t *y2) __attribute__((always_inline)); + +static inline void bfly3_tw1(int16x8_t *x0,int16x8_t *x1,int16x8_t *x2, + int16x8_t *y0,int16x8_t *y1,int16x8_t *y2) +{ + + int32x4_t tmpre,tmpim; + + *(y0) = vqaddq_s16(*(x0),vqaddq_s16(*(x1),*(x2))); + cmult(*(x1),*(W13),&tmpre,&tmpim); + cmac(*(x2),*(W23),&tmpre,&tmpim); + *(y1) = cpack(tmpre,tmpim); + *(y1) = vqaddq_s16(*(x0),*(y1)); + cmult(*(x1),*(W23),&tmpre,&tmpim); + cmac(*(x2),*(W13),&tmpre,&tmpim); + *(y2) = cpack(tmpre,tmpim); + *(y2) = vqaddq_s16(*(x0),*(y2)); + +} +#endif +#if defined(__x86_64__) || defined(__i386__) static inline void bfly4(__m128i *x0,__m128i *x1,__m128i *x2,__m128i *x3, __m128i *y0,__m128i *y1,__m128i *y2,__m128i *y3, __m128i *tw1,__m128i *tw2,__m128i *tw3)__attribute__((always_inline)); @@ -434,6 +740,51 @@ static inline void bfly4(__m128i *x0,__m128i *x1,__m128i *x2,__m128i *x3, *(y3) = _mm_add_epi16(*(x0),cpack(dy3r,dy3i)); } +#elif defined(__arm__) +static inline void bfly4(int16x8_t *x0,int16x8_t *x1,int16x8_t *x2,int16x8_t *x3, + int16x8_t *y0,int16x8_t *y1,int16x8_t *y2,int16x8_t *y3, + int16x8_t *tw1,int16x8_t *tw2,int16x8_t 
*tw3)__attribute__((always_inline)); + +static inline void bfly4(int16x8_t *x0,int16x8_t *x1,int16x8_t *x2,int16x8_t *x3, + int16x8_t *y0,int16x8_t *y1,int16x8_t *y2,int16x8_t *y3, + int16x8_t *tw1,int16x8_t *tw2,int16x8_t *tw3) +{ + + int32x4_t x1r_2,x1i_2,x2r_2,x2i_2,x3r_2,x3i_2,dy0r,dy0i,dy1r,dy1i,dy2r,dy2i,dy3r,dy3i; + + // cmult(*(x0),*(W0),&x0r_2,&x0i_2); + cmult(*(x1),*(tw1),&x1r_2,&x1i_2); + cmult(*(x2),*(tw2),&x2r_2,&x2i_2); + cmult(*(x3),*(tw3),&x3r_2,&x3i_2); + // dy0r = _mm_add_epi32(x0r_2,_mm_add_epi32(x1r_2,_mm_add_epi32(x2r_2,x3r_2))); + // dy0i = _mm_add_epi32(x0i_2,_mm_add_epi32(x1i_2,_mm_add_epi32(x2i_2,x3i_2))); + // *(y0) = cpack(dy0r,dy0i); + dy0r = vqaddq_s32(x1r_2,vqaddq_s32(x2r_2,x3r_2)); + dy0i = vqaddq_s32(x1i_2,vqaddq_s32(x2i_2,x3i_2)); + *(y0) = vqaddq_s16(*(x0),cpack(dy0r,dy0i)); + // dy1r = _mm_add_epi32(x0r_2,_mm_sub_epi32(x1i_2,_mm_add_epi32(x2r_2,x3i_2))); + // dy1i = _mm_sub_epi32(x0i_2,_mm_add_epi32(x1r_2,_mm_sub_epi32(x2i_2,x3r_2))); + // *(y1) = cpack(dy1r,dy1i); + dy1r = vqsubq_s32(x1i_2,vqaddq_s32(x2r_2,x3i_2)); + dy1i = vqsubq_s32(vqsubq_s32(x3r_2,x2i_2),x1r_2); + *(y1) = vqaddq_s16(*(x0),cpack(dy1r,dy1i)); + // dy2r = _mm_sub_epi32(x0r_2,_mm_sub_epi32(x1r_2,_mm_sub_epi32(x2r_2,x3r_2))); + // dy2i = _mm_sub_epi32(x0i_2,_mm_sub_epi32(x1i_2,_mm_sub_epi32(x2i_2,x3i_2))); + // *(y2) = cpack(dy2r,dy2i); + dy2r = vqsubq_s32(vqsubq_s32(x2r_2,x3r_2),x1r_2); + dy2i = vqsubq_s32(vqsubq_s32(x2i_2,x3i_2),x1i_2); + *(y2) = vqaddq_s16(*(x0),cpack(dy2r,dy2i)); + // dy3r = _mm_sub_epi32(x0r_2,_mm_add_epi32(x1i_2,_mm_sub_epi32(x2r_2,x3i_2))); + // dy3i = _mm_add_epi32(x0i_2,_mm_sub_epi32(x1r_2,_mm_add_epi32(x2i_2,x3r_2))); + // *(y3) = cpack(dy3r,dy3i); + dy3r = vqsubq_s32(vqsubq_s32(x3i_2,x2r_2),x1i_2); + dy3i = vqsubq_s32(x1r_2,vqaddq_s32(x2i_2,x3r_2)); + *(y3) = vqaddq_s16(*(x0),cpack(dy3r,dy3i)); +} + +#endif + +#if defined(__x86_64__) || defined(__i386__) static inline void ibfly4(__m128i *x0,__m128i *x1,__m128i *x2,__m128i *x3, __m128i *y0,__m128i *y1,__m128i *y2,__m128i *y3, __m128i *tw1,__m128i *tw2,__m128i *tw3)__attribute__((always_inline)); @@ -445,24 +796,11 @@ static inline void ibfly4(__m128i *x0,__m128i *x1,__m128i *x2,__m128i *x3, __m128i x1r_2,x1i_2,x2r_2,x2i_2,x3r_2,x3i_2,dy0r,dy0i,dy1r,dy1i,dy2r,dy2i,dy3r,dy3i; - // cmultc(*(x0),*(W0),&x0r_2,&x0i_2); + cmultc(*(x1),*(tw1),&x1r_2,&x1i_2); cmultc(*(x2),*(tw2),&x2r_2,&x2i_2); cmultc(*(x3),*(tw3),&x3r_2,&x3i_2); - /* - dy0r = _mm_add_epi32(x0r_2,_mm_add_epi32(x1r_2,_mm_add_epi32(x2r_2,x3r_2))); - dy0i = _mm_add_epi32(x0i_2,_mm_add_epi32(x1i_2,_mm_add_epi32(x2i_2,x3i_2))); - *(y0) = cpack(dy0r,dy0i); - dy3r = _mm_add_epi32(x0r_2,_mm_sub_epi32(x1i_2,_mm_add_epi32(x2r_2,x3i_2))); - dy3i = _mm_sub_epi32(x0i_2,_mm_add_epi32(x1r_2,_mm_sub_epi32(x2i_2,x3r_2))); - *(y3) = cpack(dy3r,dy3i); - dy2r = _mm_sub_epi32(x0r_2,_mm_sub_epi32(x1r_2,_mm_sub_epi32(x2r_2,x3r_2))); - dy2i = _mm_sub_epi32(x0i_2,_mm_sub_epi32(x1i_2,_mm_sub_epi32(x2i_2,x3i_2))); - *(y2) = cpack(dy2r,dy2i); - dy1r = _mm_sub_epi32(x0r_2,_mm_add_epi32(x1i_2,_mm_sub_epi32(x2r_2,x3i_2))); - dy1i = _mm_add_epi32(x0i_2,_mm_sub_epi32(x1r_2,_mm_add_epi32(x2i_2,x3r_2))); - *(y1) = cpack(dy1r,dy1i); - */ + dy0r = _mm_add_epi32(x1r_2,_mm_add_epi32(x2r_2,x3r_2)); dy0i = _mm_add_epi32(x1i_2,_mm_add_epi32(x2i_2,x3i_2)); *(y0) = _mm_add_epi16(*(x0),cpack(dy0r,dy0i)); @@ -477,6 +815,41 @@ static inline void ibfly4(__m128i *x0,__m128i *x1,__m128i *x2,__m128i *x3, *(y1) = _mm_add_epi16(*(x0),cpack(dy1r,dy1i)); } +#elif defined(__arm__) + +static inline void 
ibfly4(int16x8_t *x0,int16x8_t *x1,int16x8_t *x2,int16x8_t *x3, + int16x8_t *y0,int16x8_t *y1,int16x8_t *y2,int16x8_t *y3, + int16x8_t *tw1,int16x8_t *tw2,int16x8_t *tw3)__attribute__((always_inline)); + +static inline void ibfly4(int16x8_t *x0,int16x8_t *x1,int16x8_t *x2,int16x8_t *x3, + int16x8_t *y0,int16x8_t *y1,int16x8_t *y2,int16x8_t *y3, + int16x8_t *tw1,int16x8_t *tw2,int16x8_t *tw3) +{ + + int32x4_t x1r_2,x1i_2,x2r_2,x2i_2,x3r_2,x3i_2,dy0r,dy0i,dy1r,dy1i,dy2r,dy2i,dy3r,dy3i; + + + cmultc(*(x1),*(tw1),&x1r_2,&x1i_2); + cmultc(*(x2),*(tw2),&x2r_2,&x2i_2); + cmultc(*(x3),*(tw3),&x3r_2,&x3i_2); + + dy0r = vqaddq_s32(x1r_2,vqaddq_s32(x2r_2,x3r_2)); + dy0i = vqaddq_s32(x1i_2,vqaddq_s32(x2i_2,x3i_2)); + *(y0) = vqaddq_s16(*(x0),cpack(dy0r,dy0i)); + dy3r = vqsubq_s32(x1i_2,vqaddq_s32(x2r_2,x3i_2)); + dy3i = vqsubq_s32(vqsubq_s32(x3r_2,x2i_2),x1r_2); + *(y3) = vqaddq_s16(*(x0),cpack(dy3r,dy3i)); + dy2r = vqsubq_s32(vqsubq_s32(x2r_2,x3r_2),x1r_2); + dy2i = vqsubq_s32(vqsubq_s32(x2i_2,x3i_2),x1i_2); + *(y2) = vqaddq_s16(*(x0),cpack(dy2r,dy2i)); + dy1r = vqsubq_s32(vqsubq_s32(x3i_2,x2r_2),x1i_2); + dy1i = vqsubq_s32(x1r_2,vqaddq_s32(x2i_2,x3r_2)); + *(y1) = vqaddq_s16(*(x0),cpack(dy1r,dy1i)); +} + +#endif + +#if defined(__x86_64__) || defined(__i386__) static inline void bfly4_tw1(__m128i *x0,__m128i *x1,__m128i *x2,__m128i *x3, __m128i *y0,__m128i *y1,__m128i *y2,__m128i *y3)__attribute__((always_inline)); @@ -484,24 +857,42 @@ static inline void bfly4_tw1(__m128i *x0,__m128i *x1,__m128i *x2,__m128i *x3, static inline void bfly4_tw1(__m128i *x0,__m128i *x1,__m128i *x2,__m128i *x3, __m128i *y0,__m128i *y1,__m128i *y2,__m128i *y3) { - register __m128i x1_flip,x3_flip; *(y0) = _mm_adds_epi16(*(x0),_mm_adds_epi16(*(x1),_mm_adds_epi16(*(x2),*(x3)))); - x1_flip = _mm_sign_epi16(*(x1),*(__m128i*)conjugatedft); - // x1_flip = _mm_shufflelo_epi16(x1_flip,_MM_SHUFFLE(2,3,0,1)); - // x1_flip = _mm_shufflehi_epi16(x1_flip,_MM_SHUFFLE(2,3,0,1)); x1_flip = _mm_shuffle_epi8(x1_flip,_mm_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2)); x3_flip = _mm_sign_epi16(*(x3),*(__m128i*)conjugatedft); - // x3_flip = _mm_shufflelo_epi16(x3_flip,_MM_SHUFFLE(2,3,0,1)); - // x3_flip = _mm_shufflehi_epi16(x3_flip,_MM_SHUFFLE(2,3,0,1)); x3_flip = _mm_shuffle_epi8(x3_flip,_mm_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2)); *(y1) = _mm_adds_epi16(*(x0),_mm_subs_epi16(x1_flip,_mm_adds_epi16(*(x2),x3_flip))); *(y2) = _mm_subs_epi16(*(x0),_mm_subs_epi16(*(x1),_mm_subs_epi16(*(x2),*(x3)))); *(y3) = _mm_subs_epi16(*(x0),_mm_adds_epi16(x1_flip,_mm_subs_epi16(*(x2),x3_flip))); + } +#elif defined(__arm__) + +static inline void bfly4_tw1(int16x8_t *x0,int16x8_t *x1,int16x8_t *x2,int16x8_t *x3, + int16x8_t *y0,int16x8_t *y1,int16x8_t *y2,int16x8_t *y3)__attribute__((always_inline)); + +static inline void bfly4_tw1(int16x8_t *x0,int16x8_t *x1,int16x8_t *x2,int16x8_t *x3, + int16x8_t *y0,int16x8_t *y1,int16x8_t *y2,int16x8_t *y3) +{ + + register int16x8_t x1_flip,x3_flip; + + *(y0) = vqaddq_s16(*(x0),vqaddq_s16(*(x1),vqaddq_s16(*(x2),*(x3)))); + x1_flip = vrev32q_s16(vmulq_s16(*(x1),*(int16x8_t*)conjugatedft)); + x3_flip = vrev32q_s16(vmulq_s16(*(x3),*(int16x8_t*)conjugatedft)); + *(y1) = vqaddq_s16(*(x0),vqsubq_s16(x1_flip,vqaddq_s16(*(x2),x3_flip))); + *(y2) = vqsubq_s16(*(x0),vqsubq_s16(*(x1),vqsubq_s16(*(x2),*(x3)))); + *(y3) = vqsubq_s16(*(x0),vqaddq_s16(x1_flip,vqsubq_s16(*(x2),x3_flip))); +} + +#endif + +#if defined(__x86_64__) || defined(__i386__) + static inline void ibfly4_tw1(__m128i *x0,__m128i *x1,__m128i *x2,__m128i *x3, 
__m128i *y0,__m128i *y1,__m128i *y2,__m128i *y3)__attribute__((always_inline)); @@ -526,6 +917,28 @@ static inline void ibfly4_tw1(__m128i *x0,__m128i *x1,__m128i *x2,__m128i *x3, *(y3) = _mm_adds_epi16(*(x0),_mm_subs_epi16(x1_flip,_mm_adds_epi16(*(x2),x3_flip))); } + +#elif defined(__arm__) +static inline void ibfly4_tw1(int16x8_t *x0,int16x8_t *x1,int16x8_t *x2,int16x8_t *x3, + int16x8_t *y0,int16x8_t *y1,int16x8_t *y2,int16x8_t *y3)__attribute__((always_inline)); + +static inline void ibfly4_tw1(int16x8_t *x0,int16x8_t *x1,int16x8_t *x2,int16x8_t *x3, + int16x8_t *y0,int16x8_t *y1,int16x8_t *y2,int16x8_t *y3) +{ + + register int16x8_t x1_flip,x3_flip; + + *(y0) = vqaddq_s16(*(x0),vqaddq_s16(*(x1),vqaddq_s16(*(x2),*(x3)))); + x1_flip = vrev32q_s16(vmulq_s16(*(x1),*(int16x8_t*)conjugatedft)); + x3_flip = vrev32q_s16(vmulq_s16(*(x3),*(int16x8_t*)conjugatedft)); + *(y1) = vqsubq_s16(*(x0),vqaddq_s16(x1_flip,vqsubq_s16(*(x2),x3_flip))); + *(y2) = vqsubq_s16(*(x0),vqsubq_s16(*(x1),vqsubq_s16(*(x2),*(x3)))); + *(y3) = vqaddq_s16(*(x0),vqsubq_s16(x1_flip,vqaddq_s16(*(x2),x3_flip))); +} + +#endif + +#if defined(__x86_64__) || defined(__i386__) static inline void bfly4_16(__m128i *x0,__m128i *x1,__m128i *x2,__m128i *x3, __m128i *y0,__m128i *y1,__m128i *y2,__m128i *y3, __m128i *tw1,__m128i *tw2,__m128i *tw3, @@ -574,6 +987,42 @@ static inline void bfly4_16(__m128i *x0,__m128i *x1,__m128i *x2,__m128i *x3, } +#elif defined(__arm__) + +static inline void bfly4_16(int16x8_t *x0,int16x8_t *x1,int16x8_t *x2,int16x8_t *x3, + int16x8_t *y0,int16x8_t *y1,int16x8_t *y2,int16x8_t *y3, + int16x8_t *tw1,int16x8_t *tw2,int16x8_t *tw3, + int16x8_t *tw1b,int16x8_t *tw2b,int16x8_t *tw3b)__attribute__((always_inline)); + +static inline void bfly4_16(int16x8_t *x0,int16x8_t *x1,int16x8_t *x2,int16x8_t *x3, + int16x8_t *y0,int16x8_t *y1,int16x8_t *y2,int16x8_t *y3, + int16x8_t *tw1,int16x8_t *tw2,int16x8_t *tw3, + int16x8_t *tw1b,int16x8_t *tw2b,int16x8_t *tw3b) +{ + + register int16x8_t x1t,x2t,x3t,x02t,x13t; + register int16x8_t x1_flip,x3_flip; + + x1t = packed_cmult2(*(x1),*(tw1),*(tw1b)); + x2t = packed_cmult2(*(x2),*(tw2),*(tw2b)); + x3t = packed_cmult2(*(x3),*(tw3),*(tw3b)); + + + + x02t = vqaddq_s16(*(x0),x2t); + x13t = vqaddq_s16(x1t,x3t); + *(y0) = vqaddq_s16(x02t,x13t); + *(y2) = vqsubq_s16(x02t,x13t); + x1_flip = vrev32q_s16(vmulq_s16(x1t,*(int16x8_t*)conjugatedft)); + x3_flip = vrev32q_s16(vmulq_s16(x3t,*(int16x8_t*)conjugatedft)); + x02t = vqsubq_s16(*(x0),x2t); + x13t = vqsubq_s16(x1_flip,x3_flip); + *(y1) = vqaddq_s16(x02t,x13t); // x0 + x1f - x2 - x3f + *(y3) = vqsubq_s16(x02t,x13t); // x0 - x1f - x2 + x3f +} +#endif + +#if defined(__x86_64__) || defined(__i386__) static inline void ibfly4_16(__m128i *x0,__m128i *x1,__m128i *x2,__m128i *x3, __m128i *y0,__m128i *y1,__m128i *y2,__m128i *y3, __m128i *tw1,__m128i *tw2,__m128i *tw3, @@ -622,6 +1071,40 @@ static inline void ibfly4_16(__m128i *x0,__m128i *x1,__m128i *x2,__m128i *x3, } +#elif defined(__arm__) +static inline void ibfly4_16(int16x8_t *x0,int16x8_t *x1,int16x8_t *x2,int16x8_t *x3, + int16x8_t *y0,int16x8_t *y1,int16x8_t *y2,int16x8_t *y3, + int16x8_t *tw1,int16x8_t *tw2,int16x8_t *tw3, + int16x8_t *tw1b,int16x8_t *tw2b,int16x8_t *tw3b)__attribute__((always_inline)); + +static inline void ibfly4_16(int16x8_t *x0,int16x8_t *x1,int16x8_t *x2,int16x8_t *x3, + int16x8_t *y0,int16x8_t *y1,int16x8_t *y2,int16x8_t *y3, + int16x8_t *tw1,int16x8_t *tw2,int16x8_t *tw3, + int16x8_t *tw1b,int16x8_t *tw2b,int16x8_t *tw3b) +{ + + register int16x8_t 
x1t,x2t,x3t,x02t,x13t; + register int16x8_t x1_flip,x3_flip; + + x1t = packed_cmult2(*(x1),*(tw1),*(tw1b)); + x2t = packed_cmult2(*(x2),*(tw2),*(tw2b)); + x3t = packed_cmult2(*(x3),*(tw3),*(tw3b)); + + x02t = vqaddq_s16(*(x0),x2t); + x13t = vqaddq_s16(x1t,x3t); + *(y0) = vqaddq_s16(x02t,x13t); + *(y2) = vqsubq_s16(x02t,x13t); + x1_flip = vrev32q_s16(vmulq_s16(x1t,*(int16x8_t*)conjugatedft)); + x3_flip = vrev32q_s16(vmulq_s16(x3t,*(int16x8_t*)conjugatedft)); + x02t = vqsubq_s16(*(x0),x2t); + x13t = vqsubq_s16(x1_flip,x3_flip); + *(y3) = vqaddq_s16(x02t,x13t); // x0 - x1f - x2 + x3f + *(y1) = vqsubq_s16(x02t,x13t); // x0 + x1f - x2 - x3f +} + +#endif + +#if defined(__x86_64__) || defined(__i386__) static inline void bfly5(__m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3,__m128i *x4, __m128i *y0, __m128i *y1, __m128i *y2, __m128i *y3,__m128i *y4, __m128i *tw1,__m128i *tw2,__m128i *tw3,__m128i *tw4)__attribute__((always_inline)); @@ -670,10 +1153,64 @@ static inline void bfly5(__m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3,__m1 *(y4) = _mm_adds_epi16(*(x0),*(y4)); +} + +#elif defined(__arm__) +static inline void bfly5(int16x8_t *x0, int16x8_t *x1, int16x8_t *x2, int16x8_t *x3,int16x8_t *x4, + int16x8_t *y0, int16x8_t *y1, int16x8_t *y2, int16x8_t *y3,int16x8_t *y4, + int16x8_t *tw1,int16x8_t *tw2,int16x8_t *tw3,int16x8_t *tw4)__attribute__((always_inline)); + +static inline void bfly5(int16x8_t *x0, int16x8_t *x1, int16x8_t *x2, int16x8_t *x3,int16x8_t *x4, + int16x8_t *y0, int16x8_t *y1, int16x8_t *y2, int16x8_t *y3,int16x8_t *y4, + int16x8_t *tw1,int16x8_t *tw2,int16x8_t *tw3,int16x8_t *tw4) +{ + + + + int16x8_t x1_2,x2_2,x3_2,x4_2; + int32x4_t tmpre,tmpim; + + packed_cmult(*(x1),*(tw1),&x1_2); + packed_cmult(*(x2),*(tw2),&x2_2); + packed_cmult(*(x3),*(tw3),&x3_2); + packed_cmult(*(x4),*(tw4),&x4_2); + + *(y0) = vqaddq_s16(*(x0),vqaddq_s16(x1_2,vqaddq_s16(x2_2,vqaddq_s16(x3_2,x4_2)))); + cmult(x1_2,*(W15),&tmpre,&tmpim); + cmac(x2_2,*(W25),&tmpre,&tmpim); + cmac(x3_2,*(W35),&tmpre,&tmpim); + cmac(x4_2,*(W45),&tmpre,&tmpim); + *(y1) = cpack(tmpre,tmpim); + *(y1) = vqaddq_s16(*(x0),*(y1)); + + cmult(x1_2,*(W25),&tmpre,&tmpim); + cmac(x2_2,*(W45),&tmpre,&tmpim); + cmac(x3_2,*(W15),&tmpre,&tmpim); + cmac(x4_2,*(W35),&tmpre,&tmpim); + *(y2) = cpack(tmpre,tmpim); + *(y2) = vqaddq_s16(*(x0),*(y2)); + + cmult(x1_2,*(W35),&tmpre,&tmpim); + cmac(x2_2,*(W15),&tmpre,&tmpim); + cmac(x3_2,*(W45),&tmpre,&tmpim); + cmac(x4_2,*(W25),&tmpre,&tmpim); + *(y3) = cpack(tmpre,tmpim); + *(y3) = vqaddq_s16(*(x0),*(y3)); + + cmult(x1_2,*(W45),&tmpre,&tmpim); + cmac(x2_2,*(W35),&tmpre,&tmpim); + cmac(x3_2,*(W25),&tmpre,&tmpim); + cmac(x4_2,*(W15),&tmpre,&tmpim); + *(y4) = cpack(tmpre,tmpim); + *(y4) = vqaddq_s16(*(x0),*(y4)); + + } +#endif +#if defined(__x86_64__) || defined(__i386__) static inline void bfly5_tw1(__m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3,__m128i *x4, __m128i *y0, __m128i *y1, __m128i *y2, __m128i *y3,__m128i *y4) __attribute__((always_inline)); @@ -710,9 +1247,48 @@ static inline void bfly5_tw1(__m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, *(y4) = _mm_adds_epi16(*(x0),*(y4)); } +#elif defined(__arm__) +static inline void bfly5_tw1(int16x8_t *x0, int16x8_t *x1, int16x8_t *x2, int16x8_t *x3,int16x8_t *x4, + int16x8_t *y0, int16x8_t *y1, int16x8_t *y2, int16x8_t *y3,int16x8_t *y4) __attribute__((always_inline)); + +static inline void bfly5_tw1(int16x8_t *x0, int16x8_t *x1, int16x8_t *x2, int16x8_t *x3,int16x8_t *x4, + int16x8_t *y0, int16x8_t *y1, int16x8_t *y2, int16x8_t 
*y3,int16x8_t *y4) +{ + + int32x4_t tmpre,tmpim; + + *(y0) = vqaddq_s16(*(x0),vqaddq_s16(*(x1),vqaddq_s16(*(x2),vqaddq_s16(*(x3),*(x4))))); + cmult(*(x1),*(W15),&tmpre,&tmpim); + cmac(*(x2),*(W25),&tmpre,&tmpim); + cmac(*(x3),*(W35),&tmpre,&tmpim); + cmac(*(x4),*(W45),&tmpre,&tmpim); + *(y1) = cpack(tmpre,tmpim); + *(y1) = vqaddq_s16(*(x0),*(y1)); + cmult(*(x1),*(W25),&tmpre,&tmpim); + cmac(*(x2),*(W45),&tmpre,&tmpim); + cmac(*(x3),*(W15),&tmpre,&tmpim); + cmac(*(x4),*(W35),&tmpre,&tmpim); + *(y2) = cpack(tmpre,tmpim); + *(y2) = vqaddq_s16(*(x0),*(y2)); + cmult(*(x1),*(W35),&tmpre,&tmpim); + cmac(*(x2),*(W15),&tmpre,&tmpim); + cmac(*(x3),*(W45),&tmpre,&tmpim); + cmac(*(x4),*(W25),&tmpre,&tmpim); + *(y3) = cpack(tmpre,tmpim); + *(y3) = vqaddq_s16(*(x0),*(y3)); + cmult(*(x1),*(W45),&tmpre,&tmpim); + cmac(*(x2),*(W35),&tmpre,&tmpim); + cmac(*(x3),*(W25),&tmpre,&tmpim); + cmac(*(x4),*(W15),&tmpre,&tmpim); + *(y4) = cpack(tmpre,tmpim); + *(y4) = vqaddq_s16(*(x0),*(y4)); +} + +#endif // performs 4x4 transpose of input x (complex interleaved) using 128bit SIMD intrinsics // i.e. x = [x0r x0i x1r x1i ... x15r x15i], y = [x0r x0i x4r x4i x8r x8i x12r x12i x1r x1i x5r x5i x9r x9i x13r x13i x2r x2i ... x15r x15i] +#if defined(__x86_64__) || defined(__i386__) static inline void transpose16(__m128i *x,__m128i *y) __attribute__((always_inline)); static inline void transpose16(__m128i *x,__m128i *y) { @@ -728,7 +1304,24 @@ static inline void transpose16(__m128i *x,__m128i *y) y[3] = _mm_unpackhi_epi64(ytmp1,ytmp3); } +#elif defined(__arm__) +static inline void transpose16(int16x8_t *x,int16x8_t *y) __attribute__((always_inline)); +static inline void transpose16(int16x8_t *x,int16x8_t *y) +{ + register uint32x4x2_t ytmp0,ytmp1; + + ytmp0 = vtrnq_u32((uint32x4_t)(x[0]),(uint32x4_t)(x[1])); + ytmp1 = vtrnq_u32((uint32x4_t)(x[2]),(uint32x4_t)(x[3])); + + y[0] = vcombine_s16(vget_low_s16((int16x8_t)ytmp0.val[0]),vget_low_s16((int16x8_t)ytmp1.val[0])); + y[1] = vcombine_s16(vget_high_s16((int16x8_t)ytmp0.val[0]),vget_high_s16((int16x8_t)ytmp1.val[0])); + y[2] = vcombine_s16(vget_low_s16((int16x8_t)ytmp0.val[1]),vget_low_s16((int16x8_t)ytmp1.val[1])); + y[3] = vcombine_s16(vget_high_s16((int16x8_t)ytmp0.val[1]),vget_high_s16((int16x8_t)ytmp1.val[1])); +} + +# endif // same as above but output is offset by off +#if defined(__x86_64__) || defined(__i386__) static inline void transpose16_ooff(__m128i *x,__m128i *y,int off) __attribute__((always_inline)); static inline void transpose16_ooff(__m128i *x,__m128i *y,int off) @@ -749,12 +1342,47 @@ static inline void transpose16_ooff(__m128i *x,__m128i *y,int off) *y2 = _mm_unpackhi_epi64(ytmp1,ytmp3); } +#elif defined(__arm__) +static inline void transpose16_ooff(int16x8_t *x,int16x8_t *y,int off) __attribute__((always_inline)); + +static inline void transpose16_ooff(int16x8_t *x,int16x8_t *y,int off) +{ + int16x8_t *y2=y; + register uint32x4x2_t ytmp0,ytmp1; + + ytmp0 = vtrnq_u32((uint32x4_t)(x[0]),(uint32x4_t)(x[1])); + ytmp1 = vtrnq_u32((uint32x4_t)(x[2]),(uint32x4_t)(x[3])); + + *y2 = (int16x8_t)vcombine_s16(vget_low_s16((int16x8_t)ytmp0.val[0]),vget_low_s16((int16x8_t)ytmp1.val[0])); y2+=off; + *y2 = (int16x8_t)vcombine_s16(vget_low_s16((int16x8_t)ytmp0.val[1]),vget_low_s16((int16x8_t)ytmp1.val[1])); y2+=off; + *y2 = (int16x8_t)vcombine_s16(vget_high_s16((int16x8_t)ytmp0.val[0]),vget_high_s16((int16x8_t)ytmp1.val[0])); y2+=off; + *y2 = (int16x8_t)vcombine_s16(vget_high_s16((int16x8_t)ytmp0.val[1]),vget_high_s16((int16x8_t)ytmp1.val[1])); + + +} + +#endif + +#if 
defined(__x86_64__) || defined(__i386__) + static inline void transpose4_ooff(__m64 *x,__m64 *y,int off)__attribute__((always_inline)); static inline void transpose4_ooff(__m64 *x,__m64 *y,int off) { y[0] = _mm_unpacklo_pi32(x[0],x[1]); y[off] = _mm_unpackhi_pi32(x[0],x[1]); } +#elif (__arm__) + +static inline void transpose4_ooff(int16x4_t *x,int16x4_t *y,int off)__attribute__((always_inline)); +static inline void transpose4_ooff(int16x4_t *x,int16x4_t *y,int off) +{ + uint32x2x2_t ytmp = vtrn_u32((uint32x2_t)x[0],(uint32x2_t)x[1]); + + y[0] = (int16x4_t)ytmp.val[0]; + y[off] = (int16x4_t)ytmp.val[1]; +} + +#endif // 16-point optimized DFT kernel @@ -778,14 +1406,19 @@ int16_t tw16c[24] __attribute__((aligned(16))) = { 0,32767,12540,30272,23170,231 0,32767,30273,12539,23170,-23170,-12539,-30273 }; + + static inline void dft16(int16_t *x,int16_t *y) __attribute__((always_inline)); static inline void dft16(int16_t *x,int16_t *y) { +#if defined(__x86_64__) || defined(__i386__) + __m128i *tw16a_128=(__m128i *)tw16a,*tw16b_128=(__m128i *)tw16b,*x128=(__m128i *)x,*y128=(__m128i *)y; - /* + /* This is the original version before unrolling + bfly4_tw1(x128,x128+1,x128+2,x128+3, y128,y128+1,y128+2,y128+3); @@ -805,27 +1438,14 @@ static inline void dft16(int16_t *x,int16_t *y) x13t = _mm_adds_epi16(x128[1],x128[3]); xtmp0 = _mm_adds_epi16(x02t,x13t); xtmp2 = _mm_subs_epi16(x02t,x13t); - - /* - xtmp0 = _mm_adds_epi16(x128[0],_mm_adds_epi16(x128[1],_mm_adds_epi16(x128[2],x128[3]))); - xtmp2 = _mm_subs_epi16(x128[0],_mm_subs_epi16(x128[1],_mm_subs_epi16(x128[2],x128[3]))); - */ x1_flip = _mm_sign_epi16(x128[1],*(__m128i*)conjugatedft); - // x1_flip = _mm_shufflelo_epi16(x1_flip,_MM_SHUFFLE(2,3,0,1)); - // x1_flip = _mm_shufflehi_epi16(x1_flip,_MM_SHUFFLE(2,3,0,1)); x1_flip = _mm_shuffle_epi8(x1_flip,_mm_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2)); x3_flip = _mm_sign_epi16(x128[3],*(__m128i*)conjugatedft); - // x3_flip = _mm_shufflelo_epi16(x3_flip,_MM_SHUFFLE(2,3,0,1)); - // x3_flip = _mm_shufflehi_epi16(x3_flip,_MM_SHUFFLE(2,3,0,1)); x3_flip = _mm_shuffle_epi8(x3_flip,_mm_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2)); x02t = _mm_subs_epi16(x128[0],x128[2]); x13t = _mm_subs_epi16(x1_flip,x3_flip); xtmp1 = _mm_adds_epi16(x02t,x13t); // x0 + x1f - x2 - x3f xtmp3 = _mm_subs_epi16(x02t,x13t); // x0 - x1f - x2 + x3f - /* - xtmp1 = _mm_adds_epi16(x128[0],_mm_subs_epi16(x1_flip,_mm_adds_epi16(x128[2],x3_flip))); - xtmp3 = _mm_subs_epi16(x128[0],_mm_adds_epi16(x1_flip,_mm_subs_epi16(x128[2],x3_flip))); - */ ytmp0 = _mm_unpacklo_epi32(xtmp0,xtmp1); ytmp1 = _mm_unpackhi_epi32(xtmp0,xtmp1); @@ -845,28 +1465,84 @@ static inline void dft16(int16_t *x,int16_t *y) x13t = _mm_adds_epi16(xtmp1,xtmp3); y128[0] = _mm_adds_epi16(x02t,x13t); y128[2] = _mm_subs_epi16(x02t,x13t); - - /* - y128[0] = _mm_adds_epi16(xtmp0,_mm_adds_epi16(xtmp1,_mm_adds_epi16(xtmp2,xtmp3))); - y128[2] = _mm_subs_epi16(xtmp0,_mm_subs_epi16(xtmp1,_mm_subs_epi16(xtmp2,xtmp3))); - */ - x1_flip = _mm_sign_epi16(xtmp1,*(__m128i*)conjugatedft); - // x1_flip = _mm_shufflelo_epi16(x1_flip,_MM_SHUFFLE(2,3,0,1)); - // x1_flip = _mm_shufflehi_epi16(x1_flip,_MM_SHUFFLE(2,3,0,1)); x1_flip = _mm_shuffle_epi8(x1_flip,_mm_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2)); x3_flip = _mm_sign_epi16(xtmp3,*(__m128i*)conjugatedft); - // x3_flip = _mm_shufflelo_epi16(x3_flip,_MM_SHUFFLE(2,3,0,1)); - // x3_flip = _mm_shufflehi_epi16(x3_flip,_MM_SHUFFLE(2,3,0,1)); x3_flip = _mm_shuffle_epi8(x3_flip,_mm_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2)); x02t = 
_mm_subs_epi16(xtmp0,xtmp2); x13t = _mm_subs_epi16(x1_flip,x3_flip); y128[1] = _mm_adds_epi16(x02t,x13t); // x0 + x1f - x2 - x3f y128[3] = _mm_subs_epi16(x02t,x13t); // x0 - x1f - x2 + x3f - /* - y128[1] = _mm_adds_epi16(xtmp0,_mm_subs_epi16(x1_flip,_mm_adds_epi16(xtmp2,x3_flip))); - y128[3] = _mm_subs_epi16(xtmp0,_mm_adds_epi16(x1_flip,_mm_subs_epi16(xtmp2,x3_flip))); + + +#elif defined(__arm__) + + int16x8_t *tw16a_128=(int16x8_t *)tw16a,*tw16b_128=(int16x8_t *)tw16b,*x128=(int16x8_t *)x,*y128=(int16x8_t *)y; + + /* This is the original version before unrolling + + bfly4_tw1(x128,x128+1,x128+2,x128+3, + y128,y128+1,y128+2,y128+3); + + transpose16(y128,ytmp); + + bfly4_16(ytmp,ytmp+1,ytmp+2,ytmp+3, + y128,y128+1,y128+2,y128+3, + tw16_128,tw16_128+1,tw16_128+2); */ + + register int16x8_t x1_flip,x3_flip,x02t,x13t; + register int16x8_t xtmp0,xtmp1,xtmp2,xtmp3; + register uint32x4x2_t ytmp0,ytmp1; + register int16x8_t ytmp0b,ytmp1b,ytmp2b,ytmp3b; + + // First stage : 4 Radix-4 butterflies without input twiddles + + x02t = vqaddq_s16(x128[0],x128[2]); + x13t = vqaddq_s16(x128[1],x128[3]); + xtmp0 = vqaddq_s16(x02t,x13t); + xtmp2 = vqsubq_s16(x02t,x13t); + x1_flip = vrev32q_s16(vmulq_s16(x128[1],*(int16x8_t*)conjugatedft)); + x3_flip = vrev32q_s16(vmulq_s16(x128[3],*(int16x8_t*)conjugatedft)); + x02t = vqsubq_s16(x128[0],x128[2]); + x13t = vqsubq_s16(x1_flip,x3_flip); + xtmp1 = vqaddq_s16(x02t,x13t); // x0 + x1f - x2 - x3f + xtmp3 = vqsubq_s16(x02t,x13t); // x0 - x1f - x2 + x3f + + ytmp0 = vtrnq_u32((uint32x4_t)(xtmp0),(uint32x4_t)(xtmp1)); +// y0[0] = [x00 x10 x02 x12], y0[1] = [x01 x11 x03 x13] + ytmp1 = vtrnq_u32((uint32x4_t)(xtmp2),(uint32x4_t)(xtmp3)); +// y1[0] = [x20 x30 x22 x32], y1[1] = [x21 x31 x23 x33] + + + ytmp0b = vcombine_s16(vget_low_s16((int16x8_t)ytmp0.val[0]),vget_low_s16((int16x8_t)ytmp1.val[0])); +// y0 = [x00 x10 x20 x30] + ytmp1b = vcombine_s16(vget_low_s16((int16x8_t)ytmp0.val[1]),vget_low_s16((int16x8_t)ytmp1.val[1])); +// t1 = [x01 x11 x21 x31] + ytmp2b = vcombine_s16(vget_high_s16((int16x8_t)ytmp0.val[0]),vget_high_s16((int16x8_t)ytmp1.val[0])); +// t2 = [x02 x12 x22 x32] + ytmp3b = vcombine_s16(vget_high_s16((int16x8_t)ytmp0.val[1]),vget_high_s16((int16x8_t)ytmp1.val[1])); +// t3 = [x03 x13 x23 x33] + + + // Second stage : 4 Radix-4 butterflies with input twiddles + xtmp1 = packed_cmult2(ytmp1b,tw16a_128[0],tw16b_128[0]); + xtmp2 = packed_cmult2(ytmp2b,tw16a_128[1],tw16b_128[1]); + xtmp3 = packed_cmult2(ytmp3b,tw16a_128[2],tw16b_128[2]); + + x02t = vqaddq_s16(ytmp0b,xtmp2); + x13t = vqaddq_s16(xtmp1,xtmp3); + y128[0] = vqaddq_s16(x02t,x13t); + y128[2] = vqsubq_s16(x02t,x13t); + x1_flip = vrev32q_s16(vmulq_s16(xtmp1,*(int16x8_t*)conjugatedft)); + x3_flip = vrev32q_s16(vmulq_s16(xtmp3,*(int16x8_t*)conjugatedft)); + x02t = vqsubq_s16(ytmp0b,xtmp2); + x13t = vqsubq_s16(x1_flip,x3_flip); + y128[1] = vqaddq_s16(x02t,x13t); // x0 + x1f - x2 - x3f + y128[3] = vqsubq_s16(x02t,x13t); // x0 - x1f - x2 + x3f + + +#endif } static inline void idft16(int16_t *x,int16_t *y) __attribute__((always_inline)); @@ -874,6 +1550,7 @@ static inline void idft16(int16_t *x,int16_t *y) __attribute__((always_inline)); static inline void idft16(int16_t *x,int16_t *y) { +#if defined(__x86_64__) || defined(__i386__) __m128i *tw16a_128=(__m128i *)tw16,*tw16b_128=(__m128i *)tw16c,*x128=(__m128i *)x,*y128=(__m128i *)y; /* @@ -896,27 +1573,14 @@ static inline void idft16(int16_t *x,int16_t *y) x13t = _mm_adds_epi16(x128[1],x128[3]); xtmp0 = _mm_adds_epi16(x02t,x13t); xtmp2 = _mm_subs_epi16(x02t,x13t); 
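// Hedged note: dft16/idft16 unroll two stages of the radix-4 DIT identity.
// With S02 = x0+x2, S13 = x1+x3, D02 = x0-x2, D13 = x1-x3:
//   y0 = S02 + S13,   y2 = S02 - S13,
//   y1 = D02 - j*D13, y3 = D02 + j*D13   (y1/y3 swap for the inverse).
// Multiplying by -j is the (re,im)->(im,-re) swap that the sign/shuffle
// (SSE) or vmulq_s16/vrev32q_s16 (NEON) pair computes. A scalar reference
// sketch (function name and array layout are illustrative only):
#include <stdint.h>
static inline void radix4_fwd(const int32_t xr[4], const int32_t xi[4],
                              int32_t yr[4], int32_t yi[4])
{
  int32_t sr = xr[0]+xr[2], si = xi[0]+xi[2];  // S02
  int32_t tr = xr[1]+xr[3], ti = xi[1]+xi[3];  // S13
  int32_t dr = xr[0]-xr[2], di = xi[0]-xi[2];  // D02
  int32_t er = xr[1]-xr[3], ei = xi[1]-xi[3];  // D13
  yr[0] = sr+tr; yi[0] = si+ti;
  yr[2] = sr-tr; yi[2] = si-ti;
  yr[1] = dr+ei; yi[1] = di-er;                // D02 - j*D13
  yr[3] = dr-ei; yi[3] = di+er;                // D02 + j*D13
}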
- - /* - xtmp0 = _mm_adds_epi16(x128[0],_mm_adds_epi16(x128[1],_mm_adds_epi16(x128[2],x128[3]))); - xtmp2 = _mm_subs_epi16(x128[0],_mm_subs_epi16(x128[1],_mm_subs_epi16(x128[2],x128[3]))); - */ x1_flip = _mm_sign_epi16(x128[1],*(__m128i*)conjugatedft); - // x1_flip = _mm_shufflelo_epi16(x1_flip,_MM_SHUFFLE(2,3,0,1)); - // x1_flip = _mm_shufflehi_epi16(x1_flip,_MM_SHUFFLE(2,3,0,1)); x1_flip = _mm_shuffle_epi8(x1_flip,_mm_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2)); x3_flip = _mm_sign_epi16(x128[3],*(__m128i*)conjugatedft); - // x3_flip = _mm_shufflelo_epi16(x3_flip,_MM_SHUFFLE(2,3,0,1)); - // x3_flip = _mm_shufflehi_epi16(x3_flip,_MM_SHUFFLE(2,3,0,1)); x3_flip = _mm_shuffle_epi8(x3_flip,_mm_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2)); x02t = _mm_subs_epi16(x128[0],x128[2]); x13t = _mm_subs_epi16(x1_flip,x3_flip); xtmp3 = _mm_adds_epi16(x02t,x13t); // x0 + x1f - x2 - x3f xtmp1 = _mm_subs_epi16(x02t,x13t); // x0 - x1f - x2 + x3f - /* - xtmp1 = _mm_adds_epi16(x128[0],_mm_subs_epi16(x1_flip,_mm_adds_epi16(x128[2],x3_flip))); - xtmp3 = _mm_subs_epi16(x128[0],_mm_adds_epi16(x1_flip,_mm_subs_epi16(x128[2],x3_flip))); - */ ytmp0 = _mm_unpacklo_epi32(xtmp0,xtmp1); ytmp1 = _mm_unpackhi_epi32(xtmp0,xtmp1); @@ -936,53 +1600,84 @@ static inline void idft16(int16_t *x,int16_t *y) x13t = _mm_adds_epi16(xtmp1,xtmp3); y128[0] = _mm_adds_epi16(x02t,x13t); y128[2] = _mm_subs_epi16(x02t,x13t); - - /* - y128[0] = _mm_adds_epi16(xtmp0,_mm_adds_epi16(xtmp1,_mm_adds_epi16(xtmp2,xtmp3))); - y128[2] = _mm_subs_epi16(xtmp0,_mm_subs_epi16(xtmp1,_mm_subs_epi16(xtmp2,xtmp3))); - */ - x1_flip = _mm_sign_epi16(xtmp1,*(__m128i*)conjugatedft); - // x1_flip = _mm_shufflelo_epi16(x1_flip,_MM_SHUFFLE(2,3,0,1)); - // x1_flip = _mm_shufflehi_epi16(x1_flip,_MM_SHUFFLE(2,3,0,1)); x1_flip = _mm_shuffle_epi8(x1_flip,_mm_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2)); x3_flip = _mm_sign_epi16(xtmp3,*(__m128i*)conjugatedft); - // x3_flip = _mm_shufflelo_epi16(x3_flip,_MM_SHUFFLE(2,3,0,1)); - // x3_flip = _mm_shufflehi_epi16(x3_flip,_MM_SHUFFLE(2,3,0,1)); x3_flip = _mm_shuffle_epi8(x3_flip,_mm_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2)); x02t = _mm_subs_epi16(xtmp0,xtmp2); x13t = _mm_subs_epi16(x1_flip,x3_flip); y128[3] = _mm_adds_epi16(x02t,x13t); // x0 + x1f - x2 - x3f y128[1] = _mm_subs_epi16(x02t,x13t); // x0 - x1f - x2 + x3f - /* - y128[1] = _mm_adds_epi16(xtmp0,_mm_subs_epi16(x1_flip,_mm_adds_epi16(xtmp2,x3_flip))); - y128[3] = _mm_subs_epi16(xtmp0,_mm_adds_epi16(x1_flip,_mm_subs_epi16(xtmp2,x3_flip))); - */ -} - -/* -static inline void idft16(int16_t *x,int16_t *y)__attribute__((always_inline)); -static inline void idft16(int16_t *x,int16_t *y) { +#elif defined(__arm__) + int16x8_t *tw16a_128=(int16x8_t *)tw16,*tw16b_128=(int16x8_t *)tw16c,*x128=(int16x8_t *)x,*y128=(int16x8_t *)y; - __m128i ytmp[4],*tw16_128=(__m128i *)tw16,*x128=(__m128i *)x,*y128=(__m128i *)y; + /* This is the original version before unrolling - - ibfly4_tw1(x128,x128+1,x128+2,x128+3, + bfly4_tw1(x128,x128+1,x128+2,x128+3, y128,y128+1,y128+2,y128+3); transpose16(y128,ytmp); - ibfly4(ytmp,ytmp+1,ytmp+2,ytmp+3, - y128,y128+1,y128+2,y128+3, - tw16_128,tw16_128+1,tw16_128+2); + bfly4_16(ytmp,ytmp+1,ytmp+2,ytmp+3, + y128,y128+1,y128+2,y128+3, + tw16_128,tw16_128+1,tw16_128+2); + */ + + register int16x8_t x1_flip,x3_flip,x02t,x13t; + register int16x8_t xtmp0,xtmp1,xtmp2,xtmp3; + register uint32x4x2_t ytmp0,ytmp1; + register int16x8_t ytmp0b,ytmp1b,ytmp2b,ytmp3b; -} -*/ + // First stage : 4 Radix-4 butterflies without input twiddles + x02t = 
vqaddq_s16(x128[0],x128[2]); + x13t = vqaddq_s16(x128[1],x128[3]); + xtmp0 = vqaddq_s16(x02t,x13t); + xtmp2 = vqsubq_s16(x02t,x13t); + x1_flip = vrev32q_s16(vmulq_s16(x128[1],*(int16x8_t*)conjugatedft)); + x3_flip = vrev32q_s16(vmulq_s16(x128[3],*(int16x8_t*)conjugatedft)); + x02t = vqsubq_s16(x128[0],x128[2]); + x13t = vqsubq_s16(x1_flip,x3_flip); + xtmp3 = vqaddq_s16(x02t,x13t); // x0 + x1f - x2 - x3f + xtmp1 = vqsubq_s16(x02t,x13t); // x0 - x1f - x2 + x3f + + ytmp0 = vtrnq_u32((uint32x4_t)(xtmp0),(uint32x4_t)(xtmp1)); +// y0[0] = [x00 x10 x02 x12], y0[1] = [x01 x11 x03 x13] + ytmp1 = vtrnq_u32((uint32x4_t)(xtmp2),(uint32x4_t)(xtmp3)); +// y1[0] = [x20 x30 x22 x32], y1[1] = [x21 x31 x23 x33] + + + ytmp0b = vcombine_s16(vget_low_s16((int16x8_t)ytmp0.val[0]),vget_low_s16((int16x8_t)ytmp1.val[0])); +// y0 = [x00 x10 x20 x30] + ytmp1b = vcombine_s16(vget_low_s16((int16x8_t)ytmp0.val[1]),vget_low_s16((int16x8_t)ytmp1.val[1])); +// t1 = [x01 x11 x21 x31] + ytmp2b = vcombine_s16(vget_high_s16((int16x8_t)ytmp0.val[0]),vget_high_s16((int16x8_t)ytmp1.val[0])); +// t2 = [x02 x12 x22 x32] + ytmp3b = vcombine_s16(vget_high_s16((int16x8_t)ytmp0.val[1]),vget_high_s16((int16x8_t)ytmp1.val[1])); +// t3 = [x03 x13 x23 x33] + // Second stage : 4 Radix-4 butterflies with input twiddles + xtmp1 = packed_cmult2(ytmp1b,tw16a_128[0],tw16b_128[0]); + xtmp2 = packed_cmult2(ytmp2b,tw16a_128[1],tw16b_128[1]); + xtmp3 = packed_cmult2(ytmp3b,tw16a_128[2],tw16b_128[2]); + + x02t = vqaddq_s16(ytmp0b,xtmp2); + x13t = vqaddq_s16(xtmp1,xtmp3); + y128[0] = vqaddq_s16(x02t,x13t); + y128[2] = vqsubq_s16(x02t,x13t); + x1_flip = vrev32q_s16(vmulq_s16(xtmp1,*(int16x8_t*)conjugatedft)); + x3_flip = vrev32q_s16(vmulq_s16(xtmp3,*(int16x8_t*)conjugatedft)); + x02t = vqsubq_s16(ytmp0b,xtmp2); + x13t = vqsubq_s16(x1_flip,x3_flip); + y128[3] = vqaddq_s16(x02t,x13t); // x0 + x1f - x2 - x3f + y128[1] = vqsubq_s16(x02t,x13t); // x0 - x1f - x2 + x3f -// 64-point optimized DFT kernel +#endif +} + + +// 64-point optimized DFT int16_t tw64[96] __attribute__((aligned(16))) = { 32767,0,32609,-3212,32137,-6393,31356,-9512,30272,-12540,28897,-15447,27244,-18205,25329,-20788,23169,-23170,20787,-25330,18204,-27245,15446,-28898,12539,-30273,9511,-31357,6392,-32138,3211,-32610, 32767,0,32137,-6393,30272,-12540,27244,-18205,23169,-23170,18204,-27245,12539,-30273,6392,-32138,0,-32767,-6393,-32138,-12540,-30273,-18205,-27245,-23170,-23170,-27245,-18205,-30273,-12540,-32138,-6393, @@ -1005,11 +1700,26 @@ int16_t tw64c[96] __attribute__((aligned(16))) = { 0,32767,3212,32609,6393,32137 }; +#if defined(__x86_64__) || defined(__i386__) +#define simd_q15_t __m128i +#define simdshort_q15_t __m64 +#define shiftright_int16(a,shift) _mm_srai_epi16(a,shift) +#define set1_int16(a) _mm_set1_epi16(a); +#define mulhi_int16(a,b) _mm_slli_epi16(_mm_mulhi_epi16(a,b),1); +#elif defined(__arm__) +#define simd_q15_t int16x8_t +#define simdshort_q15_t int16x4_t +#define shiftright_int16(a,shift) vshrq_n_s16(a,shift) +#define set1_int16(a) vdupq_n_s16(a) +#define mulhi_int16(a,b) vqdmulhq_s16(a,b); +#define _mm_empty() +#define _m_empty() +#endif void dft64(int16_t *x,int16_t *y,int scale) { - __m128i xtmp[16],ytmp[16],*tw64a_128=(__m128i *)tw64a,*tw64b_128=(__m128i *)tw64b,*x128=(__m128i *)x,*y128=(__m128i *)y; + simd_q15_t xtmp[16],ytmp[16],*tw64a_128=(simd_q15_t *)tw64a,*tw64b_128=(simd_q15_t *)tw64b,*x128=(simd_q15_t *)x,*y128=(simd_q15_t *)y; #ifdef D64STATS @@ -1073,24 +1783,22 @@ void dft64(int16_t *x,int16_t *y,int scale) if (scale>0) { - - y128[0] = 
_mm_srai_epi16(y128[0],3); - y128[1] = _mm_srai_epi16(y128[1],3); - y128[2] = _mm_srai_epi16(y128[2],3); - y128[3] = _mm_srai_epi16(y128[3],3); - y128[4] = _mm_srai_epi16(y128[4],3); - y128[5] = _mm_srai_epi16(y128[5],3); - y128[6] = _mm_srai_epi16(y128[6],3); - y128[7] = _mm_srai_epi16(y128[7],3); - y128[8] = _mm_srai_epi16(y128[8],3); - y128[9] = _mm_srai_epi16(y128[9],3); - y128[10] = _mm_srai_epi16(y128[10],3); - y128[11] = _mm_srai_epi16(y128[11],3); - y128[12] = _mm_srai_epi16(y128[12],3); - y128[13] = _mm_srai_epi16(y128[13],3); - y128[14] = _mm_srai_epi16(y128[14],3); - y128[15] = _mm_srai_epi16(y128[15],3); - + y128[0] = shiftright_int16(y128[0],3); + y128[1] = shiftright_int16(y128[1],3); + y128[2] = shiftright_int16(y128[2],3); + y128[3] = shiftright_int16(y128[3],3); + y128[4] = shiftright_int16(y128[4],3); + y128[5] = shiftright_int16(y128[5],3); + y128[6] = shiftright_int16(y128[6],3); + y128[7] = shiftright_int16(y128[7],3); + y128[8] = shiftright_int16(y128[8],3); + y128[9] = shiftright_int16(y128[9],3); + y128[10] = shiftright_int16(y128[10],3); + y128[11] = shiftright_int16(y128[11],3); + y128[12] = shiftright_int16(y128[12],3); + y128[13] = shiftright_int16(y128[13],3); + y128[14] = shiftright_int16(y128[14],3); + y128[15] = shiftright_int16(y128[15],3); } _mm_empty(); @@ -1101,7 +1809,7 @@ void dft64(int16_t *x,int16_t *y,int scale) void idft64(int16_t *x,int16_t *y,int scale) { - __m128i xtmp[16],ytmp[16],*tw64a_128=(__m128i *)tw64,*tw64b_128=(__m128i *)tw64c,*x128=(__m128i *)x,*y128=(__m128i *)y; + simd_q15_t xtmp[16],ytmp[16],*tw64a_128=(simd_q15_t *)tw64,*tw64b_128=(simd_q15_t *)tw64c,*x128=(simd_q15_t *)x,*y128=(simd_q15_t *)y; #ifdef D64STATS @@ -1166,22 +1874,22 @@ void idft64(int16_t *x,int16_t *y,int scale) if (scale>0) { - y128[0] = _mm_srai_epi16(y128[0],3); - y128[1] = _mm_srai_epi16(y128[1],3); - y128[2] = _mm_srai_epi16(y128[2],3); - y128[3] = _mm_srai_epi16(y128[3],3); - y128[4] = _mm_srai_epi16(y128[4],3); - y128[5] = _mm_srai_epi16(y128[5],3); - y128[6] = _mm_srai_epi16(y128[6],3); - y128[7] = _mm_srai_epi16(y128[7],3); - y128[8] = _mm_srai_epi16(y128[8],3); - y128[9] = _mm_srai_epi16(y128[9],3); - y128[10] = _mm_srai_epi16(y128[10],3); - y128[11] = _mm_srai_epi16(y128[11],3); - y128[12] = _mm_srai_epi16(y128[12],3); - y128[13] = _mm_srai_epi16(y128[13],3); - y128[14] = _mm_srai_epi16(y128[14],3); - y128[15] = _mm_srai_epi16(y128[15],3); + y128[0] = shiftright_int16(y128[0],3); + y128[1] = shiftright_int16(y128[1],3); + y128[2] = shiftright_int16(y128[2],3); + y128[3] = shiftright_int16(y128[3],3); + y128[4] = shiftright_int16(y128[4],3); + y128[5] = shiftright_int16(y128[5],3); + y128[6] = shiftright_int16(y128[6],3); + y128[7] = shiftright_int16(y128[7],3); + y128[8] = shiftright_int16(y128[8],3); + y128[9] = shiftright_int16(y128[9],3); + y128[10] = shiftright_int16(y128[10],3); + y128[11] = shiftright_int16(y128[11],3); + y128[12] = shiftright_int16(y128[12],3); + y128[13] = shiftright_int16(y128[13],3); + y128[14] = shiftright_int16(y128[14],3); + y128[15] = shiftright_int16(y128[15],3); } @@ -1191,64 +1899,6 @@ void idft64(int16_t *x,int16_t *y,int scale) } -/* -void idft64(int16_t *x,int16_t *y,int scale) { - - __m128i xtmp[16],ytmp[16],*tw64_128=(__m128i *)tw64,*x128=(__m128i *)x,*y128=(__m128i *)y; - - transpose16_ooff(x128,xtmp,4); - transpose16_ooff(x128+4,xtmp+1,4); - transpose16_ooff(x128+8,xtmp+2,4); - transpose16_ooff(x128+12,xtmp+3,4); - - idft16((int16_t*)(xtmp),(int16_t*)ytmp); - idft16((int16_t*)(xtmp+4),(int16_t*)(ytmp+4)); - 
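// Hedged note on the scale>0 branch of dft64/idft64: shifting every Q1.15
// output right by 3 divides by 8 = sqrt(64), which keeps the fixed-point
// 64-point transform approximately unitary. Scalar sketch (scale_q15 is an
// illustrative name, not this file's API):
#include <stdint.h>
static inline void scale_q15(int16_t *y, int n_cpx, int shift)
{
  for (int i = 0; i < 2 * n_cpx; i++)  // interleaved re/im samples
    y[i] >>= shift;                    // shift = 3 for the 64-point DFT
}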
idft16((int16_t*)(xtmp+8),(int16_t*)(ytmp+8)); - idft16((int16_t*)(xtmp+12),(int16_t*)(ytmp+12)); - - - ibfly4(ytmp,ytmp+4,ytmp+8,ytmp+12, - y128,y128+4,y128+8,y128+12, - tw64_128,tw64_128+4,tw64_128+8); - - ibfly4(ytmp+1,ytmp+5,ytmp+9,ytmp+13, - y128+1,y128+5,y128+9,y128+13, - tw64_128+1,tw64_128+5,tw64_128+9); - - ibfly4(ytmp+2,ytmp+6,ytmp+10,ytmp+14, - y128+2,y128+6,y128+10,y128+14, - tw64_128+2,tw64_128+6,tw64_128+10); - - ibfly4(ytmp+3,ytmp+7,ytmp+11,ytmp+15, - y128+3,y128+7,y128+11,y128+15, - tw64_128+3,tw64_128+7,tw64_128+11); - - if (scale>0) { - - y128[0] = _mm_srai_epi16(y128[0],3); - y128[1] = _mm_srai_epi16(y128[1],3); - y128[2] = _mm_srai_epi16(y128[2],3); - y128[3] = _mm_srai_epi16(y128[3],3); - y128[4] = _mm_srai_epi16(y128[4],3); - y128[5] = _mm_srai_epi16(y128[5],3); - y128[6] = _mm_srai_epi16(y128[6],3); - y128[7] = _mm_srai_epi16(y128[7],3); - y128[8] = _mm_srai_epi16(y128[8],3); - y128[9] = _mm_srai_epi16(y128[9],3); - y128[10] = _mm_srai_epi16(y128[10],3); - y128[11] = _mm_srai_epi16(y128[11],3); - y128[12] = _mm_srai_epi16(y128[12],3); - y128[13] = _mm_srai_epi16(y128[13],3); - y128[14] = _mm_srai_epi16(y128[14],3); - y128[15] = _mm_srai_epi16(y128[15],3); - - } - _mm_empty(); - _m_empty(); - -} -*/ - int16_t tw128[128] __attribute__((aligned(16))) = { 32767,0,32727,-1608,32609,-3212,32412,-4808,32137,-6393,31785,-7962,31356,-9512,30851,-11039,30272,-12540,29621,-14010,28897,-15447,28105,-16846,27244,-18205,26318,-19520,25329,-20788,24278,-22005,23169,-23170,22004,-24279,20787,-25330,19519,-26319,18204,-27245,16845,-28106,15446,-28898,14009,-29622,12539,-30273,11038,-30852,9511,-31357,7961,-31786,6392,-32138,4807,-32413,3211,-32610,1607,-32728,0,-32767,-1608,-32728,-3212,-32610,-4808,-32413,-6393,-32138,-7962,-31786,-9512,-31357,-11039,-30852,-12540,-30273,-14010,-29622,-15447,-28898,-16846,-28106,-18205,-27245,-19520,-26319,-20788,-25330,-22005,-24279,-23170,-23170,-24279,-22005,-25330,-20788,-26319,-19520,-27245,-18205,-28106,-16846,-28898,-15447,-29622,-14010,-30273,-12540,-30852,-11039,-31357,-9512,-31786,-7962,-32138,-6393,-32413,-4808,-32610,-3212,-32728,-1608}; int16_t tw128a[128] __attribute__((aligned(16))) = { 32767,0,32727,1608,32609,3212,32412,4808,32137,6393,31785,7962,31356,9512,30851,11039,30272,12540,29621,14010,28897,15447,28105,16846,27244,18205,26318,19520,25329,20788,24278,22005,23169,23170,22004,24279,20787,25330,19519,26319,18204,27245,16845,28106,15446,28898,14009,29622,12539,30273,11038,30852,9511,31357,7961,31786,6392,32138,4807,32413,3211,32610,1607,32728,0,32767,-1608,32728,-3212,32610,-4808,32413,-6393,32138,-7962,31786,-9512,31357,-11039,30852,-12540,30273,-14010,29622,-15447,28898,-16846,28106,-18205,27245,-19520,26319,-20788,25330,-22005,24279,-23170,23170,-24279,22005,-25330,20788,-26319,19520,-27245,18205,-28106,16846,-28898,15447,-29622,14010,-30273,12540,-30852,11039,-31357,9512,-31786,7962,-32138,6393,-32413,4808,-32610,3212,-32728,1608}; @@ -1260,18 +1910,12 @@ int16_t tw128c[128] __attribute__((aligned(16))) = {0,32767,1608,32727,3212,3260 void dft128(int16_t *x,int16_t *y,int scale) { - __m64 xtmp[64],*x64 = (__m64 *)x; - __m128i ytmp[32],*tw128a_128p=(__m128i *)tw128a,*tw128b_128p=(__m128i *)tw128b,*y128=(__m128i *)y,*y128p=(__m128i *)y; - __m128i *ytmpp = &ytmp[0]; + simdshort_q15_t xtmp[64],*x64 = (simdshort_q15_t *)x; + simd_q15_t ytmp[32],*tw128a_128p=(simd_q15_t *)tw128a,*tw128b_128p=(simd_q15_t *)tw128b,*y128=(simd_q15_t *)y,*y128p=(simd_q15_t *)y; + simd_q15_t *ytmpp = &ytmp[0]; int i; - __m128i ONE_OVER_SQRT2_Q15_128 = 
_mm_set_epi16(ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15); + simd_q15_t ONE_OVER_SQRT2_Q15_128 = set1_int16(ONE_OVER_SQRT2_Q15); + transpose4_ooff(x64 ,xtmp,32); transpose4_ooff(x64+2,xtmp+1,32); @@ -1323,71 +1967,39 @@ void dft128(int16_t *x,int16_t *y,int scale) if (scale>0) { - y128[0] = _mm_mulhi_epi16(y128[0],ONE_OVER_SQRT2_Q15_128); - y128[0] = _mm_slli_epi16(y128[0],1); - y128[1] = _mm_mulhi_epi16(y128[1],ONE_OVER_SQRT2_Q15_128); - y128[1] = _mm_slli_epi16(y128[1],1); - y128[2] = _mm_mulhi_epi16(y128[2],ONE_OVER_SQRT2_Q15_128); - y128[2] = _mm_slli_epi16(y128[2],1); - y128[3] = _mm_mulhi_epi16(y128[3],ONE_OVER_SQRT2_Q15_128); - y128[3] = _mm_slli_epi16(y128[3],1); - y128[4] = _mm_mulhi_epi16(y128[4],ONE_OVER_SQRT2_Q15_128); - y128[4] = _mm_slli_epi16(y128[4],1); - y128[5] = _mm_mulhi_epi16(y128[5],ONE_OVER_SQRT2_Q15_128); - y128[5] = _mm_slli_epi16(y128[5],1); - y128[6] = _mm_mulhi_epi16(y128[6],ONE_OVER_SQRT2_Q15_128); - y128[6] = _mm_slli_epi16(y128[6],1); - y128[7] = _mm_mulhi_epi16(y128[7],ONE_OVER_SQRT2_Q15_128); - y128[7] = _mm_slli_epi16(y128[7],1); - y128[8] = _mm_mulhi_epi16(y128[8],ONE_OVER_SQRT2_Q15_128); - y128[8] = _mm_slli_epi16(y128[8],1); - y128[9] = _mm_mulhi_epi16(y128[9],ONE_OVER_SQRT2_Q15_128); - y128[9] = _mm_slli_epi16(y128[9],1); - y128[10] = _mm_mulhi_epi16(y128[10],ONE_OVER_SQRT2_Q15_128); - y128[10] = _mm_slli_epi16(y128[10],1); - y128[11] = _mm_mulhi_epi16(y128[11],ONE_OVER_SQRT2_Q15_128); - y128[11] = _mm_slli_epi16(y128[11],1); - y128[12] = _mm_mulhi_epi16(y128[12],ONE_OVER_SQRT2_Q15_128); - y128[12] = _mm_slli_epi16(y128[12],1); - y128[13] = _mm_mulhi_epi16(y128[13],ONE_OVER_SQRT2_Q15_128); - y128[13] = _mm_slli_epi16(y128[13],1); - y128[14] = _mm_mulhi_epi16(y128[14],ONE_OVER_SQRT2_Q15_128); - y128[14] = _mm_slli_epi16(y128[14],1); - y128[15] = _mm_mulhi_epi16(y128[15],ONE_OVER_SQRT2_Q15_128); - y128[15] = _mm_slli_epi16(y128[15],1); - - y128[16] = _mm_mulhi_epi16(y128[16],ONE_OVER_SQRT2_Q15_128); - y128[16] = _mm_slli_epi16(y128[16],1); - y128[17] = _mm_mulhi_epi16(y128[17],ONE_OVER_SQRT2_Q15_128); - y128[17] = _mm_slli_epi16(y128[17],1); - y128[18] = _mm_mulhi_epi16(y128[18],ONE_OVER_SQRT2_Q15_128); - y128[18] = _mm_slli_epi16(y128[18],1); - y128[19] = _mm_mulhi_epi16(y128[19],ONE_OVER_SQRT2_Q15_128); - y128[19] = _mm_slli_epi16(y128[19],1); - y128[20] = _mm_mulhi_epi16(y128[20],ONE_OVER_SQRT2_Q15_128); - y128[20] = _mm_slli_epi16(y128[20],1); - y128[21] = _mm_mulhi_epi16(y128[21],ONE_OVER_SQRT2_Q15_128); - y128[21] = _mm_slli_epi16(y128[21],1); - y128[22] = _mm_mulhi_epi16(y128[22],ONE_OVER_SQRT2_Q15_128); - y128[22] = _mm_slli_epi16(y128[22],1); - y128[23] = _mm_mulhi_epi16(y128[23],ONE_OVER_SQRT2_Q15_128); - y128[23] = _mm_slli_epi16(y128[23],1); - y128[24] = _mm_mulhi_epi16(y128[24],ONE_OVER_SQRT2_Q15_128); - y128[24] = _mm_slli_epi16(y128[24],1); - y128[25] = _mm_mulhi_epi16(y128[25],ONE_OVER_SQRT2_Q15_128); - y128[25] = _mm_slli_epi16(y128[25],1); - y128[26] = _mm_mulhi_epi16(y128[26],ONE_OVER_SQRT2_Q15_128); - y128[26] = _mm_slli_epi16(y128[26],1); - y128[27] = _mm_mulhi_epi16(y128[27],ONE_OVER_SQRT2_Q15_128); - y128[27] = _mm_slli_epi16(y128[27],1); - y128[28] = _mm_mulhi_epi16(y128[28],ONE_OVER_SQRT2_Q15_128); - y128[28] = _mm_slli_epi16(y128[28],1); - y128[29] = _mm_mulhi_epi16(y128[29],ONE_OVER_SQRT2_Q15_128); - y128[29] = _mm_slli_epi16(y128[29],1); - y128[30] = _mm_mulhi_epi16(y128[30],ONE_OVER_SQRT2_Q15_128); 
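// Hedged note on the mulhi_int16 idiom replacing the pairs above: a Q1.15
// product needs (a*b)>>15, but SSE only offers (a*b)>>16 via _mm_mulhi_epi16,
// hence the extra left shift by 1; NEON's vqdmulhq_s16 is the one-instruction
// saturating doubling-high multiply. Scalar equivalent (illustrative):
#include <stdint.h>
static inline int16_t q15_mul(int16_t a, int16_t b)
{
  return (int16_t)(((int32_t)a * b) >> 15);  // b = 23170 ~ 1/sqrt(2) here
}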
- y128[30] = _mm_slli_epi16(y128[30],1); - y128[31] = _mm_mulhi_epi16(y128[31],ONE_OVER_SQRT2_Q15_128); - y128[31] = _mm_slli_epi16(y128[31],1); + y128[0] = mulhi_int16(y128[0],ONE_OVER_SQRT2_Q15_128); + y128[1] = mulhi_int16(y128[1],ONE_OVER_SQRT2_Q15_128); + y128[2] = mulhi_int16(y128[2],ONE_OVER_SQRT2_Q15_128); + y128[3] = mulhi_int16(y128[3],ONE_OVER_SQRT2_Q15_128); + y128[4] = mulhi_int16(y128[4],ONE_OVER_SQRT2_Q15_128); + y128[5] = mulhi_int16(y128[5],ONE_OVER_SQRT2_Q15_128); + y128[6] = mulhi_int16(y128[6],ONE_OVER_SQRT2_Q15_128); + y128[7] = mulhi_int16(y128[7],ONE_OVER_SQRT2_Q15_128); + y128[8] = mulhi_int16(y128[8],ONE_OVER_SQRT2_Q15_128); + y128[9] = mulhi_int16(y128[9],ONE_OVER_SQRT2_Q15_128); + y128[10] = mulhi_int16(y128[10],ONE_OVER_SQRT2_Q15_128); + y128[11] = mulhi_int16(y128[11],ONE_OVER_SQRT2_Q15_128); + y128[12] = mulhi_int16(y128[12],ONE_OVER_SQRT2_Q15_128); + y128[13] = mulhi_int16(y128[13],ONE_OVER_SQRT2_Q15_128); + y128[14] = mulhi_int16(y128[14],ONE_OVER_SQRT2_Q15_128); + y128[15] = mulhi_int16(y128[15],ONE_OVER_SQRT2_Q15_128); + y128[16] = mulhi_int16(y128[16],ONE_OVER_SQRT2_Q15_128); + y128[17] = mulhi_int16(y128[17],ONE_OVER_SQRT2_Q15_128); + y128[18] = mulhi_int16(y128[18],ONE_OVER_SQRT2_Q15_128); + y128[19] = mulhi_int16(y128[19],ONE_OVER_SQRT2_Q15_128); + y128[20] = mulhi_int16(y128[20],ONE_OVER_SQRT2_Q15_128); + y128[21] = mulhi_int16(y128[21],ONE_OVER_SQRT2_Q15_128); + y128[22] = mulhi_int16(y128[22],ONE_OVER_SQRT2_Q15_128); + y128[23] = mulhi_int16(y128[23],ONE_OVER_SQRT2_Q15_128); + y128[24] = mulhi_int16(y128[24],ONE_OVER_SQRT2_Q15_128); + y128[25] = mulhi_int16(y128[25],ONE_OVER_SQRT2_Q15_128); + y128[26] = mulhi_int16(y128[26],ONE_OVER_SQRT2_Q15_128); + y128[27] = mulhi_int16(y128[27],ONE_OVER_SQRT2_Q15_128); + y128[28] = mulhi_int16(y128[28],ONE_OVER_SQRT2_Q15_128); + y128[29] = mulhi_int16(y128[29],ONE_OVER_SQRT2_Q15_128); + y128[30] = mulhi_int16(y128[30],ONE_OVER_SQRT2_Q15_128); + y128[31] = mulhi_int16(y128[31],ONE_OVER_SQRT2_Q15_128); + } @@ -1399,18 +2011,12 @@ void dft128(int16_t *x,int16_t *y,int scale) void idft128(int16_t *x,int16_t *y,int scale) { - __m64 xtmp[64],*x64 = (__m64 *)x; - __m128i ytmp[32],*tw128_128p=(__m128i *)tw128,*y128=(__m128i *)y,*y128p=(__m128i *)y; - __m128i *ytmpp = &ytmp[0]; + simdshort_q15_t xtmp[64],*x64 = (simdshort_q15_t *)x; + simd_q15_t ytmp[32],*tw128_128p=(simd_q15_t *)tw128,*y128=(simd_q15_t *)y,*y128p=(simd_q15_t *)y; + simd_q15_t *ytmpp = &ytmp[0]; int i; - __m128i ONE_OVER_SQRT2_Q15_128 = _mm_set_epi16(ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15); + simd_q15_t ONE_OVER_SQRT2_Q15_128 = set1_int16(ONE_OVER_SQRT2_Q15); + transpose4_ooff(x64 ,xtmp,32); transpose4_ooff(x64+2,xtmp+1,32); @@ -1460,71 +2066,38 @@ void idft128(int16_t *x,int16_t *y,int scale) if (scale>0) { - y128[0] = _mm_mulhi_epi16(y128[0],ONE_OVER_SQRT2_Q15_128); - y128[0] = _mm_slli_epi16(y128[0],1); - y128[1] = _mm_mulhi_epi16(y128[1],ONE_OVER_SQRT2_Q15_128); - y128[1] = _mm_slli_epi16(y128[1],1); - y128[2] = _mm_mulhi_epi16(y128[2],ONE_OVER_SQRT2_Q15_128); - y128[2] = _mm_slli_epi16(y128[2],1); - y128[3] = _mm_mulhi_epi16(y128[3],ONE_OVER_SQRT2_Q15_128); - y128[3] = _mm_slli_epi16(y128[3],1); - y128[4] = _mm_mulhi_epi16(y128[4],ONE_OVER_SQRT2_Q15_128); - y128[4] = _mm_slli_epi16(y128[4],1); - y128[5] = _mm_mulhi_epi16(y128[5],ONE_OVER_SQRT2_Q15_128); - y128[5] = _mm_slli_epi16(y128[5],1); - y128[6] = 
_mm_mulhi_epi16(y128[6],ONE_OVER_SQRT2_Q15_128); - y128[6] = _mm_slli_epi16(y128[6],1); - y128[7] = _mm_mulhi_epi16(y128[7],ONE_OVER_SQRT2_Q15_128); - y128[7] = _mm_slli_epi16(y128[7],1); - y128[8] = _mm_mulhi_epi16(y128[8],ONE_OVER_SQRT2_Q15_128); - y128[8] = _mm_slli_epi16(y128[8],1); - y128[9] = _mm_mulhi_epi16(y128[9],ONE_OVER_SQRT2_Q15_128); - y128[9] = _mm_slli_epi16(y128[9],1); - y128[10] = _mm_mulhi_epi16(y128[10],ONE_OVER_SQRT2_Q15_128); - y128[10] = _mm_slli_epi16(y128[10],1); - y128[11] = _mm_mulhi_epi16(y128[11],ONE_OVER_SQRT2_Q15_128); - y128[11] = _mm_slli_epi16(y128[11],1); - y128[12] = _mm_mulhi_epi16(y128[12],ONE_OVER_SQRT2_Q15_128); - y128[12] = _mm_slli_epi16(y128[12],1); - y128[13] = _mm_mulhi_epi16(y128[13],ONE_OVER_SQRT2_Q15_128); - y128[13] = _mm_slli_epi16(y128[13],1); - y128[14] = _mm_mulhi_epi16(y128[14],ONE_OVER_SQRT2_Q15_128); - y128[14] = _mm_slli_epi16(y128[14],1); - y128[15] = _mm_mulhi_epi16(y128[15],ONE_OVER_SQRT2_Q15_128); - y128[15] = _mm_slli_epi16(y128[15],1); - - y128[16] = _mm_mulhi_epi16(y128[16],ONE_OVER_SQRT2_Q15_128); - y128[16] = _mm_slli_epi16(y128[16],1); - y128[17] = _mm_mulhi_epi16(y128[17],ONE_OVER_SQRT2_Q15_128); - y128[17] = _mm_slli_epi16(y128[17],1); - y128[18] = _mm_mulhi_epi16(y128[18],ONE_OVER_SQRT2_Q15_128); - y128[18] = _mm_slli_epi16(y128[18],1); - y128[19] = _mm_mulhi_epi16(y128[19],ONE_OVER_SQRT2_Q15_128); - y128[19] = _mm_slli_epi16(y128[19],1); - y128[20] = _mm_mulhi_epi16(y128[20],ONE_OVER_SQRT2_Q15_128); - y128[20] = _mm_slli_epi16(y128[20],1); - y128[21] = _mm_mulhi_epi16(y128[21],ONE_OVER_SQRT2_Q15_128); - y128[21] = _mm_slli_epi16(y128[21],1); - y128[22] = _mm_mulhi_epi16(y128[22],ONE_OVER_SQRT2_Q15_128); - y128[22] = _mm_slli_epi16(y128[22],1); - y128[23] = _mm_mulhi_epi16(y128[23],ONE_OVER_SQRT2_Q15_128); - y128[23] = _mm_slli_epi16(y128[23],1); - y128[24] = _mm_mulhi_epi16(y128[24],ONE_OVER_SQRT2_Q15_128); - y128[24] = _mm_slli_epi16(y128[24],1); - y128[25] = _mm_mulhi_epi16(y128[25],ONE_OVER_SQRT2_Q15_128); - y128[25] = _mm_slli_epi16(y128[25],1); - y128[26] = _mm_mulhi_epi16(y128[26],ONE_OVER_SQRT2_Q15_128); - y128[26] = _mm_slli_epi16(y128[26],1); - y128[27] = _mm_mulhi_epi16(y128[27],ONE_OVER_SQRT2_Q15_128); - y128[27] = _mm_slli_epi16(y128[27],1); - y128[28] = _mm_mulhi_epi16(y128[28],ONE_OVER_SQRT2_Q15_128); - y128[28] = _mm_slli_epi16(y128[28],1); - y128[29] = _mm_mulhi_epi16(y128[29],ONE_OVER_SQRT2_Q15_128); - y128[29] = _mm_slli_epi16(y128[29],1); - y128[30] = _mm_mulhi_epi16(y128[30],ONE_OVER_SQRT2_Q15_128); - y128[30] = _mm_slli_epi16(y128[30],1); - y128[31] = _mm_mulhi_epi16(y128[31],ONE_OVER_SQRT2_Q15_128); - y128[31] = _mm_slli_epi16(y128[31],1); + y128[0] = mulhi_int16(y128[0],ONE_OVER_SQRT2_Q15_128); + y128[1] = mulhi_int16(y128[1],ONE_OVER_SQRT2_Q15_128); + y128[2] = mulhi_int16(y128[2],ONE_OVER_SQRT2_Q15_128); + y128[3] = mulhi_int16(y128[3],ONE_OVER_SQRT2_Q15_128); + y128[4] = mulhi_int16(y128[4],ONE_OVER_SQRT2_Q15_128); + y128[5] = mulhi_int16(y128[5],ONE_OVER_SQRT2_Q15_128); + y128[6] = mulhi_int16(y128[6],ONE_OVER_SQRT2_Q15_128); + y128[7] = mulhi_int16(y128[7],ONE_OVER_SQRT2_Q15_128); + y128[8] = mulhi_int16(y128[8],ONE_OVER_SQRT2_Q15_128); + y128[9] = mulhi_int16(y128[9],ONE_OVER_SQRT2_Q15_128); + y128[10] = mulhi_int16(y128[10],ONE_OVER_SQRT2_Q15_128); + y128[11] = mulhi_int16(y128[11],ONE_OVER_SQRT2_Q15_128); + y128[12] = mulhi_int16(y128[12],ONE_OVER_SQRT2_Q15_128); + y128[13] = mulhi_int16(y128[13],ONE_OVER_SQRT2_Q15_128); + y128[14] = mulhi_int16(y128[14],ONE_OVER_SQRT2_Q15_128); + y128[15] 
= mulhi_int16(y128[15],ONE_OVER_SQRT2_Q15_128); + y128[16] = mulhi_int16(y128[16],ONE_OVER_SQRT2_Q15_128); + y128[17] = mulhi_int16(y128[17],ONE_OVER_SQRT2_Q15_128); + y128[18] = mulhi_int16(y128[18],ONE_OVER_SQRT2_Q15_128); + y128[19] = mulhi_int16(y128[19],ONE_OVER_SQRT2_Q15_128); + y128[20] = mulhi_int16(y128[20],ONE_OVER_SQRT2_Q15_128); + y128[21] = mulhi_int16(y128[21],ONE_OVER_SQRT2_Q15_128); + y128[22] = mulhi_int16(y128[22],ONE_OVER_SQRT2_Q15_128); + y128[23] = mulhi_int16(y128[23],ONE_OVER_SQRT2_Q15_128); + y128[24] = mulhi_int16(y128[24],ONE_OVER_SQRT2_Q15_128); + y128[25] = mulhi_int16(y128[25],ONE_OVER_SQRT2_Q15_128); + y128[26] = mulhi_int16(y128[26],ONE_OVER_SQRT2_Q15_128); + y128[27] = mulhi_int16(y128[27],ONE_OVER_SQRT2_Q15_128); + y128[28] = mulhi_int16(y128[28],ONE_OVER_SQRT2_Q15_128); + y128[29] = mulhi_int16(y128[29],ONE_OVER_SQRT2_Q15_128); + y128[30] = mulhi_int16(y128[30],ONE_OVER_SQRT2_Q15_128); + y128[31] = mulhi_int16(y128[31],ONE_OVER_SQRT2_Q15_128); } @@ -1552,8 +2125,8 @@ int16_t tw256b[384] __attribute__((aligned(16))) = {0,32767,-805,32757,-1608,327 void dft256(int16_t *x,int16_t *y,int scale) { - __m128i xtmp[64],ytmp[64],*tw256a_128p=(__m128i *)tw256a,*tw256b_128p=(__m128i *)tw256b,*x128=(__m128i *)x,*y128=(__m128i *)y,*y128p=(__m128i *)y; - __m128i *ytmpp = &ytmp[0]; + simd_q15_t xtmp[64],ytmp[64],*tw256a_128p=(simd_q15_t *)tw256a,*tw256b_128p=(simd_q15_t *)tw256b,*x128=(simd_q15_t *)x,*y128=(simd_q15_t *)y,*y128p=(simd_q15_t *)y; + simd_q15_t *ytmpp = &ytmp[0]; int i; #ifdef D256STATS @@ -1632,22 +2205,22 @@ void dft256(int16_t *x,int16_t *y,int scale) if (scale>0) { for (i=0; i<4; i++) { - y128[0] = _mm_srai_epi16(y128[0],1); - y128[1] = _mm_srai_epi16(y128[1],1); - y128[2] = _mm_srai_epi16(y128[2],1); - y128[3] = _mm_srai_epi16(y128[3],1); - y128[4] = _mm_srai_epi16(y128[4],1); - y128[5] = _mm_srai_epi16(y128[5],1); - y128[6] = _mm_srai_epi16(y128[6],1); - y128[7] = _mm_srai_epi16(y128[7],1); - y128[8] = _mm_srai_epi16(y128[8],1); - y128[9] = _mm_srai_epi16(y128[9],1); - y128[10] = _mm_srai_epi16(y128[10],1); - y128[11] = _mm_srai_epi16(y128[11],1); - y128[12] = _mm_srai_epi16(y128[12],1); - y128[13] = _mm_srai_epi16(y128[13],1); - y128[14] = _mm_srai_epi16(y128[14],1); - y128[15] = _mm_srai_epi16(y128[15],1); + y128[0] = shiftright_int16(y128[0],1); + y128[1] = shiftright_int16(y128[1],1); + y128[2] = shiftright_int16(y128[2],1); + y128[3] = shiftright_int16(y128[3],1); + y128[4] = shiftright_int16(y128[4],1); + y128[5] = shiftright_int16(y128[5],1); + y128[6] = shiftright_int16(y128[6],1); + y128[7] = shiftright_int16(y128[7],1); + y128[8] = shiftright_int16(y128[8],1); + y128[9] = shiftright_int16(y128[9],1); + y128[10] = shiftright_int16(y128[10],1); + y128[11] = shiftright_int16(y128[11],1); + y128[12] = shiftright_int16(y128[12],1); + y128[13] = shiftright_int16(y128[13],1); + y128[14] = shiftright_int16(y128[14],1); + y128[15] = shiftright_int16(y128[15],1); y128+=16; } @@ -1664,8 +2237,8 @@ void dft256(int16_t *x,int16_t *y,int scale) void idft256(int16_t *x,int16_t *y,int scale) { - __m128i xtmp[64],ytmp[64],*tw256_128p=(__m128i *)tw256,*x128=(__m128i *)x,*y128=(__m128i *)y,*y128p=(__m128i *)y; - __m128i *ytmpp = &ytmp[0]; + simd_q15_t xtmp[64],ytmp[64],*tw256_128p=(simd_q15_t *)tw256,*x128=(simd_q15_t *)x,*y128=(simd_q15_t *)y,*y128p=(simd_q15_t *)y; + simd_q15_t *ytmpp = &ytmp[0]; int i,j; for (i=0,j=0; i<64; i+=4,j++) { @@ -1690,22 +2263,22 @@ void idft256(int16_t *x,int16_t *y,int scale) if (scale>0) { for (i=0; i<4; i++) { - y128[0] = 
_mm_srai_epi16(y128[0],1); - y128[1] = _mm_srai_epi16(y128[1],1); - y128[2] = _mm_srai_epi16(y128[2],1); - y128[3] = _mm_srai_epi16(y128[3],1); - y128[4] = _mm_srai_epi16(y128[4],1); - y128[5] = _mm_srai_epi16(y128[5],1); - y128[6] = _mm_srai_epi16(y128[6],1); - y128[7] = _mm_srai_epi16(y128[7],1); - y128[8] = _mm_srai_epi16(y128[8],1); - y128[9] = _mm_srai_epi16(y128[9],1); - y128[10] = _mm_srai_epi16(y128[10],1); - y128[11] = _mm_srai_epi16(y128[11],1); - y128[12] = _mm_srai_epi16(y128[12],1); - y128[13] = _mm_srai_epi16(y128[13],1); - y128[14] = _mm_srai_epi16(y128[14],1); - y128[15] = _mm_srai_epi16(y128[15],1); + y128[0] = shiftright_int16(y128[0],1); + y128[1] = shiftright_int16(y128[1],1); + y128[2] = shiftright_int16(y128[2],1); + y128[3] = shiftright_int16(y128[3],1); + y128[4] = shiftright_int16(y128[4],1); + y128[5] = shiftright_int16(y128[5],1); + y128[6] = shiftright_int16(y128[6],1); + y128[7] = shiftright_int16(y128[7],1); + y128[8] = shiftright_int16(y128[8],1); + y128[9] = shiftright_int16(y128[9],1); + y128[10] = shiftright_int16(y128[10],1); + y128[11] = shiftright_int16(y128[11],1); + y128[12] = shiftright_int16(y128[12],1); + y128[13] = shiftright_int16(y128[13],1); + y128[14] = shiftright_int16(y128[14],1); + y128[15] = shiftright_int16(y128[15],1); y128+=16; } @@ -1739,18 +2312,11 @@ int16_t tw512c[512] __attribute__((aligned(16))) = { void dft512(int16_t *x,int16_t *y,int scale) { - __m64 xtmp[256],*xtmpp,*x64 = (__m64 *)x; - __m128i ytmp[128],*tw512a_128p=(__m128i *)tw512a,*tw512b_128p=(__m128i *)tw512b,*y128=(__m128i *)y,*y128p=(__m128i *)y; - __m128i *ytmpp = &ytmp[0]; + simdshort_q15_t xtmp[256],*xtmpp,*x64 = (simdshort_q15_t *)x; + simd_q15_t ytmp[128],*tw512a_128p=(simd_q15_t *)tw512a,*tw512b_128p=(simd_q15_t *)tw512b,*y128=(simd_q15_t *)y,*y128p=(simd_q15_t *)y; + simd_q15_t *ytmpp = &ytmp[0]; int i; - __m128i ONE_OVER_SQRT2_Q15_128 = _mm_set_epi16(ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15); + simd_q15_t ONE_OVER_SQRT2_Q15_128 = set1_int16(ONE_OVER_SQRT2_Q15); xtmpp = xtmp; @@ -1838,38 +2404,22 @@ void dft512(int16_t *x,int16_t *y,int scale) y128p = y128; for (i=0; i<8; i++) { - y128p[0] = _mm_mulhi_epi16(y128p[0],ONE_OVER_SQRT2_Q15_128); - y128p[0] = _mm_slli_epi16(y128p[0],1); - y128p[1] = _mm_mulhi_epi16(y128p[1],ONE_OVER_SQRT2_Q15_128); - y128p[1] = _mm_slli_epi16(y128p[1],1); - y128p[2] = _mm_mulhi_epi16(y128p[2],ONE_OVER_SQRT2_Q15_128); - y128p[2] = _mm_slli_epi16(y128p[2],1); - y128p[3] = _mm_mulhi_epi16(y128p[3],ONE_OVER_SQRT2_Q15_128); - y128p[3] = _mm_slli_epi16(y128p[3],1); - y128p[4] = _mm_mulhi_epi16(y128p[4],ONE_OVER_SQRT2_Q15_128); - y128p[4] = _mm_slli_epi16(y128p[4],1); - y128p[5] = _mm_mulhi_epi16(y128p[5],ONE_OVER_SQRT2_Q15_128); - y128p[5] = _mm_slli_epi16(y128p[5],1); - y128p[6] = _mm_mulhi_epi16(y128p[6],ONE_OVER_SQRT2_Q15_128); - y128p[6] = _mm_slli_epi16(y128p[6],1); - y128p[7] = _mm_mulhi_epi16(y128p[7],ONE_OVER_SQRT2_Q15_128); - y128p[7] = _mm_slli_epi16(y128p[7],1); - y128p[8] = _mm_mulhi_epi16(y128p[8],ONE_OVER_SQRT2_Q15_128); - y128p[8] = _mm_slli_epi16(y128p[8],1); - y128p[9] = _mm_mulhi_epi16(y128p[9],ONE_OVER_SQRT2_Q15_128); - y128p[9] = _mm_slli_epi16(y128p[9],1); - y128p[10] = _mm_mulhi_epi16(y128p[10],ONE_OVER_SQRT2_Q15_128); - y128p[10] = _mm_slli_epi16(y128p[10],1); - y128p[11] = _mm_mulhi_epi16(y128p[11],ONE_OVER_SQRT2_Q15_128); - y128p[11] = _mm_slli_epi16(y128p[11],1); - y128p[12] = 
_mm_mulhi_epi16(y128p[12],ONE_OVER_SQRT2_Q15_128); - y128p[12] = _mm_slli_epi16(y128p[12],1); - y128p[13] = _mm_mulhi_epi16(y128p[13],ONE_OVER_SQRT2_Q15_128); - y128p[13] = _mm_slli_epi16(y128p[13],1); - y128p[14] = _mm_mulhi_epi16(y128p[14],ONE_OVER_SQRT2_Q15_128); - y128p[14] = _mm_slli_epi16(y128p[14],1); - y128p[15] = _mm_mulhi_epi16(y128p[15],ONE_OVER_SQRT2_Q15_128); - y128p[15] = _mm_slli_epi16(y128p[15],1); + y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT2_Q15_128); + y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT2_Q15_128); + y128p[2] = mulhi_int16(y128p[2],ONE_OVER_SQRT2_Q15_128); + y128p[3] = mulhi_int16(y128p[3],ONE_OVER_SQRT2_Q15_128); + y128p[4] = mulhi_int16(y128p[4],ONE_OVER_SQRT2_Q15_128); + y128p[5] = mulhi_int16(y128p[5],ONE_OVER_SQRT2_Q15_128); + y128p[6] = mulhi_int16(y128p[6],ONE_OVER_SQRT2_Q15_128); + y128p[7] = mulhi_int16(y128p[7],ONE_OVER_SQRT2_Q15_128); + y128p[8] = mulhi_int16(y128p[8],ONE_OVER_SQRT2_Q15_128); + y128p[9] = mulhi_int16(y128p[9],ONE_OVER_SQRT2_Q15_128); + y128p[10] = mulhi_int16(y128p[10],ONE_OVER_SQRT2_Q15_128); + y128p[11] = mulhi_int16(y128p[11],ONE_OVER_SQRT2_Q15_128); + y128p[12] = mulhi_int16(y128p[12],ONE_OVER_SQRT2_Q15_128); + y128p[13] = mulhi_int16(y128p[13],ONE_OVER_SQRT2_Q15_128); + y128p[14] = mulhi_int16(y128p[14],ONE_OVER_SQRT2_Q15_128); + y128p[15] = mulhi_int16(y128p[15],ONE_OVER_SQRT2_Q15_128); y128p+=16; } } @@ -1882,18 +2432,11 @@ void dft512(int16_t *x,int16_t *y,int scale) void idft512(int16_t *x,int16_t *y,int scale) { - __m64 xtmp[256],*xtmpp,*x64 = (__m64 *)x; - __m128i ytmp[128],*tw512_128p=(__m128i *)tw512,*y128=(__m128i *)y,*y128p=(__m128i *)y; - __m128i *ytmpp = &ytmp[0]; + simdshort_q15_t xtmp[256],*xtmpp,*x64 = (simdshort_q15_t *)x; + simd_q15_t ytmp[128],*tw512_128p=(simd_q15_t *)tw512,*y128=(simd_q15_t *)y,*y128p=(simd_q15_t *)y; + simd_q15_t *ytmpp = &ytmp[0]; int i; - __m128i ONE_OVER_SQRT2_Q15_128 = _mm_set_epi16(ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15); + simd_q15_t ONE_OVER_SQRT2_Q15_128 = set1_int16(ONE_OVER_SQRT2_Q15); xtmpp = xtmp; @@ -1951,38 +2494,22 @@ void idft512(int16_t *x,int16_t *y,int scale) y128p = y128; for (i=0; i<8; i++) { - y128p[0] = _mm_mulhi_epi16(y128p[0],ONE_OVER_SQRT2_Q15_128); - y128p[0] = _mm_slli_epi16(y128p[0],1); - y128p[1] = _mm_mulhi_epi16(y128p[1],ONE_OVER_SQRT2_Q15_128); - y128p[1] = _mm_slli_epi16(y128p[1],1); - y128p[2] = _mm_mulhi_epi16(y128p[2],ONE_OVER_SQRT2_Q15_128); - y128p[2] = _mm_slli_epi16(y128p[2],1); - y128p[3] = _mm_mulhi_epi16(y128p[3],ONE_OVER_SQRT2_Q15_128); - y128p[3] = _mm_slli_epi16(y128p[3],1); - y128p[4] = _mm_mulhi_epi16(y128p[4],ONE_OVER_SQRT2_Q15_128); - y128p[4] = _mm_slli_epi16(y128p[4],1); - y128p[5] = _mm_mulhi_epi16(y128p[5],ONE_OVER_SQRT2_Q15_128); - y128p[5] = _mm_slli_epi16(y128p[5],1); - y128p[6] = _mm_mulhi_epi16(y128p[6],ONE_OVER_SQRT2_Q15_128); - y128p[6] = _mm_slli_epi16(y128p[6],1); - y128p[7] = _mm_mulhi_epi16(y128p[7],ONE_OVER_SQRT2_Q15_128); - y128p[7] = _mm_slli_epi16(y128p[7],1); - y128p[8] = _mm_mulhi_epi16(y128p[8],ONE_OVER_SQRT2_Q15_128); - y128p[8] = _mm_slli_epi16(y128p[8],1); - y128p[9] = _mm_mulhi_epi16(y128p[9],ONE_OVER_SQRT2_Q15_128); - y128p[9] = _mm_slli_epi16(y128p[9],1); - y128p[10] = _mm_mulhi_epi16(y128p[10],ONE_OVER_SQRT2_Q15_128); - y128p[10] = _mm_slli_epi16(y128p[10],1); - y128p[11] = _mm_mulhi_epi16(y128p[11],ONE_OVER_SQRT2_Q15_128); - y128p[11] = _mm_slli_epi16(y128p[11],1); - y128p[12] 
= _mm_mulhi_epi16(y128p[12],ONE_OVER_SQRT2_Q15_128); - y128p[12] = _mm_slli_epi16(y128p[12],1); - y128p[13] = _mm_mulhi_epi16(y128p[13],ONE_OVER_SQRT2_Q15_128); - y128p[13] = _mm_slli_epi16(y128p[13],1); - y128p[14] = _mm_mulhi_epi16(y128p[14],ONE_OVER_SQRT2_Q15_128); - y128p[14] = _mm_slli_epi16(y128p[14],1); - y128p[15] = _mm_mulhi_epi16(y128p[15],ONE_OVER_SQRT2_Q15_128); - y128p[15] = _mm_slli_epi16(y128p[15],1); + y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT2_Q15_128); + y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT2_Q15_128); + y128p[2] = mulhi_int16(y128p[2],ONE_OVER_SQRT2_Q15_128); + y128p[3] = mulhi_int16(y128p[3],ONE_OVER_SQRT2_Q15_128); + y128p[4] = mulhi_int16(y128p[4],ONE_OVER_SQRT2_Q15_128); + y128p[5] = mulhi_int16(y128p[5],ONE_OVER_SQRT2_Q15_128); + y128p[6] = mulhi_int16(y128p[6],ONE_OVER_SQRT2_Q15_128); + y128p[7] = mulhi_int16(y128p[7],ONE_OVER_SQRT2_Q15_128); + y128p[8] = mulhi_int16(y128p[8],ONE_OVER_SQRT2_Q15_128); + y128p[9] = mulhi_int16(y128p[9],ONE_OVER_SQRT2_Q15_128); + y128p[10] = mulhi_int16(y128p[10],ONE_OVER_SQRT2_Q15_128); + y128p[11] = mulhi_int16(y128p[11],ONE_OVER_SQRT2_Q15_128); + y128p[12] = mulhi_int16(y128p[12],ONE_OVER_SQRT2_Q15_128); + y128p[13] = mulhi_int16(y128p[13],ONE_OVER_SQRT2_Q15_128); + y128p[14] = mulhi_int16(y128p[14],ONE_OVER_SQRT2_Q15_128); + y128p[15] = mulhi_int16(y128p[15],ONE_OVER_SQRT2_Q15_128); y128p+=16; } } @@ -2000,8 +2527,8 @@ int16_t tw1024[1536] __attribute__((aligned(16))) = { 32767,0,32766,-202,32764, void dft1024(int16_t *x,int16_t *y,int scale) { - __m128i xtmp[256],ytmp[256],*tw1024_128p=(__m128i *)tw1024,*x128=(__m128i *)x,*y128=(__m128i *)y,*y128p=(__m128i *)y; - __m128i *ytmpp = &ytmp[0]; + simd_q15_t xtmp[256],ytmp[256],*tw1024_128p=(simd_q15_t *)tw1024,*x128=(simd_q15_t *)x,*y128=(simd_q15_t *)y,*y128p=(simd_q15_t *)y; + simd_q15_t *ytmpp = &ytmp[0]; int i,j; for (i=0,j=0; i<256; i+=4,j++) { @@ -2026,22 +2553,22 @@ void dft1024(int16_t *x,int16_t *y,int scale) if (scale>0) { for (i=0; i<16; i++) { - y128[0] = _mm_srai_epi16(y128[0],1); - y128[1] = _mm_srai_epi16(y128[1],1); - y128[2] = _mm_srai_epi16(y128[2],1); - y128[3] = _mm_srai_epi16(y128[3],1); - y128[4] = _mm_srai_epi16(y128[4],1); - y128[5] = _mm_srai_epi16(y128[5],1); - y128[6] = _mm_srai_epi16(y128[6],1); - y128[7] = _mm_srai_epi16(y128[7],1); - y128[8] = _mm_srai_epi16(y128[8],1); - y128[9] = _mm_srai_epi16(y128[9],1); - y128[10] = _mm_srai_epi16(y128[10],1); - y128[11] = _mm_srai_epi16(y128[11],1); - y128[12] = _mm_srai_epi16(y128[12],1); - y128[13] = _mm_srai_epi16(y128[13],1); - y128[14] = _mm_srai_epi16(y128[14],1); - y128[15] = _mm_srai_epi16(y128[15],1); + y128[0] = shiftright_int16(y128[0],1); + y128[1] = shiftright_int16(y128[1],1); + y128[2] = shiftright_int16(y128[2],1); + y128[3] = shiftright_int16(y128[3],1); + y128[4] = shiftright_int16(y128[4],1); + y128[5] = shiftright_int16(y128[5],1); + y128[6] = shiftright_int16(y128[6],1); + y128[7] = shiftright_int16(y128[7],1); + y128[8] = shiftright_int16(y128[8],1); + y128[9] = shiftright_int16(y128[9],1); + y128[10] = shiftright_int16(y128[10],1); + y128[11] = shiftright_int16(y128[11],1); + y128[12] = shiftright_int16(y128[12],1); + y128[13] = shiftright_int16(y128[13],1); + y128[14] = shiftright_int16(y128[14],1); + y128[15] = shiftright_int16(y128[15],1); y128+=16; } @@ -2056,8 +2583,8 @@ void dft1024(int16_t *x,int16_t *y,int scale) void idft1024(int16_t *x,int16_t *y,int scale) { - __m128i xtmp[256],ytmp[256],*tw1024_128p=(__m128i *)tw1024,*x128=(__m128i *)x,*y128=(__m128i *)y,*y128p=(__m128i 
*)y; - __m128i *ytmpp = &ytmp[0]; + simd_q15_t xtmp[256],ytmp[256],*tw1024_128p=(simd_q15_t *)tw1024,*x128=(simd_q15_t *)x,*y128=(simd_q15_t *)y,*y128p=(simd_q15_t *)y; + simd_q15_t *ytmpp = &ytmp[0]; int i,j; for (i=0,j=0; i<256; i+=4,j++) { @@ -2082,22 +2609,22 @@ void idft1024(int16_t *x,int16_t *y,int scale) if (scale>0) { for (i=0; i<16; i++) { - y128[0] = _mm_srai_epi16(y128[0],1); - y128[1] = _mm_srai_epi16(y128[1],1); - y128[2] = _mm_srai_epi16(y128[2],1); - y128[3] = _mm_srai_epi16(y128[3],1); - y128[4] = _mm_srai_epi16(y128[4],1); - y128[5] = _mm_srai_epi16(y128[5],1); - y128[6] = _mm_srai_epi16(y128[6],1); - y128[7] = _mm_srai_epi16(y128[7],1); - y128[8] = _mm_srai_epi16(y128[8],1); - y128[9] = _mm_srai_epi16(y128[9],1); - y128[10] = _mm_srai_epi16(y128[10],1); - y128[11] = _mm_srai_epi16(y128[11],1); - y128[12] = _mm_srai_epi16(y128[12],1); - y128[13] = _mm_srai_epi16(y128[13],1); - y128[14] = _mm_srai_epi16(y128[14],1); - y128[15] = _mm_srai_epi16(y128[15],1); + y128[0] = shiftright_int16(y128[0],1); + y128[1] = shiftright_int16(y128[1],1); + y128[2] = shiftright_int16(y128[2],1); + y128[3] = shiftright_int16(y128[3],1); + y128[4] = shiftright_int16(y128[4],1); + y128[5] = shiftright_int16(y128[5],1); + y128[6] = shiftright_int16(y128[6],1); + y128[7] = shiftright_int16(y128[7],1); + y128[8] = shiftright_int16(y128[8],1); + y128[9] = shiftright_int16(y128[9],1); + y128[10] = shiftright_int16(y128[10],1); + y128[11] = shiftright_int16(y128[11],1); + y128[12] = shiftright_int16(y128[12],1); + y128[13] = shiftright_int16(y128[13],1); + y128[14] = shiftright_int16(y128[14],1); + y128[15] = shiftright_int16(y128[15],1); y128+=16; } @@ -2115,18 +2642,11 @@ int16_t tw2048[2048] __attribute__((aligned(16))) = {32767,0,32766,-101,32766,-2 void dft2048(int16_t *x,int16_t *y,int scale) { - __m64 xtmp[2048],*xtmpp,*x64 = (__m64 *)x; - __m128i ytmp[512],*tw2048_128p=(__m128i *)tw2048,*y128=(__m128i *)y,*y128p=(__m128i *)y; - __m128i *ytmpp = &ytmp[0]; + simdshort_q15_t xtmp[2048],*xtmpp,*x64 = (simdshort_q15_t *)x; + simd_q15_t ytmp[512],*tw2048_128p=(simd_q15_t *)tw2048,*y128=(simd_q15_t *)y,*y128p=(simd_q15_t *)y; + simd_q15_t *ytmpp = &ytmp[0]; int i; - __m128i ONE_OVER_SQRT2_Q15_128 = _mm_set_epi16(ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15); + simd_q15_t ONE_OVER_SQRT2_Q15_128 = set1_int16(ONE_OVER_SQRT2_Q15); xtmpp = xtmp; @@ -2184,38 +2704,22 @@ void dft2048(int16_t *x,int16_t *y,int scale) y128p = y128; for (i=0; i<32; i++) { - y128p[0] = _mm_mulhi_epi16(y128p[0],ONE_OVER_SQRT2_Q15_128); - y128p[0] = _mm_slli_epi16(y128p[0],1); - y128p[1] = _mm_mulhi_epi16(y128p[1],ONE_OVER_SQRT2_Q15_128); - y128p[1] = _mm_slli_epi16(y128p[1],1); - y128p[2] = _mm_mulhi_epi16(y128p[2],ONE_OVER_SQRT2_Q15_128); - y128p[2] = _mm_slli_epi16(y128p[2],1); - y128p[3] = _mm_mulhi_epi16(y128p[3],ONE_OVER_SQRT2_Q15_128); - y128p[3] = _mm_slli_epi16(y128p[3],1); - y128p[4] = _mm_mulhi_epi16(y128p[4],ONE_OVER_SQRT2_Q15_128); - y128p[4] = _mm_slli_epi16(y128p[4],1); - y128p[5] = _mm_mulhi_epi16(y128p[5],ONE_OVER_SQRT2_Q15_128); - y128p[5] = _mm_slli_epi16(y128p[5],1); - y128p[6] = _mm_mulhi_epi16(y128p[6],ONE_OVER_SQRT2_Q15_128); - y128p[6] = _mm_slli_epi16(y128p[6],1); - y128p[7] = _mm_mulhi_epi16(y128p[7],ONE_OVER_SQRT2_Q15_128); - y128p[7] = _mm_slli_epi16(y128p[7],1); - y128p[8] = _mm_mulhi_epi16(y128p[8],ONE_OVER_SQRT2_Q15_128); - y128p[8] = _mm_slli_epi16(y128p[8],1); - y128p[9] 
= _mm_mulhi_epi16(y128p[9],ONE_OVER_SQRT2_Q15_128); - y128p[9] = _mm_slli_epi16(y128p[9],1); - y128p[10] = _mm_mulhi_epi16(y128p[10],ONE_OVER_SQRT2_Q15_128); - y128p[10] = _mm_slli_epi16(y128p[10],1); - y128p[11] = _mm_mulhi_epi16(y128p[11],ONE_OVER_SQRT2_Q15_128); - y128p[11] = _mm_slli_epi16(y128p[11],1); - y128p[12] = _mm_mulhi_epi16(y128p[12],ONE_OVER_SQRT2_Q15_128); - y128p[12] = _mm_slli_epi16(y128p[12],1); - y128p[13] = _mm_mulhi_epi16(y128p[13],ONE_OVER_SQRT2_Q15_128); - y128p[13] = _mm_slli_epi16(y128p[13],1); - y128p[14] = _mm_mulhi_epi16(y128p[14],ONE_OVER_SQRT2_Q15_128); - y128p[14] = _mm_slli_epi16(y128p[14],1); - y128p[15] = _mm_mulhi_epi16(y128p[15],ONE_OVER_SQRT2_Q15_128); - y128p[15] = _mm_slli_epi16(y128p[15],1); + y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT2_Q15_128); + y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT2_Q15_128); + y128p[2] = mulhi_int16(y128p[2],ONE_OVER_SQRT2_Q15_128); + y128p[3] = mulhi_int16(y128p[3],ONE_OVER_SQRT2_Q15_128); + y128p[4] = mulhi_int16(y128p[4],ONE_OVER_SQRT2_Q15_128); + y128p[5] = mulhi_int16(y128p[5],ONE_OVER_SQRT2_Q15_128); + y128p[6] = mulhi_int16(y128p[6],ONE_OVER_SQRT2_Q15_128); + y128p[7] = mulhi_int16(y128p[7],ONE_OVER_SQRT2_Q15_128); + y128p[8] = mulhi_int16(y128p[8],ONE_OVER_SQRT2_Q15_128); + y128p[9] = mulhi_int16(y128p[9],ONE_OVER_SQRT2_Q15_128); + y128p[10] = mulhi_int16(y128p[10],ONE_OVER_SQRT2_Q15_128); + y128p[11] = mulhi_int16(y128p[11],ONE_OVER_SQRT2_Q15_128); + y128p[12] = mulhi_int16(y128p[12],ONE_OVER_SQRT2_Q15_128); + y128p[13] = mulhi_int16(y128p[13],ONE_OVER_SQRT2_Q15_128); + y128p[14] = mulhi_int16(y128p[14],ONE_OVER_SQRT2_Q15_128); + y128p[15] = mulhi_int16(y128p[15],ONE_OVER_SQRT2_Q15_128); y128p+=16; } } @@ -2228,18 +2732,11 @@ void dft2048(int16_t *x,int16_t *y,int scale) void idft2048(int16_t *x,int16_t *y,int scale) { - __m64 xtmp[2048],*xtmpp,*x64 = (__m64 *)x; - __m128i ytmp[512],*tw2048_128p=(__m128i *)tw2048,*y128=(__m128i *)y,*y128p=(__m128i *)y; - __m128i *ytmpp = &ytmp[0]; + simdshort_q15_t xtmp[2048],*xtmpp,*x64 = (simdshort_q15_t *)x; + simd_q15_t ytmp[512],*tw2048_128p=(simd_q15_t *)tw2048,*y128=(simd_q15_t *)y,*y128p=(simd_q15_t *)y; + simd_q15_t *ytmpp = &ytmp[0]; int i; - __m128i ONE_OVER_SQRT2_Q15_128 = _mm_set_epi16(ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15); + simd_q15_t ONE_OVER_SQRT2_Q15_128 = set1_int16(ONE_OVER_SQRT2_Q15); xtmpp = xtmp; @@ -2297,38 +2794,22 @@ void idft2048(int16_t *x,int16_t *y,int scale) y128p = y128; for (i=0; i<32; i++) { - y128p[0] = _mm_mulhi_epi16(y128p[0],ONE_OVER_SQRT2_Q15_128); - y128p[0] = _mm_slli_epi16(y128p[0],1); - y128p[1] = _mm_mulhi_epi16(y128p[1],ONE_OVER_SQRT2_Q15_128); - y128p[1] = _mm_slli_epi16(y128p[1],1); - y128p[2] = _mm_mulhi_epi16(y128p[2],ONE_OVER_SQRT2_Q15_128); - y128p[2] = _mm_slli_epi16(y128p[2],1); - y128p[3] = _mm_mulhi_epi16(y128p[3],ONE_OVER_SQRT2_Q15_128); - y128p[3] = _mm_slli_epi16(y128p[3],1); - y128p[4] = _mm_mulhi_epi16(y128p[4],ONE_OVER_SQRT2_Q15_128); - y128p[4] = _mm_slli_epi16(y128p[4],1); - y128p[5] = _mm_mulhi_epi16(y128p[5],ONE_OVER_SQRT2_Q15_128); - y128p[5] = _mm_slli_epi16(y128p[5],1); - y128p[6] = _mm_mulhi_epi16(y128p[6],ONE_OVER_SQRT2_Q15_128); - y128p[6] = _mm_slli_epi16(y128p[6],1); - y128p[7] = _mm_mulhi_epi16(y128p[7],ONE_OVER_SQRT2_Q15_128); - y128p[7] = _mm_slli_epi16(y128p[7],1); - y128p[8] = _mm_mulhi_epi16(y128p[8],ONE_OVER_SQRT2_Q15_128); - y128p[8] = _mm_slli_epi16(y128p[8],1); 
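/*
 * In every scale>0 block, the power-of-4 sizes (256/1024/4096) halve the
 * output with shiftright_int16(...,1), while the remaining sizes
 * (128/512/2048/8192) scale by 1/sqrt(2) through one Q15 multiply by
 * ONE_OVER_SQRT2_Q15. A standalone scalar model of that step -- the
 * constant's value and the (a*b)>>15 behaviour of the wrapper are
 * assumptions here, and mulhi_q15() is a hypothetical helper added only
 * for illustration:
 */
#include <stdio.h>
#include <stdint.h>

#define ONE_OVER_SQRT2_Q15 23170        /* round(32768/sqrt(2)) */

/* scalar stand-in for one 16-bit lane of mulhi_int16() */
static int16_t mulhi_q15(int16_t a, int16_t b)
{
    return (int16_t)(((int32_t)a * (int32_t)b) >> 15);
}

int main(void)
{
    int16_t x = 20000;
    /* prints 14141; the exact value 20000/sqrt(2) is ~14142.1 */
    printf("%d\n", mulhi_q15(x, ONE_OVER_SQRT2_Q15));
    return 0;
}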
- y128p[9] = _mm_mulhi_epi16(y128p[9],ONE_OVER_SQRT2_Q15_128); - y128p[9] = _mm_slli_epi16(y128p[9],1); - y128p[10] = _mm_mulhi_epi16(y128p[10],ONE_OVER_SQRT2_Q15_128); - y128p[10] = _mm_slli_epi16(y128p[10],1); - y128p[11] = _mm_mulhi_epi16(y128p[11],ONE_OVER_SQRT2_Q15_128); - y128p[11] = _mm_slli_epi16(y128p[11],1); - y128p[12] = _mm_mulhi_epi16(y128p[12],ONE_OVER_SQRT2_Q15_128); - y128p[12] = _mm_slli_epi16(y128p[12],1); - y128p[13] = _mm_mulhi_epi16(y128p[13],ONE_OVER_SQRT2_Q15_128); - y128p[13] = _mm_slli_epi16(y128p[13],1); - y128p[14] = _mm_mulhi_epi16(y128p[14],ONE_OVER_SQRT2_Q15_128); - y128p[14] = _mm_slli_epi16(y128p[14],1); - y128p[15] = _mm_mulhi_epi16(y128p[15],ONE_OVER_SQRT2_Q15_128); - y128p[15] = _mm_slli_epi16(y128p[15],1); + y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT2_Q15_128); + y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT2_Q15_128); + y128p[2] = mulhi_int16(y128p[2],ONE_OVER_SQRT2_Q15_128); + y128p[3] = mulhi_int16(y128p[3],ONE_OVER_SQRT2_Q15_128); + y128p[4] = mulhi_int16(y128p[4],ONE_OVER_SQRT2_Q15_128); + y128p[5] = mulhi_int16(y128p[5],ONE_OVER_SQRT2_Q15_128); + y128p[6] = mulhi_int16(y128p[6],ONE_OVER_SQRT2_Q15_128); + y128p[7] = mulhi_int16(y128p[7],ONE_OVER_SQRT2_Q15_128); + y128p[8] = mulhi_int16(y128p[8],ONE_OVER_SQRT2_Q15_128); + y128p[9] = mulhi_int16(y128p[9],ONE_OVER_SQRT2_Q15_128); + y128p[10] = mulhi_int16(y128p[10],ONE_OVER_SQRT2_Q15_128); + y128p[11] = mulhi_int16(y128p[11],ONE_OVER_SQRT2_Q15_128); + y128p[12] = mulhi_int16(y128p[12],ONE_OVER_SQRT2_Q15_128); + y128p[13] = mulhi_int16(y128p[13],ONE_OVER_SQRT2_Q15_128); + y128p[14] = mulhi_int16(y128p[14],ONE_OVER_SQRT2_Q15_128); + y128p[15] = mulhi_int16(y128p[15],ONE_OVER_SQRT2_Q15_128); y128p+=16; } } @@ -2343,8 +2824,8 @@ void idft2048(int16_t *x,int16_t *y,int scale) void dft4096(int16_t *x,int16_t *y,int scale) { - __m128i xtmp[4096],ytmp[4096],*tw4096_128p=(__m128i *)tw4096,*x128=(__m128i *)x,*y128=(__m128i *)y,*y128p=(__m128i *)y; - __m128i *ytmpp = &ytmp[0]; + simd_q15_t xtmp[4096],ytmp[4096],*tw4096_128p=(simd_q15_t *)tw4096,*x128=(simd_q15_t *)x,*y128=(simd_q15_t *)y,*y128p=(simd_q15_t *)y; + simd_q15_t *ytmpp = &ytmp[0]; int i,j; for (i=0,j=0; i<1024; i+=4,j++) { @@ -2369,22 +2850,22 @@ void dft4096(int16_t *x,int16_t *y,int scale) if (scale>0) { for (i=0; i<64; i++) { - y128[0] = _mm_srai_epi16(y128[0],1); - y128[1] = _mm_srai_epi16(y128[1],1); - y128[2] = _mm_srai_epi16(y128[2],1); - y128[3] = _mm_srai_epi16(y128[3],1); - y128[4] = _mm_srai_epi16(y128[4],1); - y128[5] = _mm_srai_epi16(y128[5],1); - y128[6] = _mm_srai_epi16(y128[6],1); - y128[7] = _mm_srai_epi16(y128[7],1); - y128[8] = _mm_srai_epi16(y128[8],1); - y128[9] = _mm_srai_epi16(y128[9],1); - y128[10] = _mm_srai_epi16(y128[10],1); - y128[11] = _mm_srai_epi16(y128[11],1); - y128[12] = _mm_srai_epi16(y128[12],1); - y128[13] = _mm_srai_epi16(y128[13],1); - y128[14] = _mm_srai_epi16(y128[14],1); - y128[15] = _mm_srai_epi16(y128[15],1); + y128[0] = shiftright_int16(y128[0],1); + y128[1] = shiftright_int16(y128[1],1); + y128[2] = shiftright_int16(y128[2],1); + y128[3] = shiftright_int16(y128[3],1); + y128[4] = shiftright_int16(y128[4],1); + y128[5] = shiftright_int16(y128[5],1); + y128[6] = shiftright_int16(y128[6],1); + y128[7] = shiftright_int16(y128[7],1); + y128[8] = shiftright_int16(y128[8],1); + y128[9] = shiftright_int16(y128[9],1); + y128[10] = shiftright_int16(y128[10],1); + y128[11] = shiftright_int16(y128[11],1); + y128[12] = shiftright_int16(y128[12],1); + y128[13] = shiftright_int16(y128[13],1); + y128[14] = 
shiftright_int16(y128[14],1); + y128[15] = shiftright_int16(y128[15],1); y128+=16; } @@ -2399,8 +2880,8 @@ void dft4096(int16_t *x,int16_t *y,int scale) void idft4096(int16_t *x,int16_t *y,int scale) { - __m128i xtmp[4096],ytmp[4096],*tw4096_128p=(__m128i *)tw4096,*x128=(__m128i *)x,*y128=(__m128i *)y,*y128p=(__m128i *)y; - __m128i *ytmpp = &ytmp[0]; + simd_q15_t xtmp[4096],ytmp[4096],*tw4096_128p=(simd_q15_t *)tw4096,*x128=(simd_q15_t *)x,*y128=(simd_q15_t *)y,*y128p=(simd_q15_t *)y; + simd_q15_t *ytmpp = &ytmp[0]; int i,j; for (i=0,j=0; i<1024; i+=4,j++) { @@ -2425,22 +2906,22 @@ void idft4096(int16_t *x,int16_t *y,int scale) if (scale>0) { for (i=0; i<64; i++) { - y128[0] = _mm_srai_epi16(y128[0],1); - y128[1] = _mm_srai_epi16(y128[1],1); - y128[2] = _mm_srai_epi16(y128[2],1); - y128[3] = _mm_srai_epi16(y128[3],1); - y128[4] = _mm_srai_epi16(y128[4],1); - y128[5] = _mm_srai_epi16(y128[5],1); - y128[6] = _mm_srai_epi16(y128[6],1); - y128[7] = _mm_srai_epi16(y128[7],1); - y128[8] = _mm_srai_epi16(y128[8],1); - y128[9] = _mm_srai_epi16(y128[9],1); - y128[10] = _mm_srai_epi16(y128[10],1); - y128[11] = _mm_srai_epi16(y128[11],1); - y128[12] = _mm_srai_epi16(y128[12],1); - y128[13] = _mm_srai_epi16(y128[13],1); - y128[14] = _mm_srai_epi16(y128[14],1); - y128[15] = _mm_srai_epi16(y128[15],1); + y128[0] = shiftright_int16(y128[0],1); + y128[1] = shiftright_int16(y128[1],1); + y128[2] = shiftright_int16(y128[2],1); + y128[3] = shiftright_int16(y128[3],1); + y128[4] = shiftright_int16(y128[4],1); + y128[5] = shiftright_int16(y128[5],1); + y128[6] = shiftright_int16(y128[6],1); + y128[7] = shiftright_int16(y128[7],1); + y128[8] = shiftright_int16(y128[8],1); + y128[9] = shiftright_int16(y128[9],1); + y128[10] = shiftright_int16(y128[10],1); + y128[11] = shiftright_int16(y128[11],1); + y128[12] = shiftright_int16(y128[12],1); + y128[13] = shiftright_int16(y128[13],1); + y128[14] = shiftright_int16(y128[14],1); + y128[15] = shiftright_int16(y128[15],1); y128+=16; } @@ -2468,18 +2949,11 @@ static int16_t tw8192[4096*2] = {32767,0,32766,-26,32766,-51,32766,-76,32766,-10 void dft8192(int16_t *x,int16_t *y,int scale) { - __m64 xtmp[4096],*xtmpp,*x64 = (__m64 *)x; - __m128i ytmp[1024],*tw8192_128p=(__m128i *)tw8192,*y128=(__m128i *)y,*y128p=(__m128i *)y; - __m128i *ytmpp = &ytmp[0]; + simdshort_q15_t xtmp[4096],*xtmpp,*x64 = (simdshort_q15_t *)x; + simd_q15_t ytmp[1024],*tw8192_128p=(simd_q15_t *)tw8192,*y128=(simd_q15_t *)y,*y128p=(simd_q15_t *)y; + simd_q15_t *ytmpp = &ytmp[0]; int i; - __m128i ONE_OVER_SQRT2_Q15_128 = _mm_set_epi16(ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15); + simd_q15_t ONE_OVER_SQRT2_Q15_128 = set1_int16(ONE_OVER_SQRT2_Q15); xtmpp = xtmp; @@ -2537,38 +3011,22 @@ void dft8192(int16_t *x,int16_t *y,int scale) y128p = y128; for (i=0; i<128; i++) { - y128p[0] = _mm_mulhi_epi16(y128p[0],ONE_OVER_SQRT2_Q15_128); - y128p[0] = _mm_slli_epi16(y128p[0],1); - y128p[1] = _mm_mulhi_epi16(y128p[1],ONE_OVER_SQRT2_Q15_128); - y128p[1] = _mm_slli_epi16(y128p[1],1); - y128p[2] = _mm_mulhi_epi16(y128p[2],ONE_OVER_SQRT2_Q15_128); - y128p[2] = _mm_slli_epi16(y128p[2],1); - y128p[3] = _mm_mulhi_epi16(y128p[3],ONE_OVER_SQRT2_Q15_128); - y128p[3] = _mm_slli_epi16(y128p[3],1); - y128p[4] = _mm_mulhi_epi16(y128p[4],ONE_OVER_SQRT2_Q15_128); - y128p[4] = _mm_slli_epi16(y128p[4],1); - y128p[5] = _mm_mulhi_epi16(y128p[5],ONE_OVER_SQRT2_Q15_128); - y128p[5] = 
_mm_slli_epi16(y128p[5],1); - y128p[6] = _mm_mulhi_epi16(y128p[6],ONE_OVER_SQRT2_Q15_128); - y128p[6] = _mm_slli_epi16(y128p[6],1); - y128p[7] = _mm_mulhi_epi16(y128p[7],ONE_OVER_SQRT2_Q15_128); - y128p[7] = _mm_slli_epi16(y128p[7],1); - y128p[8] = _mm_mulhi_epi16(y128p[8],ONE_OVER_SQRT2_Q15_128); - y128p[8] = _mm_slli_epi16(y128p[8],1); - y128p[9] = _mm_mulhi_epi16(y128p[9],ONE_OVER_SQRT2_Q15_128); - y128p[9] = _mm_slli_epi16(y128p[9],1); - y128p[10] = _mm_mulhi_epi16(y128p[10],ONE_OVER_SQRT2_Q15_128); - y128p[10] = _mm_slli_epi16(y128p[10],1); - y128p[11] = _mm_mulhi_epi16(y128p[11],ONE_OVER_SQRT2_Q15_128); - y128p[11] = _mm_slli_epi16(y128p[11],1); - y128p[12] = _mm_mulhi_epi16(y128p[12],ONE_OVER_SQRT2_Q15_128); - y128p[12] = _mm_slli_epi16(y128p[12],1); - y128p[13] = _mm_mulhi_epi16(y128p[13],ONE_OVER_SQRT2_Q15_128); - y128p[13] = _mm_slli_epi16(y128p[13],1); - y128p[14] = _mm_mulhi_epi16(y128p[14],ONE_OVER_SQRT2_Q15_128); - y128p[14] = _mm_slli_epi16(y128p[14],1); - y128p[15] = _mm_mulhi_epi16(y128p[15],ONE_OVER_SQRT2_Q15_128); - y128p[15] = _mm_slli_epi16(y128p[15],1); + y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT2_Q15_128); + y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT2_Q15_128); + y128p[2] = mulhi_int16(y128p[2],ONE_OVER_SQRT2_Q15_128); + y128p[3] = mulhi_int16(y128p[3],ONE_OVER_SQRT2_Q15_128); + y128p[4] = mulhi_int16(y128p[4],ONE_OVER_SQRT2_Q15_128); + y128p[5] = mulhi_int16(y128p[5],ONE_OVER_SQRT2_Q15_128); + y128p[6] = mulhi_int16(y128p[6],ONE_OVER_SQRT2_Q15_128); + y128p[7] = mulhi_int16(y128p[7],ONE_OVER_SQRT2_Q15_128); + y128p[8] = mulhi_int16(y128p[8],ONE_OVER_SQRT2_Q15_128); + y128p[9] = mulhi_int16(y128p[9],ONE_OVER_SQRT2_Q15_128); + y128p[10] = mulhi_int16(y128p[10],ONE_OVER_SQRT2_Q15_128); + y128p[11] = mulhi_int16(y128p[11],ONE_OVER_SQRT2_Q15_128); + y128p[12] = mulhi_int16(y128p[12],ONE_OVER_SQRT2_Q15_128); + y128p[13] = mulhi_int16(y128p[13],ONE_OVER_SQRT2_Q15_128); + y128p[14] = mulhi_int16(y128p[14],ONE_OVER_SQRT2_Q15_128); + y128p[15] = mulhi_int16(y128p[15],ONE_OVER_SQRT2_Q15_128); y128p+=16; } } @@ -2581,18 +3039,11 @@ void dft8192(int16_t *x,int16_t *y,int scale) void idft8192(int16_t *x,int16_t *y,int scale) { - __m64 xtmp[4096],*xtmpp,*x64 = (__m64 *)x; - __m128i ytmp[2048],*tw8192_128p=(__m128i *)tw8192,*y128=(__m128i *)y,*y128p=(__m128i *)y; - __m128i *ytmpp = &ytmp[0]; + simdshort_q15_t xtmp[4096],*xtmpp,*x64 = (simdshort_q15_t *)x; + simd_q15_t ytmp[2048],*tw8192_128p=(simd_q15_t *)tw8192,*y128=(simd_q15_t *)y,*y128p=(simd_q15_t *)y; + simd_q15_t *ytmpp = &ytmp[0]; int i; - __m128i ONE_OVER_SQRT2_Q15_128 = _mm_set_epi16(ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15); + simd_q15_t ONE_OVER_SQRT2_Q15_128 = set1_int16(ONE_OVER_SQRT2_Q15); xtmpp = xtmp; @@ -2650,38 +3101,22 @@ void idft8192(int16_t *x,int16_t *y,int scale) y128p = y128; for (i=0; i<128; i++) { - y128p[0] = _mm_mulhi_epi16(y128p[0],ONE_OVER_SQRT2_Q15_128); - y128p[0] = _mm_slli_epi16(y128p[0],1); - y128p[1] = _mm_mulhi_epi16(y128p[1],ONE_OVER_SQRT2_Q15_128); - y128p[1] = _mm_slli_epi16(y128p[1],1); - y128p[2] = _mm_mulhi_epi16(y128p[2],ONE_OVER_SQRT2_Q15_128); - y128p[2] = _mm_slli_epi16(y128p[2],1); - y128p[3] = _mm_mulhi_epi16(y128p[3],ONE_OVER_SQRT2_Q15_128); - y128p[3] = _mm_slli_epi16(y128p[3],1); - y128p[4] = _mm_mulhi_epi16(y128p[4],ONE_OVER_SQRT2_Q15_128); - y128p[4] = _mm_slli_epi16(y128p[4],1); - y128p[5] = 
_mm_mulhi_epi16(y128p[5],ONE_OVER_SQRT2_Q15_128); - y128p[5] = _mm_slli_epi16(y128p[5],1); - y128p[6] = _mm_mulhi_epi16(y128p[6],ONE_OVER_SQRT2_Q15_128); - y128p[6] = _mm_slli_epi16(y128p[6],1); - y128p[7] = _mm_mulhi_epi16(y128p[7],ONE_OVER_SQRT2_Q15_128); - y128p[7] = _mm_slli_epi16(y128p[7],1); - y128p[8] = _mm_mulhi_epi16(y128p[8],ONE_OVER_SQRT2_Q15_128); - y128p[8] = _mm_slli_epi16(y128p[8],1); - y128p[9] = _mm_mulhi_epi16(y128p[9],ONE_OVER_SQRT2_Q15_128); - y128p[9] = _mm_slli_epi16(y128p[9],1); - y128p[10] = _mm_mulhi_epi16(y128p[10],ONE_OVER_SQRT2_Q15_128); - y128p[10] = _mm_slli_epi16(y128p[10],1); - y128p[11] = _mm_mulhi_epi16(y128p[11],ONE_OVER_SQRT2_Q15_128); - y128p[11] = _mm_slli_epi16(y128p[11],1); - y128p[12] = _mm_mulhi_epi16(y128p[12],ONE_OVER_SQRT2_Q15_128); - y128p[12] = _mm_slli_epi16(y128p[12],1); - y128p[13] = _mm_mulhi_epi16(y128p[13],ONE_OVER_SQRT2_Q15_128); - y128p[13] = _mm_slli_epi16(y128p[13],1); - y128p[14] = _mm_mulhi_epi16(y128p[14],ONE_OVER_SQRT2_Q15_128); - y128p[14] = _mm_slli_epi16(y128p[14],1); - y128p[15] = _mm_mulhi_epi16(y128p[15],ONE_OVER_SQRT2_Q15_128); - y128p[15] = _mm_slli_epi16(y128p[15],1); + y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT2_Q15_128); + y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT2_Q15_128); + y128p[2] = mulhi_int16(y128p[2],ONE_OVER_SQRT2_Q15_128); + y128p[3] = mulhi_int16(y128p[3],ONE_OVER_SQRT2_Q15_128); + y128p[4] = mulhi_int16(y128p[4],ONE_OVER_SQRT2_Q15_128); + y128p[5] = mulhi_int16(y128p[5],ONE_OVER_SQRT2_Q15_128); + y128p[6] = mulhi_int16(y128p[6],ONE_OVER_SQRT2_Q15_128); + y128p[7] = mulhi_int16(y128p[7],ONE_OVER_SQRT2_Q15_128); + y128p[8] = mulhi_int16(y128p[8],ONE_OVER_SQRT2_Q15_128); + y128p[9] = mulhi_int16(y128p[9],ONE_OVER_SQRT2_Q15_128); + y128p[10] = mulhi_int16(y128p[10],ONE_OVER_SQRT2_Q15_128); + y128p[11] = mulhi_int16(y128p[11],ONE_OVER_SQRT2_Q15_128); + y128p[12] = mulhi_int16(y128p[12],ONE_OVER_SQRT2_Q15_128); + y128p[13] = mulhi_int16(y128p[13],ONE_OVER_SQRT2_Q15_128); + y128p[14] = mulhi_int16(y128p[14],ONE_OVER_SQRT2_Q15_128); + y128p[15] = mulhi_int16(y128p[15],ONE_OVER_SQRT2_Q15_128); y128p+=16; } } @@ -2721,9 +3156,9 @@ void idft1536(int16_t *input, int16_t *output) // write_output("out2.m","o2",tmpo[2],2048,1,1); for (i=0,i2=0; i<1024; i+=8,i2+=4) { - ibfly3((__m128i*)(&tmpo[0][i2]),(__m128i*)(&tmpo[1][i2]),((__m128i*)&tmpo[2][i2]), - (__m128i*)(output+i),(__m128i*)(output+1024+i),(__m128i*)(output+2048+i), - (__m128i*)(twa1536+i),(__m128i*)(twb1536+i)); + ibfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),((simd_q15_t*)&tmpo[2][i2]), + (simd_q15_t*)(output+i),(simd_q15_t*)(output+1024+i),(simd_q15_t*)(output+2048+i), + (simd_q15_t*)(twa1536+i),(simd_q15_t*)(twb1536+i)); } @@ -2759,9 +3194,9 @@ void dft1536(int16_t *input, int16_t *output) // write_output("out1.m","o1",tmpo[1],2048,1,1); // write_output("out2.m","o2",tmpo[2],2048,1,1); for (i=0,i2=0; i<1024; i+=8,i2+=4) { - bfly3((__m128i*)(&tmpo[0][i2]),(__m128i*)(&tmpo[1][i2]),(__m128i*)(&tmpo[2][i2]), - (__m128i*)(output+i),(__m128i*)(output+1024+i),(__m128i*)(output+2048+i), - (__m128i*)(twa1536+i),(__m128i*)(twb1536+i)); + bfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),(simd_q15_t*)(&tmpo[2][i2]), + (simd_q15_t*)(output+i),(simd_q15_t*)(output+1024+i),(simd_q15_t*)(output+2048+i), + (simd_q15_t*)(twa1536+i),(simd_q15_t*)(twb1536+i)); } _mm_empty(); @@ -2811,9 +3246,9 @@ void idft6144(int16_t *input, int16_t *output) // write_output("out2.m","o2",tmpo[2],2048,1,1); for (i=0,i2=0; i<4096; i+=8,i2+=4) { - 
ibfly3((__m128i*)(&tmpo[0][i2]),(__m128i*)(&tmpo[1][i2]),((__m128i*)&tmpo[2][i2]), - (__m128i*)(output+i),(__m128i*)(output+4096+i),(__m128i*)(output+8192+i), - (__m128i*)(twa6144+i),(__m128i*)(twb6144+i)); + ibfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),((simd_q15_t*)&tmpo[2][i2]), + (simd_q15_t*)(output+i),(simd_q15_t*)(output+4096+i),(simd_q15_t*)(output+8192+i), + (simd_q15_t*)(twa6144+i),(simd_q15_t*)(twb6144+i)); } // write_output("out.m","out",output,6144,1,1); @@ -2850,9 +3285,9 @@ void dft6144(int16_t *input, int16_t *output) // write_output("out1.m","o1",tmpo[1],2048,1,1); // write_output("out2.m","o2",tmpo[2],2048,1,1); for (i=0,i2=0; i<4096; i+=8,i2+=4) { - bfly3((__m128i*)(&tmpo[0][i2]),(__m128i*)(&tmpo[1][i2]),(__m128i*)(&tmpo[2][i2]), - (__m128i*)(output+i),(__m128i*)(output+4096+i),(__m128i*)(output+8192+i), - (__m128i*)(twa6144+i),(__m128i*)(twb6144+i)); + bfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),(simd_q15_t*)(&tmpo[2][i2]), + (simd_q15_t*)(output+i),(simd_q15_t*)(output+4096+i),(simd_q15_t*)(output+8192+i), + (simd_q15_t*)(twa6144+i),(simd_q15_t*)(twb6144+i)); } _mm_empty(); @@ -2889,9 +3324,9 @@ void dft12288(int16_t *input, int16_t *output) // write_output("out1.m","o1",tmpo[1],4096,1,1); // write_output("out2.m","o2",tmpo[2],4096,1,1); for (i=0,i2=0; i<8192; i+=8,i2+=4) { - bfly3((__m128i*)(&tmpo[0][i2]),(__m128i*)(&tmpo[1][i2]),(__m128i*)(&tmpo[2][i2]), - (__m128i*)(output+i),(__m128i*)(output+8192+i),(__m128i*)(output+16384+i), - (__m128i*)(twa12288+i),(__m128i*)(twb12288+i)); + bfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),(simd_q15_t*)(&tmpo[2][i2]), + (simd_q15_t*)(output+i),(simd_q15_t*)(output+8192+i),(simd_q15_t*)(output+16384+i), + (simd_q15_t*)(twa12288+i),(simd_q15_t*)(twb12288+i)); } _mm_empty(); @@ -2922,9 +3357,9 @@ void idft12288(int16_t *input, int16_t *output) write_output("out2.m","o2",tmpo[2],4096,1,1); */ for (i=0,i2=0; i<8192; i+=8,i2+=4) { - ibfly3((__m128i*)(&tmpo[0][i2]),(__m128i*)(&tmpo[1][i2]),((__m128i*)&tmpo[2][i2]), - (__m128i*)(output+i),(__m128i*)(output+8192+i),(__m128i*)(output+16384+i), - (__m128i*)(twa12288+i),(__m128i*)(twb12288+i)); + ibfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),((simd_q15_t*)&tmpo[2][i2]), + (simd_q15_t*)(output+i),(simd_q15_t*)(output+8192+i),(simd_q15_t*)(output+16384+i), + (simd_q15_t*)(twa12288+i),(simd_q15_t*)(twb12288+i)); } _mm_empty(); @@ -2972,9 +3407,9 @@ void dft24576(int16_t *input, int16_t *output) // write_output("out1.m","o1",tmpo[1],8192,1,1); // write_output("out2.m","o2",tmpo[2],8192,1,1); for (i=0,i2=0; i<16384; i+=8,i2+=4) { - bfly3((__m128i*)(&tmpo[0][i2]),(__m128i*)(&tmpo[1][i2]),(__m128i*)(&tmpo[2][i2]), - (__m128i*)(output+i),(__m128i*)(output+16384+i),(__m128i*)(output+32768+i), - (__m128i*)(twa24576+i),(__m128i*)(twb24576+i)); + bfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),(simd_q15_t*)(&tmpo[2][i2]), + (simd_q15_t*)(output+i),(simd_q15_t*)(output+16384+i),(simd_q15_t*)(output+32768+i), + (simd_q15_t*)(twa24576+i),(simd_q15_t*)(twb24576+i)); } _mm_empty(); @@ -3014,9 +3449,9 @@ void idft24576(int16_t *input, int16_t *output) */ for (i=0,i2=0; i<16384; i+=8,i2+=4) { - ibfly3((__m128i*)(&tmpo[0][i2]),(__m128i*)(&tmpo[1][i2]),((__m128i*)&tmpo[2][i2]), - (__m128i*)(output+i),(__m128i*)(output+16384+i),(__m128i*)(output+32768+i), - (__m128i*)(twa24576+i),(__m128i*)(twb24576+i)); + ibfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),((simd_q15_t*)&tmpo[2][i2]), + 
(simd_q15_t*)(output+i),(simd_q15_t*)(output+16384+i),(simd_q15_t*)(output+32768+i), + (simd_q15_t*)(twa24576+i),(simd_q15_t*)(twb24576+i)); } _mm_empty(); @@ -3034,70 +3469,70 @@ static int16_t W3_12s[8]__attribute__((aligned(16))) = {0,-32767,0,-32767,0,-327 static int16_t W4_12s[8]__attribute__((aligned(16))) = {-16383,-28377,-16383,-28377,-16383,-28377,-16383,-28377}; static int16_t W6_12s[8]__attribute__((aligned(16))) = {-32767,0,-32767,0,-32767,0,-32767,0}; -__m128i *W1_12=(__m128i *)W1_12s; -__m128i *W2_12=(__m128i *)W2_12s; -__m128i *W3_12=(__m128i *)W3_12s; -__m128i *W4_12=(__m128i *)W4_12s; -__m128i *W6_12=(__m128i *)W6_12s; - - -static __m128i norm128; - -static inline void dft12f(__m128i *x0, - __m128i *x1, - __m128i *x2, - __m128i *x3, - __m128i *x4, - __m128i *x5, - __m128i *x6, - __m128i *x7, - __m128i *x8, - __m128i *x9, - __m128i *x10, - __m128i *x11, - __m128i *y0, - __m128i *y1, - __m128i *y2, - __m128i *y3, - __m128i *y4, - __m128i *y5, - __m128i *y6, - __m128i *y7, - __m128i *y8, - __m128i *y9, - __m128i *y10, - __m128i *y11) __attribute__((always_inline)); - -static inline void dft12f(__m128i *x0, - __m128i *x1, - __m128i *x2, - __m128i *x3, - __m128i *x4, - __m128i *x5, - __m128i *x6, - __m128i *x7, - __m128i *x8, - __m128i *x9, - __m128i *x10, - __m128i *x11, - __m128i *y0, - __m128i *y1, - __m128i *y2, - __m128i *y3, - __m128i *y4, - __m128i *y5, - __m128i *y6, - __m128i *y7, - __m128i *y8, - __m128i *y9, - __m128i *y10, - __m128i *y11) +simd_q15_t *W1_12=(simd_q15_t *)W1_12s; +simd_q15_t *W2_12=(simd_q15_t *)W2_12s; +simd_q15_t *W3_12=(simd_q15_t *)W3_12s; +simd_q15_t *W4_12=(simd_q15_t *)W4_12s; +simd_q15_t *W6_12=(simd_q15_t *)W6_12s; + + +static simd_q15_t norm128; + +static inline void dft12f(simd_q15_t *x0, + simd_q15_t *x1, + simd_q15_t *x2, + simd_q15_t *x3, + simd_q15_t *x4, + simd_q15_t *x5, + simd_q15_t *x6, + simd_q15_t *x7, + simd_q15_t *x8, + simd_q15_t *x9, + simd_q15_t *x10, + simd_q15_t *x11, + simd_q15_t *y0, + simd_q15_t *y1, + simd_q15_t *y2, + simd_q15_t *y3, + simd_q15_t *y4, + simd_q15_t *y5, + simd_q15_t *y6, + simd_q15_t *y7, + simd_q15_t *y8, + simd_q15_t *y9, + simd_q15_t *y10, + simd_q15_t *y11) __attribute__((always_inline)); + +static inline void dft12f(simd_q15_t *x0, + simd_q15_t *x1, + simd_q15_t *x2, + simd_q15_t *x3, + simd_q15_t *x4, + simd_q15_t *x5, + simd_q15_t *x6, + simd_q15_t *x7, + simd_q15_t *x8, + simd_q15_t *x9, + simd_q15_t *x10, + simd_q15_t *x11, + simd_q15_t *y0, + simd_q15_t *y1, + simd_q15_t *y2, + simd_q15_t *y3, + simd_q15_t *y4, + simd_q15_t *y5, + simd_q15_t *y6, + simd_q15_t *y7, + simd_q15_t *y8, + simd_q15_t *y9, + simd_q15_t *y10, + simd_q15_t *y11) { - __m128i tmp_dft12[12]; + simd_q15_t tmp_dft12[12]; - __m128i *tmp_dft12_ptr = &tmp_dft12[0]; + simd_q15_t *tmp_dft12_ptr = &tmp_dft12[0]; // msg("dft12\n"); @@ -3171,22 +3606,6 @@ static inline void dft12f(__m128i *x0, y11, W3_12, W6_12); - /* - norm128 = _mm_set1_epi16(dft_norm_table[0]); - - *y0 = _mm_slli_epi16(_mm_mulhi_epi16(*y0,norm128),1); - *y1 = _mm_slli_epi16(_mm_mulhi_epi16(*y1,norm128),1); - *y2 = _mm_slli_epi16(_mm_mulhi_epi16(*y2,norm128),1); - *y3 = _mm_slli_epi16(_mm_mulhi_epi16(*y3,norm128),1); - *y4 = _mm_slli_epi16(_mm_mulhi_epi16(*y4,norm128),1); - *y5 = _mm_slli_epi16(_mm_mulhi_epi16(*y5,norm128),1); - *y6 = _mm_slli_epi16(_mm_mulhi_epi16(*y6,norm128),1); - *y7 = _mm_slli_epi16(_mm_mulhi_epi16(*y7,norm128),1); - *y8 = _mm_slli_epi16(_mm_mulhi_epi16(*y8,norm128),1); - *y9 = _mm_slli_epi16(_mm_mulhi_epi16(*y9,norm128),1); - *y10 = 
_mm_slli_epi16(_mm_mulhi_epi16(*y10,norm128),1); - *y11 = _mm_slli_epi16(_mm_mulhi_epi16(*y11,norm128),1); - */ } @@ -3196,7 +3615,7 @@ static inline void dft12f(__m128i *x0, void dft12(int16_t *x,int16_t *y) { - __m128i *x128 = (__m128i *)x,*y128 = (__m128i *)y; + simd_q15_t *x128 = (simd_q15_t *)x,*y128 = (simd_q15_t *)y; dft12f(&x128[0], &x128[1], &x128[2], @@ -3240,18 +3659,18 @@ static int16_t tw24[88]__attribute__((aligned(16))) = {31650,-8480,31650,-8480,3 -31650,-8480,-31650,-8480,-31650,-8480,-31650,-8480 }; -//static __m128i ytmp128array[300]; -//static __m128i ytmp128array2[300]; -//static __m128i ytmp128array3[300]; -//static __m128i x2128array[300]; +//static simd_q15_t ytmp128array[300]; +//static simd_q15_t ytmp128array2[300]; +//static simd_q15_t ytmp128array3[300]; +//static simd_q15_t x2128array[300]; void dft24(int16_t *x,int16_t *y,unsigned char scale_flag) { - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *tw128=(__m128i *)&tw24[0]; - __m128i ytmp128[24];//=&ytmp128array[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *tw128=(simd_q15_t *)&tw24[0]; + simd_q15_t ytmp128[24];//=&ytmp128array[0]; int i,j,k; // msg("dft24\n"); @@ -3326,10 +3745,10 @@ void dft24(int16_t *x,int16_t *y,unsigned char scale_flag) } if (scale_flag==1) { - norm128 = _mm_set1_epi16(dft_norm_table[1]); + norm128 = set1_int16(dft_norm_table[1]); for (i=0; i<24; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -3367,11 +3786,11 @@ static int16_t twb36[88]__attribute__((aligned(16))) = {30790,-11206,30790,-1120 void dft36(int16_t *x,int16_t *y,unsigned char scale_flag) { - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa36[0]; - __m128i *twb128=(__m128i *)&twb36[0]; - __m128i ytmp128[36];//&ytmp128array[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa36[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb36[0]; + simd_q15_t ytmp128[36];//&ytmp128array[0]; int i,j,k; @@ -3472,10 +3891,10 @@ void dft36(int16_t *x,int16_t *y,unsigned char scale_flag) } if (scale_flag==1) { - norm128 = _mm_set1_epi16(dft_norm_table[2]); + norm128 = set1_int16(dft_norm_table[2]); for (i=0; i<36; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -3526,12 +3945,12 @@ static int16_t twc48[88]__attribute__((aligned(16))) = {30272,-12539,30272,-1253 void dft48(int16_t *x, int16_t *y,unsigned char scale_flag) { - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa48[0]; - __m128i *twb128=(__m128i *)&twb48[0]; - __m128i *twc128=(__m128i *)&twc48[0]; - __m128i ytmp128[48];//=&ytmp128array[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa48[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb48[0]; + simd_q15_t *twc128=(simd_q15_t *)&twc48[0]; + simd_q15_t ytmp128[48];//=&ytmp128array[0]; int i,j,k; @@ -3668,10 +4087,10 @@ void dft48(int16_t *x, int16_t *y,unsigned char scale_flag) } if (scale_flag == 1) { - norm128 = _mm_set1_epi16(dft_norm_table[3]); + norm128 = set1_int16(dft_norm_table[3]); for (i=0; i<48; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -3732,13 +4151,13 @@ static int16_t twd60[88]__attribute__((aligned(16))) = {29934,-13327,29934,-1332 void 
dft60(int16_t *x,int16_t *y,unsigned char scale) { - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa60[0]; - __m128i *twb128=(__m128i *)&twb60[0]; - __m128i *twc128=(__m128i *)&twc60[0]; - __m128i *twd128=(__m128i *)&twd60[0]; - __m128i ytmp128[60];//=&ytmp128array[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa60[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb60[0]; + simd_q15_t *twc128=(simd_q15_t *)&twc60[0]; + simd_q15_t *twd128=(simd_q15_t *)&twd60[0]; + simd_q15_t ytmp128[60];//=&ytmp128array[0]; int i,j,k; dft12f(x128, @@ -3896,10 +4315,10 @@ void dft60(int16_t *x,int16_t *y,unsigned char scale) } if (scale == 1) { - norm128 = _mm_set1_epi16(dft_norm_table[4]); + norm128 = set1_int16(dft_norm_table[4]); for (i=0; i<60; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -3949,12 +4368,12 @@ void dft72(int16_t *x,int16_t *y,unsigned char scale_flag) { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *tw128=(__m128i *)&tw72[0]; - __m128i x2128[72];// = (__m128i *)&x2128array[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *tw128=(simd_q15_t *)&tw72[0]; + simd_q15_t x2128[72];// = (simd_q15_t *)&x2128array[0]; - __m128i ytmp128[72];//=&ytmp128array2[0]; + simd_q15_t ytmp128[72];//=&ytmp128array2[0]; for (i=0,j=0; i<36; i++,j+=2) { x2128[i] = x128[j]; // even inputs @@ -3975,10 +4394,10 @@ void dft72(int16_t *x,int16_t *y,unsigned char scale_flag) } if (scale_flag==1) { - norm128 = _mm_set1_epi16(dft_norm_table[5]); + norm128 = set1_int16(dft_norm_table[5]); for (i=0; i<72; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -4041,11 +4460,11 @@ void dft96(int16_t *x,int16_t *y,unsigned char scale_flag) int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *tw128=(__m128i *)&tw96[0]; - __m128i x2128[96];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[96];//=&ytmp128array2[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *tw128=(simd_q15_t *)&tw96[0]; + simd_q15_t x2128[96];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[96];//=&ytmp128array2[0]; for (i=0,j=0; i<48; i++,j+=2) { @@ -4068,10 +4487,10 @@ void dft96(int16_t *x,int16_t *y,unsigned char scale_flag) } if (scale_flag==1) { - norm128 = _mm_set1_epi16(dft_norm_table[6]); + norm128 = set1_int16(dft_norm_table[6]); for (i=0; i<96; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -4159,12 +4578,12 @@ void dft108(int16_t *x,int16_t *y,unsigned char scale_flag) int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa108[0]; - __m128i *twb128=(__m128i *)&twb108[0]; - __m128i x2128[108];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[108];//=&ytmp128array2[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa108[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb108[0]; + simd_q15_t x2128[108];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[108];//=&ytmp128array2[0]; for (i=0,j=0; i<36; i++,j+=3) { @@ -4192,10 +4611,10 @@ void dft108(int16_t *x,int16_t *y,unsigned char scale_flag) } if (scale_flag==1) { - norm128 = _mm_set1_epi16(dft_norm_table[7]); + norm128 = 
set1_int16(dft_norm_table[7]); for (i=0; i<108; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -4270,11 +4689,11 @@ void dft120(int16_t *x,int16_t *y, unsigned char scale_flag) int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *tw128=(__m128i *)&tw120[0]; - __m128i x2128[120];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[120];//=&ytmp128array2[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *tw128=(simd_q15_t *)&tw120[0]; + simd_q15_t x2128[120];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[120];//=&ytmp128array2[0]; for (i=0,j=0; i<60; i++,j+=2) { @@ -4297,10 +4716,10 @@ void dft120(int16_t *x,int16_t *y, unsigned char scale_flag) } if (scale_flag==1) { - norm128 = _mm_set1_epi16(dft_norm_table[8]); + norm128 = set1_int16(dft_norm_table[8]); for (i=0; i<120; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -4411,12 +4830,12 @@ void dft144(int16_t *x,int16_t *y,unsigned char scale_flag) { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa144[0]; - __m128i *twb128=(__m128i *)&twb144[0]; - __m128i x2128[144];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[144];//=&ytmp128array2[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa144[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb144[0]; + simd_q15_t x2128[144];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[144];//=&ytmp128array2[0]; @@ -4444,10 +4863,10 @@ void dft144(int16_t *x,int16_t *y,unsigned char scale_flag) } if (scale_flag==1) { - norm128 = _mm_set1_epi16(dft_norm_table[9]); + norm128 = set1_int16(dft_norm_table[9]); for (i=0; i<144; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -4582,12 +5001,12 @@ void dft180(int16_t *x,int16_t *y,unsigned char scale_flag) { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa180[0]; - __m128i *twb128=(__m128i *)&twb180[0]; - __m128i x2128[180];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[180];//=&ytmp128array2[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa180[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb180[0]; + simd_q15_t x2128[180];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[180];//=&ytmp128array2[0]; @@ -4615,10 +5034,10 @@ void dft180(int16_t *x,int16_t *y,unsigned char scale_flag) } if (scale_flag==1) { - norm128 = _mm_set1_epi16(dft_norm_table[10]); + norm128 = set1_int16(dft_norm_table[10]); for (i=0; i<180; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -4778,13 +5197,13 @@ void dft192(int16_t *x,int16_t *y,unsigned char scale_flag) { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa192[0]; - __m128i *twb128=(__m128i *)&twb192[0]; - __m128i *twc128=(__m128i *)&twc192[0]; - __m128i x2128[192];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[192];//=&ytmp128array2[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa192[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb192[0]; + simd_q15_t *twc128=(simd_q15_t *)&twc192[0]; + simd_q15_t x2128[192];// = (simd_q15_t 
*)&x2128array[0]; + simd_q15_t ytmp128[192];//=&ytmp128array2[0]; @@ -4817,10 +5236,10 @@ void dft192(int16_t *x,int16_t *y,unsigned char scale_flag) } if (scale_flag==1) { - norm128 = _mm_set1_epi16(dft_norm_table[11]); + norm128 = set1_int16(dft_norm_table[11]); for (i=0; i<192; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -4979,12 +5398,12 @@ void dft216(int16_t *x,int16_t *y,unsigned char scale_flag) { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa216[0]; - __m128i *twb128=(__m128i *)&twb216[0]; - __m128i x2128[216];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[216];//=&ytmp128array3[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa216[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb216[0]; + simd_q15_t x2128[216];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[216];//=&ytmp128array3[0]; @@ -5012,10 +5431,10 @@ void dft216(int16_t *x,int16_t *y,unsigned char scale_flag) } if (scale_flag==1) { - norm128 = _mm_set1_epi16(dft_norm_table[12]); + norm128 = set1_int16(dft_norm_table[12]); for (i=0; i<216; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -5211,13 +5630,13 @@ void dft240(int16_t *x,int16_t *y,unsigned char scale_flag) { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa240[0]; - __m128i *twb128=(__m128i *)&twb240[0]; - __m128i *twc128=(__m128i *)&twc240[0]; - __m128i x2128[240];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[240];//=&ytmp128array2[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa240[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb240[0]; + simd_q15_t *twc128=(simd_q15_t *)&twc240[0]; + simd_q15_t x2128[240];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[240];//=&ytmp128array2[0]; @@ -5250,10 +5669,10 @@ void dft240(int16_t *x,int16_t *y,unsigned char scale_flag) } if (scale_flag==1) { - norm128 = _mm_set1_epi16(dft_norm_table[13]); + norm128 = set1_int16(dft_norm_table[13]); for (i=0; i<240; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -5472,12 +5891,12 @@ void dft288(int16_t *x,int16_t *y,unsigned char scale_flag) { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa288[0]; - __m128i *twb128=(__m128i *)&twb288[0]; - __m128i x2128[288];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[288];//=&ytmp128array3[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa288[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb288[0]; + simd_q15_t x2128[288];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[288];//=&ytmp128array3[0]; @@ -5505,10 +5924,10 @@ void dft288(int16_t *x,int16_t *y,unsigned char scale_flag) } if (scale_flag==1) { - norm128 = _mm_set1_epi16(dft_norm_table[14]); + norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<288; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -5765,14 +6184,14 @@ void dft300(int16_t *x,int16_t *y,unsigned char scale_flag) { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa300[0]; - __m128i *twb128=(__m128i *)&twb300[0]; - 
__m128i *twc128=(__m128i *)&twc300[0]; - __m128i *twd128=(__m128i *)&twd300[0]; - __m128i x2128[300];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[300];//=&ytmp128array2[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa300[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb300[0]; + simd_q15_t *twc128=(simd_q15_t *)&twc300[0]; + simd_q15_t *twd128=(simd_q15_t *)&twd300[0]; + simd_q15_t x2128[300];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[300];//=&ytmp128array2[0]; @@ -5810,10 +6229,10 @@ void dft300(int16_t *x,int16_t *y,unsigned char scale_flag) } if (scale_flag==1) { - norm128 = _mm_set1_epi16(dft_norm_table[15]); + norm128 = set1_int16(dft_norm_table[15]); for (i=0; i<300; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -6066,12 +6485,12 @@ static int16_t twb324[107*2*4] = {32742,-1271,32742,-1271,32742,-1271,32742,-127 void dft324(int16_t *x,int16_t *y,unsigned char scale_flag) // 108 x 3 { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa324[0]; - __m128i *twb128=(__m128i *)&twb324[0]; - __m128i x2128[324];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[324];//=&ytmp128array3[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa324[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb324[0]; + simd_q15_t x2128[324];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[324];//=&ytmp128array3[0]; @@ -6099,10 +6518,10 @@ void dft324(int16_t *x,int16_t *y,unsigned char scale_flag) // 108 x 3 } if (scale_flag==1) { - norm128 = _mm_set1_epi16(dft_norm_table[14]); + norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<324; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -6380,12 +6799,12 @@ static int16_t twb360[119*2*4] = {32747,-1144,32747,-1144,32747,-1144,32747,-114 void dft360(int16_t *x,int16_t *y,unsigned char scale_flag) // 120 x 3 { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa360[0]; - __m128i *twb128=(__m128i *)&twb360[0]; - __m128i x2128[360];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[360];//=&ytmp128array3[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa360[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb360[0]; + simd_q15_t x2128[360];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[360];//=&ytmp128array3[0]; @@ -6413,10 +6832,10 @@ void dft360(int16_t *x,int16_t *y,unsigned char scale_flag) // 120 x 3 } if (scale_flag==1) { - norm128 = _mm_set1_epi16(dft_norm_table[14]); + norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<360; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -6752,13 +7171,13 @@ static int16_t twc384[95*2*4] = {32727,-1608,32727,-1608,32727,-1608,32727,-1608 void dft384(int16_t *x,int16_t *y,unsigned char scale_flag) // 96 x 4 { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa384[0]; - __m128i *twb128=(__m128i *)&twb384[0]; - __m128i *twc128=(__m128i *)&twc384[0]; - __m128i x2128[384];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[384];//=&ytmp128array2[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa384[0]; + 
simd_q15_t *twb128=(simd_q15_t *)&twb384[0]; + simd_q15_t *twc128=(simd_q15_t *)&twc384[0]; + simd_q15_t x2128[384];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[384];//=&ytmp128array2[0]; @@ -6791,10 +7210,10 @@ void dft384(int16_t *x,int16_t *y,unsigned char scale_flag) // 96 x 4 } if (scale_flag==1) { - norm128 = _mm_set1_epi16(16384);//dft_norm_table[13]); + norm128 = set1_int16(16384);//dft_norm_table[13]); for (i=0; i<384; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -7167,13 +7586,13 @@ static int16_t twc432[107*2*4] = {32735,-1430,32735,-1430,32735,-1430,32735,-143 void dft432(int16_t *x,int16_t *y,unsigned char scale_flag) // 108 x 4 { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa432[0]; - __m128i *twb128=(__m128i *)&twb432[0]; - __m128i *twc128=(__m128i *)&twc432[0]; - __m128i x2128[432];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[432];//=&ytmp128array2[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa432[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb432[0]; + simd_q15_t *twc128=(simd_q15_t *)&twc432[0]; + simd_q15_t x2128[432];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[432];//=&ytmp128array2[0]; for (i=0,j=0; i<108; i++,j+=4) { @@ -7205,10 +7624,10 @@ void dft432(int16_t *x,int16_t *y,unsigned char scale_flag) // 108 x 4 } if (scale_flag==1) { - norm128 = _mm_set1_epi16(16384);//dft_norm_table[13]); + norm128 = set1_int16(16384);//dft_norm_table[13]); for (i=0; i<432; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -7616,13 +8035,13 @@ static int16_t twc480[119*2*4] = {32741,-1287,32741,-1287,32741,-1287,32741,-128 void dft480(int16_t *x,int16_t *y,unsigned char scale_flag) // 120 x 4 { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa480[0]; - __m128i *twb128=(__m128i *)&twb480[0]; - __m128i *twc128=(__m128i *)&twc480[0]; - __m128i x2128[480];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[480];//=&ytmp128array2[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa480[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb480[0]; + simd_q15_t *twc128=(simd_q15_t *)&twc480[0]; + simd_q15_t x2128[480];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[480];//=&ytmp128array2[0]; @@ -7655,10 +8074,10 @@ void dft480(int16_t *x,int16_t *y,unsigned char scale_flag) // 120 x 4 } if (scale_flag==1) { - norm128 = _mm_set1_epi16(16384);//dft_norm_table[13]); + norm128 = set1_int16(16384);//dft_norm_table[13]); for (i=0; i<480; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -8057,12 +8476,12 @@ static int16_t twb540[179*2*4] = {32758,-763,32758,-763,32758,-763,32758,-763, void dft540(int16_t *x,int16_t *y,unsigned char scale_flag) // 180 x 3 { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa540[0]; - __m128i *twb128=(__m128i *)&twb540[0]; - __m128i x2128[540];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[540];//=&ytmp128array3[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa540[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb540[0]; + simd_q15_t x2128[540];// = (simd_q15_t *)&x2128array[0]; + 
simd_q15_t ytmp128[540];//=&ytmp128array3[0]; @@ -8090,10 +8509,10 @@ void dft540(int16_t *x,int16_t *y,unsigned char scale_flag) // 180 x 3 } if (scale_flag==1) { - norm128 = _mm_set1_epi16(dft_norm_table[14]); + norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<540; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -8516,12 +8935,12 @@ static int16_t twb576[191*2*4] = {32759,-715,32759,-715,32759,-715,32759,-715, void dft576(int16_t *x,int16_t *y,unsigned char scale_flag) // 192 x 3 { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa576[0]; - __m128i *twb128=(__m128i *)&twb576[0]; - __m128i x2128[576];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[576];//=&ytmp128array3[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa576[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb576[0]; + simd_q15_t x2128[576];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[576];//=&ytmp128array3[0]; @@ -8550,10 +8969,10 @@ void dft576(int16_t *x,int16_t *y,unsigned char scale_flag) // 192 x 3 } if (scale_flag==1) { - norm128 = _mm_set1_epi16(dft_norm_table[14]); + norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<576; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -8879,11 +9298,11 @@ static int16_t twa600[299*2*4] = {32765,-344,32765,-344,32765,-344,32765,-344, void dft600(int16_t *x,int16_t *y,unsigned char scale_flag) // 300 x 2 { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *tw128=(__m128i *)&twa600[0]; - __m128i x2128[600];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[600];//=&ytmp128array2[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *tw128=(simd_q15_t *)&twa600[0]; + simd_q15_t x2128[600];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[600];//=&ytmp128array2[0]; for (i=0,j=0; i<300; i++,j+=2) { @@ -8906,10 +9325,10 @@ void dft600(int16_t *x,int16_t *y,unsigned char scale_flag) // 300 x 2 } if (scale_flag==1) { - norm128 = _mm_set1_epi16(ONE_OVER_SQRT2_Q15); + norm128 = set1_int16(ONE_OVER_SQRT2_Q15); for (i=0; i<600; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -9380,12 +9799,12 @@ static int16_t twb648[215*2*4] = {32760,-636,32760,-636,32760,-636,32760,-636, void dft648(int16_t *x,int16_t *y,unsigned char scale_flag) // 216 x 3 { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa648[0]; - __m128i *twb128=(__m128i *)&twb648[0]; - __m128i x2128[648];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[648];//=&ytmp128array3[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa648[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb648[0]; + simd_q15_t x2128[648];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[648];//=&ytmp128array3[0]; @@ -9413,10 +9832,10 @@ void dft648(int16_t *x,int16_t *y,unsigned char scale_flag) // 216 x 3 } if (scale_flag==1) { - norm128 = _mm_set1_epi16(dft_norm_table[14]); + norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<648; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -10007,13 +10426,13 @@ static int16_t twc720[179*2*4] = 
{32755,-858,32755,-858,32755,-858,32755,-858, void dft720(int16_t *x,int16_t *y,unsigned char scale_flag) // 180 x 4 { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa720[0]; - __m128i *twb128=(__m128i *)&twb720[0]; - __m128i *twc128=(__m128i *)&twc720[0]; - __m128i x2128[720];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[720];//=&ytmp128array2[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa720[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb720[0]; + simd_q15_t *twc128=(simd_q15_t *)&twc720[0]; + simd_q15_t x2128[720];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[720];//=&ytmp128array2[0]; @@ -10046,10 +10465,10 @@ void dft720(int16_t *x,int16_t *y,unsigned char scale_flag) // 180 x 4 } if (scale_flag==1) { - norm128 = _mm_set1_epi16(16384);//dft_norm_table[13]); + norm128 = set1_int16(16384);//dft_norm_table[13]); for (i=0; i<720; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -10662,12 +11081,12 @@ static int16_t twb864[287*2*4] = {32763,-477,32763,-477,32763,-477,32763,-477, void dft864(int16_t *x,int16_t *y,unsigned char scale_flag) // 288 x 3 { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa864[0]; - __m128i *twb128=(__m128i *)&twb864[0]; - __m128i x2128[864];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[864];//=&ytmp128array3[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa864[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb864[0]; + simd_q15_t x2128[864];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[864];//=&ytmp128array3[0]; @@ -10695,10 +11114,10 @@ void dft864(int16_t *x,int16_t *y,unsigned char scale_flag) // 288 x 3 } if (scale_flag==1) { - norm128 = _mm_set1_epi16(dft_norm_table[14]); + norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<864; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -11335,12 +11754,12 @@ static int16_t twb900[299*2*4] = {32763,-458,32763,-458,32763,-458,32763,-458, void dft900(int16_t *x,int16_t *y,unsigned char scale_flag) // 300 x 3 { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa900[0]; - __m128i *twb128=(__m128i *)&twb900[0]; - __m128i x2128[900];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[900];//=&ytmp128array3[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa900[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb900[0]; + simd_q15_t x2128[900];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[900];//=&ytmp128array3[0]; @@ -11368,10 +11787,10 @@ void dft900(int16_t *x,int16_t *y,unsigned char scale_flag) // 300 x 3 } if (scale_flag==1) { - norm128 = _mm_set1_epi16(dft_norm_table[14]); + norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<900; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -12141,13 +12560,13 @@ static int16_t twc960[239*2*4] = {32760,-644,32760,-644,32760,-644,32760,-644, void dft960(int16_t *x,int16_t *y,unsigned char scale_flag) // 240 x 4 { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa960[0]; - __m128i *twb128=(__m128i *)&twb960[0]; - __m128i *twc128=(__m128i *)&twc960[0]; 
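
/*
 * Reviewer's sketch of the SIMD abstraction applied throughout the dftNNN
 * hunks above and below: __m128i becomes simd_q15_t, _mm_set1_epi16()
 * becomes set1_int16(), and the scaling idiom
 *     _mm_slli_epi16(_mm_mulhi_epi16(y, norm), 1)
 * collapses into mulhi_int16(y, norm), i.e. a Q15 fixed-point multiply
 * (y*norm)>>15. The shipped definitions presumably live in PHY/sse_intrin.h
 * (included unconditionally elsewhere in this patch); the bodies below are
 * an assumed minimal sketch, not the actual header.
 */
#include <stdint.h>
#if defined(__x86_64__) || defined(__i386__)
#include <emmintrin.h>
typedef __m128i simd_q15_t;                      /* 8 packed Q15 samples */
static inline simd_q15_t set1_int16(int16_t a)   { return _mm_set1_epi16(a); }
static inline simd_q15_t mulhi_int16(simd_q15_t a, simd_q15_t b)
{ return _mm_slli_epi16(_mm_mulhi_epi16(a, b), 1); }  /* (a*b)>>15, truncating */
#elif defined(__arm__)
#include <arm_neon.h>
typedef int16x8_t simd_q15_t;
static inline simd_q15_t set1_int16(int16_t a)   { return vdupq_n_s16(a); }
static inline simd_q15_t mulhi_int16(simd_q15_t a, simd_q15_t b)
{ return vqdmulhq_s16(a, b); }     /* saturating (2*a*b)>>16 == (a*b)>>15 */
#endif
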
- __m128i x2128[960];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[960];//=&ytmp128array2[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa960[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb960[0]; + simd_q15_t *twc128=(simd_q15_t *)&twc960[0]; + simd_q15_t x2128[960];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[960];//=&ytmp128array2[0]; @@ -12180,10 +12599,10 @@ void dft960(int16_t *x,int16_t *y,unsigned char scale_flag) // 240 x 4 } if (scale_flag==1) { - norm128 = _mm_set1_epi16(16384);//dft_norm_table[13]); + norm128 = set1_int16(16384);//dft_norm_table[13]); for (i=0; i<960; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -12868,12 +13287,12 @@ static int16_t twb972[323*2*4] = {32764,-424,32764,-424,32764,-424,32764,-424, void dft972(int16_t *x,int16_t *y,unsigned char scale_flag) // 324 x 3 { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa972[0]; - __m128i *twb128=(__m128i *)&twb972[0]; - __m128i x2128[972];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[972];//=&ytmp128array3[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa972[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb972[0]; + simd_q15_t x2128[972];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[972];//=&ytmp128array3[0]; @@ -12901,10 +13320,10 @@ void dft972(int16_t *x,int16_t *y,unsigned char scale_flag) // 324 x 3 } if (scale_flag==1) { - norm128 = _mm_set1_epi16(dft_norm_table[14]); + norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<972; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -13661,12 +14080,12 @@ static int16_t twb1080[359*2*4] = {32764,-382,32764,-382,32764,-382,32764,-382, void dft1080(int16_t *x,int16_t *y,unsigned char scale_flag) // 360 x 3 { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa1080[0]; - __m128i *twb128=(__m128i *)&twb1080[0]; - __m128i x2128[1080];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[1080];//=&ytmp128array3[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa1080[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb1080[0]; + simd_q15_t x2128[1080];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[1080];//=&ytmp128array3[0]; @@ -13694,10 +14113,10 @@ void dft1080(int16_t *x,int16_t *y,unsigned char scale_flag) // 360 x 3 } if (scale_flag==1) { - norm128 = _mm_set1_epi16(dft_norm_table[14]); + norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<1080; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -14611,13 +15030,13 @@ void dft1152(int16_t *x,int16_t *y,unsigned char scale_flag) // 288 x 4 { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa1152[0]; - __m128i *twb128=(__m128i *)&twb1152[0]; - __m128i *twc128=(__m128i *)&twc1152[0]; - __m128i x2128[1152];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[1152];//=&ytmp128array2[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa1152[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb1152[0]; + simd_q15_t *twc128=(simd_q15_t *)&twc1152[0]; + simd_q15_t x2128[1152];// = (simd_q15_t 
*)&x2128array[0]; + simd_q15_t ytmp128[1152];//=&ytmp128array2[0]; @@ -14650,10 +15069,10 @@ void dft1152(int16_t *x,int16_t *y,unsigned char scale_flag) // 288 x 4 } if (scale_flag==1) { - norm128 = _mm_set1_epi16(16384);//dft_norm_table[13]); + norm128 = set1_int16(16384);//dft_norm_table[13]); for (i=0; i<1152; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -15569,13 +15988,13 @@ void dft1200(int16_t *x,int16_t *y,unsigned char scale_flag) { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa1200[0]; - __m128i *twb128=(__m128i *)&twb1200[0]; - __m128i *twc128=(__m128i *)&twc1200[0]; - __m128i x2128[1200];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[1200];//=&ytmp128array2[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa1200[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb1200[0]; + simd_q15_t *twc128=(simd_q15_t *)&twc1200[0]; + simd_q15_t x2128[1200];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[1200];//=&ytmp128array2[0]; @@ -15608,10 +16027,10 @@ void dft1200(int16_t *x,int16_t *y,unsigned char scale_flag) } if (scale_flag==1) { - norm128 = _mm_set1_epi16(16384);//dft_norm_table[13]); + norm128 = set1_int16(16384);//dft_norm_table[13]); for (i=0; i<1200; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -15632,14 +16051,14 @@ int main(int argc, char**argv) time_stats_t ts; - __m128i x[2048],y[2048],tw0,tw1,tw2,tw3; + simd_q15_t x[2048],y[2048],tw0,tw1,tw2,tw3; int i; set_taus_seed(0); - /* + /* ((int16_t *)&tw0)[0] = 32767; ((int16_t *)&tw0)[1] = 0; ((int16_t *)&tw0)[2] = 32767; @@ -15677,8 +16096,13 @@ int main(int argc, char**argv) ((int16_t *)&tw3)[7] = 0; for (i=0;i<300;i++) { +#if defined(__x86_64__) || defined(__i386__) x[i] = _mm_set1_epi32(taus()); x[i] = _mm_srai_epi16(x[i],4); +#elif defined(__arm__) + x[i] = (int16x8_t)vdupq_n_s32(taus()); + x[i] = vshrq_n_s16(x[i],4); +#endif } bfly2_tw1(x,x+1,y,y+1); @@ -15693,7 +16117,6 @@ int main(int argc, char**argv) printf("3(%d,%d) (%d,%d) => (%d,%d) (%d,%d)\n",((int16_t*)&x[0])[0],((int16_t*)&x[0])[1],((int16_t*)&x[1])[0],((int16_t*)&x[1])[1],((int16_t*)&y[0])[6],((int16_t*)&y[0])[7],((int16_t*)&y[1])[6],((int16_t*)&y[1])[7]); bfly2(x,x+1,y,y+1, &tw0); - bfly3_tw1(x,x+1,x+2,y, y+1,y+2); printf("0(%d,%d) (%d,%d) (%d %d) => (%d,%d) (%d,%d) (%d %d)\n",((int16_t*)&x[0])[0],((int16_t*)&x[0])[1],((int16_t*)&x[1])[0],((int16_t*)&x[1])[1],((int16_t*)&x[2])[0],((int16_t*)&x[2])[1],((int16_t*)&y[0])[0],((int16_t*)&y[0])[1],((int16_t*)&y[1])[0],((int16_t*)&y[1])[1],((int16_t*)&y[2])[0],((int16_t*)&y[2])[1]); printf("1(%d,%d) (%d,%d) (%d %d) => (%d,%d) (%d,%d) (%d %d)\n",((int16_t*)&x[0])[0],((int16_t*)&x[0])[1],((int16_t*)&x[1])[0],((int16_t*)&x[1])[1],((int16_t*)&x[2])[0],((int16_t*)&x[2])[1],((int16_t*)&y[0])[2],((int16_t*)&y[0])[3],((int16_t*)&y[1])[2],((int16_t*)&y[1])[3],((int16_t*)&y[2])[2],((int16_t*)&y[2])[3]); @@ -15958,11 +16381,36 @@ int main(int argc, char**argv) printf("%d,%d,%d,%d,%d,%d,%d,%d,",((int16_t*)&y[i])[0],((int16_t *)&y[i])[1],((int16_t*)&y[i])[2],((int16_t *)&y[i])[3],((int16_t*)&y[i])[4],((int16_t *)&y[i])[5],((int16_t*)&y[i])[6],((int16_t *)&y[i])[7]); printf("\n"); */ - for (i=0; i<128; i++) { - ((int16_t*)x)[i] = (int16_t)((taus()&0xffff))>>5; + memset((void*)&x[0],0,64*4); +/* + for (i=0; i<64; i+=4) { + ((int16_t*)x)[i<<1] = 1024; + 
((int16_t*)x)[1+(i<<1)] = 0; + ((int16_t*)x)[2+(i<<1)] = 0; + ((int16_t*)x)[3+(i<<1)] = 1024; + ((int16_t*)x)[4+(i<<1)] = -1024; + ((int16_t*)x)[5+(i<<1)] = 0; + ((int16_t*)x)[6+(i<<1)] = 0; + ((int16_t*)x)[7+(i<<1)] = -1024; + } +*/ + for (i=0;i<64;i++) { + ((int16_t*)x)[i] = (int16_t)((taus()&0xffff))>>5; } - memset((void*)&y[0],0,64*4); +/* + dft16((int16_t *)x,(int16_t *)y); + printf("16-point\n"); + printf("X: "); + for (i=0;i<4;i++) + printf("%d,%d,%d,%d,%d,%d,%d,%d,",((int16_t*)&x[i])[0],((int16_t *)&x[i])[1],((int16_t*)&x[i])[2],((int16_t *)&x[i])[3],((int16_t*)&x[i])[4],((int16_t*)&x[i])[5],((int16_t*)&x[i])[6],((int16_t*)&x[i])[7]); + printf("\nY:"); + + for (i=0;i<4;i++) + printf("%d,%d,%d,%d,%d,%d,%d,%d,",((int16_t*)&y[i])[0],((int16_t *)&y[i])[1],((int16_t*)&y[i])[2],((int16_t *)&y[i])[3],((int16_t*)&y[i])[4],((int16_t *)&y[i])[5],((int16_t*)&y[i])[6],((int16_t *)&y[i])[7]); + printf("\n"); + exit(-1); +*/ dft64((int16_t *)x,(int16_t *)y,1); dft64((int16_t *)x,(int16_t *)y,1); dft64((int16_t *)x,(int16_t *)y,1); @@ -15976,8 +16424,11 @@ int main(int argc, char**argv) } printf("\n\n64-point (%f cycles)\n",(double)ts.diff/(double)ts.trials); + write_output("x64.m","x64",x,64,1,1); + write_output("y64.m","y64",y,64,1,1); - /*printf("X: "); +/* + printf("X: "); for (i=0;i<16;i++) printf("%d,%d,%d,%d,%d,%d,%d,%d,",((int16_t*)&x[i])[0],((int16_t *)&x[i])[1],((int16_t*)&x[i])[2],((int16_t *)&x[i])[3],((int16_t*)&x[i])[4],((int16_t*)&x[i])[5],((int16_t*)&x[i])[6],((int16_t*)&x[i])[7]); printf("\nY:"); @@ -15985,11 +16436,17 @@ int main(int argc, char**argv) for (i=0;i<16;i++) printf("%d,%d,%d,%d,%d,%d,%d,%d,",((int16_t*)&y[i])[0],((int16_t *)&y[i])[1],((int16_t*)&y[i])[2],((int16_t *)&y[i])[3],((int16_t*)&y[i])[4],((int16_t *)&y[i])[5],((int16_t*)&y[i])[6],((int16_t *)&y[i])[7]); printf("\n"); - */ + + idft64((int16_t*)y,(int16_t*)x,1); + printf("X: "); + for (i=0;i<16;i++) + printf("%d,%d,%d,%d,%d,%d,%d,%d,",((int16_t*)&x[i])[0],((int16_t *)&x[i])[1],((int16_t*)&x[i])[2],((int16_t *)&x[i])[3],((int16_t*)&x[i])[4],((int16_t*)&x[i])[5],((int16_t*)&x[i])[6],((int16_t*)&x[i])[7]); + for (i=0; i<256; i++) { ((int16_t*)x)[i] = (int16_t)((taus()&0xffff))>>5; } - +*/ + memset((void*)&y[0],0,128*4); reset_meas(&ts); diff --git a/openair1/PHY/TOOLS/signal_energy.c b/openair1/PHY/TOOLS/signal_energy.c index 41fd6ca87b..39926a8315 100755 --- a/openair1/PHY/TOOLS/signal_energy.c +++ b/openair1/PHY/TOOLS/signal_energy.c @@ -28,19 +28,16 @@ *******************************************************************************/ #include "defs.h" -#ifndef EXPRESSMIMO_TARGET #include "PHY/sse_intrin.h" -#endif //EXPRESSMIMO_TARGET // Compute Energy of a complex signal vector, removing the DC component! 
// input : points to vector // length : length of vector in complex samples #define shift 4 -#define shift_DC 0 +//#define shift_DC 0 - -#ifndef EXPRESSMIMO_TARGET +#if defined(__x86_64__) || defined(__i386__) #ifdef LOCALIZATION int32_t subcarrier_energy(int32_t *input,uint32_t length, int32_t *subcarrier_energy, uint16_t rx_power_correction) { @@ -73,6 +70,7 @@ int32_t subcarrier_energy(int32_t *input,uint32_t length, int32_t *subcarrier_en return i; } #endif + int32_t signal_energy(int32_t *input,uint32_t length) { @@ -81,9 +79,6 @@ int32_t signal_energy(int32_t *input,uint32_t length) register __m64 mm0,mm1,mm2,mm3; __m64 *in = (__m64 *)input; -#ifdef MAIN - int16_t *printb; -#endif mm0 = _mm_setzero_si64();//pxor(mm0,mm0); mm3 = _mm_setzero_si64();//pxor(mm3,mm3); @@ -95,35 +90,14 @@ int32_t signal_energy(int32_t *input,uint32_t length) mm1 = _m_pmaddwd(mm1,mm1); mm1 = _m_psradi(mm1,shift);// shift any 32 bits blocs of the word by the value shift mm0 = _m_paddd(mm0,mm1);// add the two 64 bits words 4 bytes by 4 bytes - // temp2 = mm0; - // printf("%d %d\n",((int *)&temp2)[0],((int *)&temp2)[1]); - - - // printb = (int16_t *)&mm2; - // printf("mm2 %d : %d %d %d %d\n",i,printb[0],printb[1],printb[2],printb[3]); - - mm2 = _m_psrawi(mm2,shift_DC); + // mm2 = _m_psrawi(mm2,shift_DC); mm3 = _m_paddw(mm3,mm2);// add the two 64 bits words 2 bytes by 2 bytes - - // printb = (int16_t *)&mm3; - // printf("mm3 %d : %d %d %d %d\n",i,printb[0],printb[1],printb[2],printb[3]); - } - /* - #ifdef MAIN - printb = (int16_t *)&mm3; - printf("%d %d %d %d\n",printb[0],printb[1],printb[2],printb[3]); - #endif - */ mm1 = mm0; - mm0 = _m_psrlqi(mm0,32); - mm0 = _m_paddd(mm0,mm1); - temp = _m_to_int(mm0); - temp/=length; temp<<=shift; // this is the average of x^2 @@ -132,25 +106,11 @@ int32_t signal_energy(int32_t *input,uint32_t length) mm2 = _m_psrlqi(mm3,32); mm2 = _m_paddw(mm2,mm3); - mm2 = _m_pmaddwd(mm2,mm2); - temp2 = _m_to_int(mm2); - temp2/=(length*length); - - temp2<<=(2*shift_DC); -#ifdef MAIN - printf("E x^2 = %d\n",temp); -#endif + // temp2<<=(2*shift_DC); temp -= temp2; -#ifdef MAIN - printf("(E x)^2=%d\n",temp2); -#endif - _mm_empty(); - _m_empty(); - - return((temp>0)?temp:1); } @@ -214,6 +174,81 @@ int32_t signal_energy_nodc(int32_t *input,uint32_t length) return((temp>0)?temp:1); } +#elif defined(__arm__) + +int32_t signal_energy(int32_t *input,uint32_t length) +{ + + int32_t i; + int32_t temp,temp2; + register int32x4_t tmpE,tmpDC; + int32x2_t tmpE2,tmpDC2; + int16x4_t *in = (int16x4_t *)input; + + tmpE = vdupq_n_s32(0); + tmpDC = vdupq_n_s32(0); + + for (i=0; i<length>>1; i++) { + + tmpE = vqaddq_s32(tmpE,vshrq_n_s32(vmull_s16(*in,*in),shift)); + tmpDC = vaddw_s16(tmpDC,vshr_n_s16(*in++,shift_DC)); + + } + + tmpE2 = vpadd_s32(vget_low_s32(tmpE),vget_high_s32(tmpE)); + + temp=(vget_lane_s32(tmpE2,0)+vget_lane_s32(tmpE2,1))/length; + temp<<=shift; // this is the average of x^2 + + // now remove the DC component + + + tmpDC2 = vpadd_s32(vget_low_s32(tmpDC),vget_high_s32(tmpDC)); + + temp2=(vget_lane_s32(tmpDC2,0)+vget_lane_s32(tmpDC2,1))/(length*length); + + // temp2<<=(2*shift_DC); +#ifdef MAIN + printf("E x^2 = %d\n",temp); +#endif + temp -= temp2; +#ifdef MAIN + printf("(E x)^2=%d\n",temp2); +#endif + + return((temp>0)?temp:1); +} + +int32_t signal_energy_nodc(int32_t *input,uint32_t length) +{ + + int32_t i; + int32_t temp; + register int32x4_t tmpE; + int32x2_t tmpE2; + int16x4_t *in = (int16x4_t *)input; + + tmpE = vdupq_n_s32(0); + + for (i=0; i<length>>1; i++) { + + tmpE = 
vqaddq_s32(tmpE,vshrq_n_s32(vmull_s16(*in,*in),shift)); + + } + + tmpE2 = vpadd_s32(vget_low_s32(tmpE),vget_high_s32(tmpE)); + + temp=(vget_lane_s32(tmpE2,0)+vget_lane_s32(tmpE2,1))/length; + temp<<=shift; // this is the average of x^2 + +#ifdef MAIN + printf("E x^2 = %d\n",temp); +#endif + + return((temp>0)?temp:1); +} + +#endif double signal_energy_fp(double **s_re,double **s_im,uint32_t nb_antennas,uint32_t length,uint32_t offset) { @@ -243,13 +278,6 @@ double signal_energy_fp2(struct complex *s,uint32_t length) return(V/length); } -#else - -int32_t signal_energy(int32_t *input,uint32_t length) -{ -} - -#endif #ifdef MAIN #define LENGTH 256 diff --git a/openair1/PHY/TOOLS/time_meas.c b/openair1/PHY/TOOLS/time_meas.c index 615b437deb..c734db09af 100644 --- a/openair1/PHY/TOOLS/time_meas.c +++ b/openair1/PHY/TOOLS/time_meas.c @@ -33,16 +33,7 @@ // global var for openair performance profiler int opp_enabled = 0; -/* - double get_cpu_freq_GHz(void) { - time_stats_t ts; - reset_meas(&ts); - start_meas(&ts); - sleep(1); - stop_meas(&ts); - return (double)ts.diff/1000000000; - }*/ double get_cpu_freq_GHz(void) { diff --git a/openair1/PHY/TOOLS/time_meas.h b/openair1/PHY/TOOLS/time_meas.h index a72754c77c..2bd6dc489d 100644 --- a/openair1/PHY/TOOLS/time_meas.h +++ b/openair1/PHY/TOOLS/time_meas.h @@ -26,15 +26,19 @@ Address : Eurecom, Campus SophiaTech, 450 Route des Chappes, CS 50193 - 06904 Biot Sophia Antipolis cedex, FRANCE *******************************************************************************/ -#ifdef OMP -#include <omp.h> -#endif #include <unistd.h> #include <math.h> +#include <stdint.h> +#include <time.h> +#include <errno.h> +#include <stdio.h> +#include <pthread.h> +#include <linux/kernel.h> +#include <linux/types.h> // global var to enable openair performance profiler extern int opp_enabled; - double cpu_freq_GHz; +#if defined(__x86_64__) || defined(__i386__) typedef struct { @@ -46,7 +50,18 @@ typedef struct { long long max; int trials; } time_stats_t; +#elif defined(__arm__) +typedef struct { + uint32_t in; + uint32_t diff_now; + uint32_t diff; + uint32_t p_time; /*!< \brief absolute process duration */ + uint32_t diff_square; /*!< \brief process duration square */ + uint32_t max; + int trials; +} time_stats_t; +#endif static inline void start_meas(time_stats_t *ts) __attribute__((always_inline)); static inline void stop_meas(time_stats_t *ts) __attribute__((always_inline)); @@ -74,12 +89,12 @@ static inline unsigned long long rdtsc_oai(void) } #elif defined(__arm__) -static inline unsigned long long rdtsc_oai(void) __attribute__((always_inline)); -static inline unsigned long long rdtsc_oai(void) +static inline uint32_t rdtsc_oai(void) __attribute__((always_inline)); +static inline uint32_t rdtsc_oai(void) { uint32_t r = 0; asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(r) ); - return (unsigned long long)r; + return r; } #endif @@ -88,17 +103,8 @@ static inline void start_meas(time_stats_t *ts) if (opp_enabled) { -#ifdef OMP - int tid; - - tid = omp_get_thread_num(); - - if (tid==0) -#endif - { ts->trials++; ts->in = rdtsc_oai(); - } } } @@ -108,24 +114,16 @@ static inline void stop_meas(time_stats_t *ts) if (opp_enabled) { long long out = rdtsc_oai(); -#ifdef OMP - int tid; - tid = omp_get_thread_num(); - - if (tid==0) -#endif - { ts->diff_now = (out-ts->in); ts->diff += (out-ts->in); /// process duration is the difference between two clock points ts->p_time = (out-ts->in); - ts->diff_square += pow((out-ts->in),2); + ts->diff_square += (out-ts->in)*(out-ts->in); if 
((out-ts->in) > ts->max) ts->max = out-ts->in; - } } } @@ -159,11 +157,3 @@ static inline void copy_meas(time_stats_t *dst_ts,time_stats_t *src_ts) dst_ts->max=src_ts->max; } } - -/*static inline double get_mean_meas_us(time_stats_t *ts, double cpu_freq_GHz) { - - return (double) ts->diff/ts->trials/cpu_freq_GHz/1000.0; - - } -*/ - diff --git a/openair1/PHY/TOOLS/vars.h b/openair1/PHY/TOOLS/vars.h index 7a5b283a8e..f6edc6bb34 100644 --- a/openair1/PHY/TOOLS/vars.h +++ b/openair1/PHY/TOOLS/vars.h @@ -25,4 +25,4 @@ Address : Eurecom, Campus SophiaTech, 450 Route des Chappes, CS 50193 - 06904 Biot Sophia Antipolis cedex, FRANCE - *******************************************************************************/ \ No newline at end of file + *******************************************************************************/ diff --git a/openair1/PHY/defs.h b/openair1/PHY/defs.h index 89fe4c2af2..e90f6ba435 100755 --- a/openair1/PHY/defs.h +++ b/openair1/PHY/defs.h @@ -65,28 +65,13 @@ //use msg in the real-time thread context #define msg_nrt printf //use msg_nrt in the non real-time context (for initialization, ...) -#ifdef EXPRESSMIMO_TARGET -#define malloc16(x) malloc(x) -#else //EXPRESSMIMO_TARGET #define malloc16(x) memalign(16,x) -#endif //EXPRESSMIMO_TARGET #define free16(y,x) free(y) #define bigmalloc malloc #define bigmalloc16 malloc16 #define openair_free(y,x) free((y)) #define PAGE_SIZE 4096 -#ifdef EXPRESSMIMO_TARGET -//! \brief Allocate \c size bytes of memory on the heap and zero it afterwards. -//! If no more memory is available, this function will terminate the program with an assertion error. -static inline void* malloc16_clear( size_t size ) -{ - void* ptr = malloc(size); - DevAssert(ptr); - memset( ptr, 0, size ); - return ptr; -} -#else //EXPRESSMIMO_TARGET //! \brief Allocate \c size bytes of memory on the heap with alignment 16 and zero it afterwards. //! If no more memory is available, this function will terminate the program with an assertion error. 
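
/*
 * Note on the signal_energy() NEON code earlier in this patch: the x86 path
 * comments out shift_DC together with its #define, but the new ARM path
 * still evaluates vshr_n_s16(*in++, shift_DC) with the macro no longer
 * defined, so it cannot compile as posted. NEON also restricts the immediate
 * of vshr_n_s16() to the range 1..16, so restoring "#define shift_DC 0"
 * would not help. A minimal repair, assuming the intended DC pre-shift stays
 * 0, folds the shift away (sketch; the helper name is illustrative, not
 * shipped code):
 */
#include <arm_neon.h>
static inline int32x4_t acc_dc_step(int32x4_t tmpDC, int16x4_t s)
{
  return vaddw_s16(tmpDC, s);   /* widening add of four Q15 samples, no shift */
}
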
static inline void* malloc16_clear( size_t size ) @@ -96,7 +81,7 @@ static inline void* malloc16_clear( size_t size ) memset( ptr, 0, size ); return ptr; } -#endif //EXPRESSMIMO_TARGET + #define PAGE_MASK 0xfffff000 @@ -119,10 +104,6 @@ static inline void* malloc16_clear( size_t size ) /// suppress compiler warning for unused arguments #define UNUSED(x) (void)x; -#ifdef EXPRESSMIMO_TARGET -#define Zero_Buffer(x,y) Zero_Buffer_nommx(x,y) -#endif //EXPRESSMiMO_TARGET - #include "spec_defs_top.h" #include "impl_defs_top.h" diff --git a/openair1/SIMULATION/LTE_PHY/dlsim.c b/openair1/SIMULATION/LTE_PHY/dlsim.c index 5e6929e480..a9fb0fe94c 100644 --- a/openair1/SIMULATION/LTE_PHY/dlsim.c +++ b/openair1/SIMULATION/LTE_PHY/dlsim.c @@ -233,7 +233,7 @@ void do_OFDM_mod_l(mod_sym_t **txdataF, int32_t **txdata, uint16_t next_slot, LT int main(int argc, char **argv) { - char c; + int c; int k,i,aa,aarx,aatx; int s,Kr,Kr_bytes; @@ -347,12 +347,24 @@ int main(int argc, char **argv) LTE_DL_UE_HARQ_t *dlsch0_ue_harq; LTE_DL_eNB_HARQ_t *dlsch0_eNB_harq; uint8_t Kmimo; - + FILE *proc_fd = NULL; + char buf[64]; opp_enabled=1; // to enable the time meas - cpu_freq_GHz = (double)get_cpu_freq_GHz(); - +#if defined(__arm__) + proc_fd = fopen("/sys/devices/system/cpu/cpu4/cpufreq/cpuinfo_cur_freq", "r"); + if(!proc_fd) + printf("cannot open /sys/devices/system/cpu/cpu4/cpufreq/cpuinfo_cur_freq"); + else { + while(fgets(buf, 63, proc_fd)) + printf("%s", buf); + } + fclose(proc_fd); + cpu_freq_GHz = ((double)atof(buf))/1e6; +#else + cpu_freq_GHz = get_cpu_freq_GHz(); +#endif printf("Detected cpu_freq %f GHz\n",cpu_freq_GHz); //signal(SIGSEGV, handler); @@ -1989,7 +2001,8 @@ int main(int argc, char **argv) if (input_trch_file==0) { for (i=0; i<input_buffer_length0; i++) { - input_buffer0[k][i]= (unsigned char)(taus()&0xff); + //input_buffer0[k][i] = (unsigned char)(i&0xff); + input_buffer0[k][i] = (unsigned char)(taus()&0xff); } for (i=0; i<input_buffer_length1; i++) { @@ -2690,7 +2703,6 @@ PMI_FEEDBACK: write_output("txsigF1.m","txsF1", &PHY_vars_eNB->lte_eNB_common_vars.txdataF[eNB_id][1][subframe*nsymb*PHY_vars_eNB->lte_frame_parms.ofdm_symbol_size], nsymb*PHY_vars_eNB->lte_frame_parms.ofdm_symbol_size,1,1); } - tx_lev = 0; for (aa=0; aa<PHY_vars_eNB->lte_frame_parms.nb_antennas_tx; aa++) { diff --git a/openair1/SIMULATION/TOOLS/multipath_channel.c b/openair1/SIMULATION/TOOLS/multipath_channel.c index 65c8737c81..7a21db2a57 100644 --- a/openair1/SIMULATION/TOOLS/multipath_channel.c +++ b/openair1/SIMULATION/TOOLS/multipath_channel.c @@ -1,222 +1,222 @@ -/******************************************************************************* - OpenAirInterface - Copyright(c) 1999 - 2014 Eurecom - - OpenAirInterface is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - - OpenAirInterface is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with OpenAirInterface.The full GNU General Public License is - included in this distribution in the file called "COPYING". If not, - see <http://www.gnu.org/licenses/>. 
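
/*
 * Note on the dlsim.c hunk above: the ARM branch derives cpu_freq_GHz from
 * /sys/devices/system/cpu/cpu4/cpufreq/cpuinfo_cur_freq, but fclose(proc_fd)
 * is reached even when fopen() failed, and buf is then passed uninitialized
 * to atof(). A guarded variant (sketch; the helper name is hypothetical):
 */
#include <stdio.h>
#include <stdlib.h>
static double arm_cpu_freq_GHz(void)
{
  char buf[64] = "0";
  FILE *fd = fopen("/sys/devices/system/cpu/cpu4/cpufreq/cpuinfo_cur_freq", "r");
  if (fd == NULL) {
    printf("cannot open cpuinfo_cur_freq\n");
    return 0.0;
  }
  while (fgets(buf, sizeof(buf), fd) != NULL)
    printf("%s", buf);
  fclose(fd);               /* closed only when the open succeeded */
  return atof(buf) / 1e6;   /* sysfs reports kHz; kHz / 1e6 = GHz */
}
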
- - Contact Information - OpenAirInterface Admin: openair_admin@eurecom.fr - OpenAirInterface Tech : openair_tech@eurecom.fr - OpenAirInterface Dev : openair4g-devel@eurecom.fr - - Address : Eurecom, Campus SophiaTech, 450 Route des Chappes, CS 50193 - 06904 Biot Sophia Antipolis cedex, FRANCE - - *******************************************************************************/ -#include <stdio.h> -#include <stdlib.h> -#include <math.h> -#include <string.h> -#include "defs.h" -#include "SIMULATION/RF/defs.h" - -//#define DEBUG_CH -uint8_t multipath_channel_nosigconv(channel_desc_t *desc) -{ - - random_channel(desc,0); - return(1); -} - -#define CHANNEL_SSE -#ifdef CHANNEL_SSE -void multipath_channel(channel_desc_t *desc, - double **tx_sig_re, - double **tx_sig_im, - double **rx_sig_re, - double **rx_sig_im, - uint32_t length, - uint8_t keep_channel) -{ - - int i,ii,j,l; - int length1, length2, tail; - __m128d rx_tmp128_re_f,rx_tmp128_im_f,rx_tmp128_re,rx_tmp128_im, rx_tmp128_1,rx_tmp128_2,rx_tmp128_3,rx_tmp128_4,tx128_re,tx128_im,ch128_x,ch128_y,pathloss128; - - double path_loss = pow(10,desc->path_loss_dB/20); - int dd = abs(desc->channel_offset); - - pathloss128 = _mm_set1_pd(path_loss); - -#ifdef DEBUG_CH - printf("[CHANNEL] keep = %d : path_loss = %g (%f), nb_rx %d, nb_tx %d, dd %d, len %d \n",keep_channel,path_loss,desc->path_loss_dB,desc->nb_rx,desc->nb_tx,dd,desc->channel_length); -#endif - - if (keep_channel) { - // do nothing - keep channel - } else { - random_channel(desc,0); - } - - start_meas(&desc->convolution); - -#ifdef DEBUG_CH - - for (l = 0; l<(int)desc->channel_length; l++) { - printf("%p (%f,%f) ",desc->ch[0],desc->ch[0][l].x,desc->ch[0][l].y); - } - - printf("\n"); -#endif - - tail = ((int)length-dd)%2; - - if(tail) - length1 = ((int)length-dd)-1; - else - length1 = ((int)length-dd); - - length2 = length1/2; - - for (i=0; i<length2; i++) { // - for (ii=0; ii<desc->nb_rx; ii++) { - // rx_tmp.x = 0; - // rx_tmp.y = 0; - rx_tmp128_re_f = _mm_setzero_pd(); - rx_tmp128_im_f = _mm_setzero_pd(); - - for (j=0; j<desc->nb_tx; j++) { - for (l = 0; l<(int)desc->channel_length; l++) { - if ((i>=0) && (i-l)>=0) { //SIMD correct only if length1 > 2*channel_length...which is almost always satisfied - // tx.x = tx_sig_re[j][i-l]; - // tx.y = tx_sig_im[j][i-l]; - tx128_re = _mm_loadu_pd(&tx_sig_re[j][2*i-l]); // tx_sig_re[j][i-l+1], tx_sig_re[j][i-l] - tx128_im = _mm_loadu_pd(&tx_sig_im[j][2*i-l]); - } else { - //tx.x =0; - //tx.y =0; - tx128_re = _mm_setzero_pd(); - tx128_im = _mm_setzero_pd(); - } - - ch128_x = _mm_set1_pd(desc->ch[ii+(j*desc->nb_rx)][l].x); - ch128_y = _mm_set1_pd(desc->ch[ii+(j*desc->nb_rx)][l].y); - // rx_tmp.x += (tx.x * desc->ch[ii+(j*desc->nb_rx)][l].x) - (tx.y * desc->ch[ii+(j*desc->nb_rx)][l].y); - // rx_tmp.y += (tx.y * desc->ch[ii+(j*desc->nb_rx)][l].x) + (tx.x * desc->ch[ii+(j*desc->nb_rx)][l].y); - rx_tmp128_1 = _mm_mul_pd(tx128_re,ch128_x); - rx_tmp128_2 = _mm_mul_pd(tx128_re,ch128_y); - rx_tmp128_3 = _mm_mul_pd(tx128_im,ch128_x); - rx_tmp128_4 = _mm_mul_pd(tx128_im,ch128_y); - rx_tmp128_re = _mm_sub_pd(rx_tmp128_1,rx_tmp128_4); - rx_tmp128_im = _mm_add_pd(rx_tmp128_2,rx_tmp128_3); - rx_tmp128_re_f = _mm_add_pd(rx_tmp128_re_f,rx_tmp128_re); - rx_tmp128_im_f = _mm_add_pd(rx_tmp128_im_f,rx_tmp128_im); - } //l - } // j - - //rx_sig_re[ii][i+dd] = rx_tmp.x*path_loss; - //rx_sig_im[ii][i+dd] = rx_tmp.y*path_loss; - rx_tmp128_re_f = _mm_mul_pd(rx_tmp128_re_f,pathloss128); - rx_tmp128_im_f = _mm_mul_pd(rx_tmp128_im_f,pathloss128); - 
_mm_storeu_pd(&rx_sig_re[ii][2*i+dd],rx_tmp128_re_f); // max index: length-dd -1 + dd = length -1 - _mm_storeu_pd(&rx_sig_im[ii][2*i+dd],rx_tmp128_im_f); - /* - if ((ii==0)&&((i%32)==0)) { - printf("%p %p %f,%f => %e,%e\n",rx_sig_re[ii],rx_sig_im[ii],rx_tmp.x,rx_tmp.y,rx_sig_re[ii][i-dd],rx_sig_im[ii][i-dd]); - } - */ - //rx_sig_re[ii][i] = sqrt(.5)*(tx_sig_re[0][i] + tx_sig_re[1][i]); - //rx_sig_im[ii][i] = sqrt(.5)*(tx_sig_im[0][i] + tx_sig_im[1][i]); - - } // ii - } // i - - stop_meas(&desc->convolution); - -} - -#else -void multipath_channel(channel_desc_t *desc, - double **tx_sig_re, - double **tx_sig_im, - double **rx_sig_re, - double **rx_sig_im, - uint32_t length, - uint8_t keep_channel) -{ - - int i,ii,j,l; - struct complex rx_tmp,tx; - - double path_loss = pow(10,desc->path_loss_dB/20); - int dd; - dd = abs(desc->channel_offset); - -#ifdef DEBUG_CH - printf("[CHANNEL] keep = %d : path_loss = %g (%f), nb_rx %d, nb_tx %d, dd %d, len %d \n",keep_channel,path_loss,desc->path_loss_dB,desc->nb_rx,desc->nb_tx,dd,desc->channel_length); -#endif - - if (keep_channel) { - // do nothing - keep channel - } else { - random_channel(desc,0); - } - -#ifdef DEBUG_CH - - for (l = 0; l<(int)desc->channel_length; l++) { - printf("%p (%f,%f) ",desc->ch[0],desc->ch[0][l].x,desc->ch[0][l].y); - } - - printf("\n"); -#endif - - for (i=0; i<((int)length-dd); i++) { - for (ii=0; ii<desc->nb_rx; ii++) { - rx_tmp.x = 0; - rx_tmp.y = 0; - - for (j=0; j<desc->nb_tx; j++) { - for (l = 0; l<(int)desc->channel_length; l++) { - if ((i>=0) && (i-l)>=0) { - tx.x = tx_sig_re[j][i-l]; - tx.y = tx_sig_im[j][i-l]; - } else { - tx.x =0; - tx.y =0; - } - - rx_tmp.x += (tx.x * desc->ch[ii+(j*desc->nb_rx)][l].x) - (tx.y * desc->ch[ii+(j*desc->nb_rx)][l].y); - rx_tmp.y += (tx.y * desc->ch[ii+(j*desc->nb_rx)][l].x) + (tx.x * desc->ch[ii+(j*desc->nb_rx)][l].y); - } //l - } // j - - rx_sig_re[ii][i+dd] = rx_tmp.x*path_loss; - rx_sig_im[ii][i+dd] = rx_tmp.y*path_loss; - /* - if ((ii==0)&&((i%32)==0)) { - printf("%p %p %f,%f => %e,%e\n",rx_sig_re[ii],rx_sig_im[ii],rx_tmp.x,rx_tmp.y,rx_sig_re[ii][i-dd],rx_sig_im[ii][i-dd]); - } - */ - //rx_sig_re[ii][i] = sqrt(.5)*(tx_sig_re[0][i] + tx_sig_re[1][i]); - //rx_sig_im[ii][i] = sqrt(.5)*(tx_sig_im[0][i] + tx_sig_im[1][i]); - - } // ii - } // i -} -#endif - - +/******************************************************************************* + OpenAirInterface + Copyright(c) 1999 - 2014 Eurecom + + OpenAirInterface is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + + OpenAirInterface is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OpenAirInterface.The full GNU General Public License is + included in this distribution in the file called "COPYING". If not, + see <http://www.gnu.org/licenses/>. 
+ + Contact Information + OpenAirInterface Admin: openair_admin@eurecom.fr + OpenAirInterface Tech : openair_tech@eurecom.fr + OpenAirInterface Dev : openair4g-devel@eurecom.fr + + Address : Eurecom, Campus SophiaTech, 450 Route des Chappes, CS 50193 - 06904 Biot Sophia Antipolis cedex, FRANCE + + *******************************************************************************/ +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <string.h> +#include "defs.h" +#include "SIMULATION/RF/defs.h" + +//#define DEBUG_CH +uint8_t multipath_channel_nosigconv(channel_desc_t *desc) +{ + + random_channel(desc,0); + return(1); +} + +//#define CHANNEL_SSE +#ifdef CHANNEL_SSE +void multipath_channel(channel_desc_t *desc, + double **tx_sig_re, + double **tx_sig_im, + double **rx_sig_re, + double **rx_sig_im, + uint32_t length, + uint8_t keep_channel) +{ + + int i,ii,j,l; + int length1, length2, tail; + __m128d rx_tmp128_re_f,rx_tmp128_im_f,rx_tmp128_re,rx_tmp128_im, rx_tmp128_1,rx_tmp128_2,rx_tmp128_3,rx_tmp128_4,tx128_re,tx128_im,ch128_x,ch128_y,pathloss128; + + double path_loss = pow(10,desc->path_loss_dB/20); + int dd = abs(desc->channel_offset); + + pathloss128 = _mm_set1_pd(path_loss); + +#ifdef DEBUG_CH + printf("[CHANNEL] keep = %d : path_loss = %g (%f), nb_rx %d, nb_tx %d, dd %d, len %d \n",keep_channel,path_loss,desc->path_loss_dB,desc->nb_rx,desc->nb_tx,dd,desc->channel_length); +#endif + + if (keep_channel) { + // do nothing - keep channel + } else { + random_channel(desc,0); + } + + start_meas(&desc->convolution); + +#ifdef DEBUG_CH + + for (l = 0; l<(int)desc->channel_length; l++) { + printf("%p (%f,%f) ",desc->ch[0],desc->ch[0][l].x,desc->ch[0][l].y); + } + + printf("\n"); +#endif + + tail = ((int)length-dd)%2; + + if(tail) + length1 = ((int)length-dd)-1; + else + length1 = ((int)length-dd); + + length2 = length1/2; + + for (i=0; i<length2; i++) { // + for (ii=0; ii<desc->nb_rx; ii++) { + // rx_tmp.x = 0; + // rx_tmp.y = 0; + rx_tmp128_re_f = _mm_setzero_pd(); + rx_tmp128_im_f = _mm_setzero_pd(); + + for (j=0; j<desc->nb_tx; j++) { + for (l = 0; l<(int)desc->channel_length; l++) { + if ((i>=0) && (i-l)>=0) { //SIMD correct only if length1 > 2*channel_length...which is almost always satisfied + // tx.x = tx_sig_re[j][i-l]; + // tx.y = tx_sig_im[j][i-l]; + tx128_re = _mm_loadu_pd(&tx_sig_re[j][2*i-l]); // tx_sig_re[j][i-l+1], tx_sig_re[j][i-l] + tx128_im = _mm_loadu_pd(&tx_sig_im[j][2*i-l]); + } else { + //tx.x =0; + //tx.y =0; + tx128_re = _mm_setzero_pd(); + tx128_im = _mm_setzero_pd(); + } + + ch128_x = _mm_set1_pd(desc->ch[ii+(j*desc->nb_rx)][l].x); + ch128_y = _mm_set1_pd(desc->ch[ii+(j*desc->nb_rx)][l].y); + // rx_tmp.x += (tx.x * desc->ch[ii+(j*desc->nb_rx)][l].x) - (tx.y * desc->ch[ii+(j*desc->nb_rx)][l].y); + // rx_tmp.y += (tx.y * desc->ch[ii+(j*desc->nb_rx)][l].x) + (tx.x * desc->ch[ii+(j*desc->nb_rx)][l].y); + rx_tmp128_1 = _mm_mul_pd(tx128_re,ch128_x); + rx_tmp128_2 = _mm_mul_pd(tx128_re,ch128_y); + rx_tmp128_3 = _mm_mul_pd(tx128_im,ch128_x); + rx_tmp128_4 = _mm_mul_pd(tx128_im,ch128_y); + rx_tmp128_re = _mm_sub_pd(rx_tmp128_1,rx_tmp128_4); + rx_tmp128_im = _mm_add_pd(rx_tmp128_2,rx_tmp128_3); + rx_tmp128_re_f = _mm_add_pd(rx_tmp128_re_f,rx_tmp128_re); + rx_tmp128_im_f = _mm_add_pd(rx_tmp128_im_f,rx_tmp128_im); + } //l + } // j + + //rx_sig_re[ii][i+dd] = rx_tmp.x*path_loss; + //rx_sig_im[ii][i+dd] = rx_tmp.y*path_loss; + rx_tmp128_re_f = _mm_mul_pd(rx_tmp128_re_f,pathloss128); + rx_tmp128_im_f = _mm_mul_pd(rx_tmp128_im_f,pathloss128); + 
_mm_storeu_pd(&rx_sig_re[ii][2*i+dd],rx_tmp128_re_f); // max index: length-dd -1 + dd = length -1 + _mm_storeu_pd(&rx_sig_im[ii][2*i+dd],rx_tmp128_im_f); + /* + if ((ii==0)&&((i%32)==0)) { + printf("%p %p %f,%f => %e,%e\n",rx_sig_re[ii],rx_sig_im[ii],rx_tmp.x,rx_tmp.y,rx_sig_re[ii][i-dd],rx_sig_im[ii][i-dd]); + } + */ + //rx_sig_re[ii][i] = sqrt(.5)*(tx_sig_re[0][i] + tx_sig_re[1][i]); + //rx_sig_im[ii][i] = sqrt(.5)*(tx_sig_im[0][i] + tx_sig_im[1][i]); + + } // ii + } // i + + stop_meas(&desc->convolution); + +} + +#else +void multipath_channel(channel_desc_t *desc, + double **tx_sig_re, + double **tx_sig_im, + double **rx_sig_re, + double **rx_sig_im, + uint32_t length, + uint8_t keep_channel) +{ + + int i,ii,j,l; + struct complex rx_tmp,tx; + + double path_loss = pow(10,desc->path_loss_dB/20); + int dd; + dd = abs(desc->channel_offset); + +#ifdef DEBUG_CH + printf("[CHANNEL] keep = %d : path_loss = %g (%f), nb_rx %d, nb_tx %d, dd %d, len %d \n",keep_channel,path_loss,desc->path_loss_dB,desc->nb_rx,desc->nb_tx,dd,desc->channel_length); +#endif + + if (keep_channel) { + // do nothing - keep channel + } else { + random_channel(desc,0); + } + +#ifdef DEBUG_CH + + for (l = 0; l<(int)desc->channel_length; l++) { + printf("%p (%f,%f) ",desc->ch[0],desc->ch[0][l].x,desc->ch[0][l].y); + } + + printf("\n"); +#endif + + for (i=0; i<((int)length-dd); i++) { + for (ii=0; ii<desc->nb_rx; ii++) { + rx_tmp.x = 0; + rx_tmp.y = 0; + + for (j=0; j<desc->nb_tx; j++) { + for (l = 0; l<(int)desc->channel_length; l++) { + if ((i>=0) && (i-l)>=0) { + tx.x = tx_sig_re[j][i-l]; + tx.y = tx_sig_im[j][i-l]; + } else { + tx.x =0; + tx.y =0; + } + + rx_tmp.x += (tx.x * desc->ch[ii+(j*desc->nb_rx)][l].x) - (tx.y * desc->ch[ii+(j*desc->nb_rx)][l].y); + rx_tmp.y += (tx.y * desc->ch[ii+(j*desc->nb_rx)][l].x) + (tx.x * desc->ch[ii+(j*desc->nb_rx)][l].y); + } //l + } // j + + rx_sig_re[ii][i+dd] = rx_tmp.x*path_loss; + rx_sig_im[ii][i+dd] = rx_tmp.y*path_loss; + /* + if ((ii==0)&&((i%32)==0)) { + printf("%p %p %f,%f => %e,%e\n",rx_sig_re[ii],rx_sig_im[ii],rx_tmp.x,rx_tmp.y,rx_sig_re[ii][i-dd],rx_sig_im[ii][i-dd]); + } + */ + //rx_sig_re[ii][i] = sqrt(.5)*(tx_sig_re[0][i] + tx_sig_re[1][i]); + //rx_sig_im[ii][i] = sqrt(.5)*(tx_sig_im[0][i] + tx_sig_im[1][i]); + + } // ii + } // i +} +#endif + + -- GitLab
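
/*
 * Closing note: the multipath_channel.c diff above deletes and re-adds the
 * whole file with essentially identical content (apparently a
 * whitespace/line-ending rewrite); the single functional change is
 *     -#define CHANNEL_SSE
 *     +//#define CHANNEL_SSE
 * which selects the portable scalar convolution instead of the SSE2 __m128d
 * version, since this patch provides no double-precision NEON port. The
 * scalar path's inner step is the complex multiply-accumulate sketched below
 * (illustrative only; the real loop lives in multipath_channel() above, and
 * cplx_d merely mirrors the code's "struct complex" with .x/.y members):
 */
typedef struct { double x, y; } cplx_d;
static cplx_d cmac(cplx_d acc, cplx_d tx, cplx_d ch)
{
  /* acc += tx * ch, expanded over real/imaginary parts */
  acc.x += tx.x * ch.x - tx.y * ch.y;
  acc.y += tx.y * ch.x + tx.x * ch.y;
  return acc;
}
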