From 8c5e8126df86d625a3ec448542a24380e30c9b7d Mon Sep 17 00:00:00 2001 From: Florian Kaltenberger <florian.kaltenberger@eurecom.fr> Date: Mon, 8 Jun 2015 12:45:16 +0000 Subject: [PATCH] Added support for ARM NEON, lots of changes in openair1 and some in cmake_targets git-svn-id: http://svn.eurecom.fr/openair4G/trunk@7543 818b1a75-f10b-46b9-bf7c-635c3b92a50f --- cmake_targets/CMakeLists.txt | 9 +- cmake_targets/lte-simulators/CMakeLists.txt | 1 + openair1/PHY/CODING/3gpplte.c | 5 +- openair1/PHY/CODING/3gpplte_sse.c | 880 +++--- .../CODING/3gpplte_turbo_decoder_sse_16bit.c | 497 +++- .../CODING/3gpplte_turbo_decoder_sse_8bit.c | 594 +++- openair1/PHY/CODING/Makefile | 24 +- openair1/PHY/CODING/ccoding_byte_lte.c | 35 +- openair1/PHY/CODING/defs.h | 4 +- openair1/PHY/CODING/viterbi.c | 220 +- openair1/PHY/CODING/viterbi_lte.c | 264 +- openair1/PHY/LTE_ESTIMATION/filt96_32.h | 112 +- .../PHY/LTE_ESTIMATION/freq_equalization.c | 22 +- .../lte_dl_channel_estimation.c | 133 +- .../PHY/LTE_ESTIMATION/lte_est_freq_offset.c | 31 +- .../PHY/LTE_ESTIMATION/lte_sync_timefreq.c | 8 +- .../PHY/LTE_ESTIMATION/lte_ue_measurements.c | 90 +- .../lte_ul_channel_estimation.c | 92 +- openair1/PHY/LTE_REFSIG/lte_dl_cell_spec.c | 4 +- openair1/PHY/LTE_TRANSPORT/dci.c | 152 +- openair1/PHY/LTE_TRANSPORT/defs.h | 2 +- openair1/PHY/LTE_TRANSPORT/dlsch_coding.c | 18 +- .../PHY/LTE_TRANSPORT/dlsch_demodulation.c | 1274 ++++---- .../PHY/LTE_TRANSPORT/dlsch_llr_computation.c | 2291 +++++++------- openair1/PHY/LTE_TRANSPORT/dlsch_modulation.c | 4 +- openair1/PHY/LTE_TRANSPORT/pbch.c | 66 +- openair1/PHY/LTE_TRANSPORT/pmch.c | 173 +- openair1/PHY/LTE_TRANSPORT/prach.c | 5 +- openair1/PHY/LTE_TRANSPORT/proto.h | 4 +- .../PHY/LTE_TRANSPORT/ulsch_demodulation.c | 428 ++- openair1/PHY/LTE_TRANSPORT/ulsch_modulation.c | 23 +- openair1/PHY/MODULATION/slot_fep.c | 9 +- openair1/PHY/MODULATION/ul_7_5_kHz.c | 446 +-- openair1/PHY/TOOLS/cdot_prod.c | 49 +- openair1/PHY/TOOLS/cmult_sv.c | 355 +-- openair1/PHY/TOOLS/cmult_vv.c | 1787 +---------- openair1/PHY/TOOLS/defs.h | 47 +- openair1/PHY/TOOLS/lte_dfts.c | 2647 ++++++++++------- openair1/PHY/TOOLS/signal_energy.c | 132 +- openair1/PHY/TOOLS/time_meas.c | 9 - openair1/PHY/TOOLS/time_meas.h | 56 +- openair1/PHY/TOOLS/vars.h | 2 +- openair1/PHY/defs.h | 21 +- openair1/SIMULATION/LTE_PHY/dlsim.c | 24 +- openair1/SIMULATION/TOOLS/multipath_channel.c | 444 +-- 45 files changed, 6993 insertions(+), 6500 deletions(-) diff --git a/cmake_targets/CMakeLists.txt b/cmake_targets/CMakeLists.txt index ebe5c52243..9537477913 100644 --- a/cmake_targets/CMakeLists.txt +++ b/cmake_targets/CMakeLists.txt @@ -126,7 +126,7 @@ add_list_string_option(CMAKE_BUILD_TYPE "RelWithDebInfo" "Choose the type of bui Message("Architecture is ${CMAKE_SYSTEM_PROCESSOR}") if (CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7l") - set(C_FLAGS_PROCESSOR "-mfloat-abi=softfp -mfpu=neon") + set(C_FLAGS_PROCESSOR "-gdwarf-2 -mfloat-abi=hard -mfpu=neon -lgcc -lrt") else (CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7l") set(C_FLAGS_PROCESSOR "-msse4.2") endif() @@ -140,8 +140,8 @@ set(CMAKE_C_FLAGS # set a flag for changes in the source code # these changes are related to hardcoded path to include .h files add_definitions(-DCMAKER) -set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS} -ggdb -DMALLOC_CHECK_=3") -set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS} -ggdb -DMALLOC_CHECK_=3 -O2") +set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS} -g -DMALLOC_CHECK_=3") +set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS} -g -DMALLOC_CHECK_=3 -O2") # Below has been 
put in comment because does not work with # SVN authentication. @@ -778,7 +778,6 @@ set(PHY_SRC ${OPENAIR1_DIR}/PHY/TOOLS/log2_approx.c ${OPENAIR1_DIR}/PHY/TOOLS/cmult_sv.c ${OPENAIR1_DIR}/PHY/TOOLS/cmult_vv.c - ${OPENAIR1_DIR}/PHY/TOOLS/cadd_vv.c ${OPENAIR1_DIR}/PHY/TOOLS/cdot_prod.c ${OPENAIR1_DIR}/PHY/TOOLS/signal_energy.c ${OPENAIR1_DIR}/PHY/TOOLS/dB_routines.c @@ -1692,7 +1691,7 @@ foreach(myExe dlsim ulsim pbchsim scansim mbmssim pdcchsim pucchsim prachsim syn ${XFORMS_SOURCE} ) target_link_libraries (${myExe} - -Wl,--start-group SIMU UTIL SCHED_LIB PHY LFDS MSC ${ITTI_LIB} -Wl,--end-group + -Wl,--start-group SIMU UTIL SCHED_LIB PHY LFDS ${ITTI_LIB} -Wl,--end-group pthread m rt ${CONFIG_LIBRARIES} ${ATLAS_LIBRARIES} ${XFORMS_LIBRARIES} ) endforeach(myExe) diff --git a/cmake_targets/lte-simulators/CMakeLists.txt b/cmake_targets/lte-simulators/CMakeLists.txt index 7e38c9d594..50a473dd4b 100644 --- a/cmake_targets/lte-simulators/CMakeLists.txt +++ b/cmake_targets/lte-simulators/CMakeLists.txt @@ -10,5 +10,6 @@ set(RANDOM_BF False) set(PBS_SIM False) set(PERFECT_CE False) set(NAS_UE False) +set(MESSAGE_CHART_GENERATOR False) include(${CMAKE_CURRENT_SOURCE_DIR}/../CMakeLists.txt) diff --git a/openair1/PHY/CODING/3gpplte.c b/openair1/PHY/CODING/3gpplte.c index 0f7dbf7468..fd33bf999c 100644 --- a/openair1/PHY/CODING/3gpplte.c +++ b/openair1/PHY/CODING/3gpplte.c @@ -31,8 +31,9 @@ author: raymond.knopp@eurecom.fr date: 10.2009 */ -#include "defs.h" -//#include "lte_interleaver_inline.h" +#ifndef TC_MAIN +//#include "defs.h" +#endif #include "extern_3GPPinterleaver.h" diff --git a/openair1/PHY/CODING/3gpplte_sse.c b/openair1/PHY/CODING/3gpplte_sse.c index 850a8ecb5f..c66a42016d 100755 --- a/openair1/PHY/CODING/3gpplte_sse.c +++ b/openair1/PHY/CODING/3gpplte_sse.c @@ -1,349 +1,531 @@ -/******************************************************************************* - OpenAirInterface - Copyright(c) 1999 - 2014 Eurecom - - OpenAirInterface is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - - OpenAirInterface is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with OpenAirInterface.The full GNU General Public License is - included in this distribution in the file called "COPYING". If not, - see <http://www.gnu.org/licenses/>. 
- - Contact Information - OpenAirInterface Admin: openair_admin@eurecom.fr - OpenAirInterface Tech : openair_tech@eurecom.fr - OpenAirInterface Dev : openair4g-devel@eurecom.fr - - Address : Eurecom, Campus SophiaTech, 450 Route des Chappes, CS 50193 - 06904 Biot Sophia Antipolis cedex, FRANCE - - *******************************************************************************/ -/* file: 3gpplte_sse.c - purpose: Encoding routines for implementing Turbo-coded (DLSCH) transport channels from 36-212, V8.6 2009-03 - author: Laurent Thomas - maintainer: raymond.knopp@eurecom.fr - date: 09.2012 -*/ -#include "defs.h" -#include "extern_3GPPinterleaver.h" -#include <stdlib.h> - -#include "PHY/sse_intrin.h" - -//#define DEBUG_TURBO_ENCODER 1 -#define CALLGRIND 1 -unsigned short threegpplte_interleaver_output; -unsigned long long threegpplte_interleaver_tmp; - -struct treillis { - union { - __m64 systematic_64[3]; - char systematic_8[24]; - }; - union { - __m64 parity1_64[3]; - char parity1_8[24]; - }; - union { - __m64 parity2_64[3]; - char parity2_8[24]; - }; - int exit_state; -} __attribute__ ((aligned(64))); - -struct treillis all_treillis[8][256]; -int all_treillis_initialized=0; - -static inline unsigned char threegpplte_rsc(unsigned char input,unsigned char *state) -{ - unsigned char output; - output = (input ^ (*state>>2) ^ (*state>>1))&1; - *state = (((input<<2)^(*state>>1))^((*state>>1)<<2)^((*state)<<2))&7; - return(output); -} - -static inline void threegpplte_rsc_termination(unsigned char *x,unsigned char *z,unsigned char *state) -{ - *z = ((*state>>2) ^ (*state)) &1; - *x = ((*state) ^ (*state>>1)) &1; - *state = (*state)>>1; -} - -void treillis_table_init(void) -{ - //struct treillis t[][]=all_treillis; - //t=memalign(16,sizeof(struct treillis)*8*256); - int i, j,b; - unsigned char v, current_state; - - // clear all_treillis - for (i=0; i<8; i++) - bzero( all_treillis[i], sizeof(all_treillis[0]) ); - - for (i=0; i<8; i++) { //all possible initial states - for (j=0; j<=255; j++) { // all possible values of a byte - current_state=i; - - for (b=0; b<8 ; b++ ) { // pre-compute the image of the byte j in _m128i vector right place - all_treillis[i][j].systematic_8[b*3]= (j&(1<<(7-b)))>>(7-b); - v=threegpplte_rsc( all_treillis[i][j].systematic_8[b*3] , - ¤t_state); - all_treillis[i][j].parity1_8[b*3+1]=v; // for the yparity1 - all_treillis[i][j].parity2_8[b*3+2]=v; // for the yparity2 - } - - all_treillis[i][j].exit_state=current_state; - } - } - - all_treillis_initialized=1; - return ; -} - - -char interleave_compact_byte(short * base_interleaver,unsigned char * input, unsigned char * output, int n) -{ - - char expandInput[768*8] __attribute__((aligned(16))); - int i,loop=n>>4; - __m128i *i_128=(__m128i *)input, *o_128=(__m128i*)expandInput; - __m128i tmp1, tmp2, tmp3, tmp4; - __m128i BIT_MASK = _mm_set_epi8( 0b00000001, - 0b00000010, - 0b00000100, - 0b00001000, - 0b00010000, - 0b00100000, - 0b01000000, - 0b10000000, - 0b00000001, - 0b00000010, - 0b00000100, - 0b00001000, - 0b00010000, - 0b00100000, - 0b01000000, - 0b10000000); - - if ((n&15) > 0) - loop++; - - for (i=0; i<loop ; i++ ) { - /* int cur_byte=i<<3; */ - /* for (b=0;b<8;b++) */ - /* expandInput[cur_byte+b] = (input[i]&(1<<(7-b)))>>(7-b); */ - tmp1=_mm_load_si128(i_128++); - tmp2=_mm_unpacklo_epi8(tmp1,tmp1); - tmp3=_mm_unpacklo_epi16(tmp2,tmp2); - tmp4=_mm_unpacklo_epi32(tmp3,tmp3); - *o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK); - - tmp4=_mm_unpackhi_epi32(tmp3,tmp3); - 
*o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);; - - tmp3=_mm_unpackhi_epi16(tmp2,tmp2); - tmp4=_mm_unpacklo_epi32(tmp3,tmp3); - *o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);; - - tmp4=_mm_unpackhi_epi32(tmp3,tmp3); - *o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);; - - tmp2=_mm_unpackhi_epi8(tmp1,tmp1); - tmp3=_mm_unpacklo_epi16(tmp2,tmp2); - tmp4=_mm_unpacklo_epi32(tmp3,tmp3); - *o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);; - - tmp4=_mm_unpackhi_epi32(tmp3,tmp3); - *o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);; - - tmp3=_mm_unpackhi_epi16(tmp2,tmp2); - tmp4=_mm_unpacklo_epi32(tmp3,tmp3); - *o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);; - - tmp4=_mm_unpackhi_epi32(tmp3,tmp3); - *o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);; - } - - short * ptr_intl=base_interleaver; - __m128i tmp; - int input_length_words=n>>1; - unsigned short * systematic2_ptr=(unsigned short *) output; - - // int j; - for ( i=0; i< input_length_words ; i ++ ) { - - // for (j=0;j<16;j++) printf("%d(%d).",ptr_intl[j],expandInput[ptr_intl[j]]); - // printf("\n"); - tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],7); - tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],6); - tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],5); - tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],4); - tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],3); - tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],2); - tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],1); - tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],0); - tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],8+7); - tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],8+6); - tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],8+5); - tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],8+4); - tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],8+3); - tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],8+2); - tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],8+1); - tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],8+0); - *systematic2_ptr++=(unsigned short)_mm_movemask_epi8(tmp); - } - - return n; -} - - - -#define _mm_expand_si128(xmmx, out, bit_mask) \ - { \ - __m128i loc_mm; \ - loc_mm=(xmmx); \ - loc_mm=_mm_and_si128(loc_mm,bit_mask); \ - out=_mm_cmpeq_epi8(loc_mm,bit_mask); \ - } - -void threegpplte_turbo_encoder(unsigned char *input, - unsigned short input_length_bytes, - unsigned char *output, - unsigned char F, - unsigned short interleaver_f1, - unsigned short interleaver_f2) -{ - - int i; - unsigned char *x; - unsigned char state0=0,state1=0; - unsigned short input_length_bits = input_length_bytes<<3; - short * base_interleaver; - - if ( all_treillis_initialized == 0 ) - treillis_table_init(); - - // look for f1 and f2 precomputed interleaver values - for (i=0; i < 188 && f1f2mat[i].nb_bits != input_length_bits; i++); - - if ( i == 188 ) { - msg("Illegal frame length!\n"); - return; - } else { - base_interleaver=il_tb+f1f2mat[i].beg_index; - } - - - unsigned char systematic2[768]; - interleave_compact_byte(base_interleaver,input,systematic2,input_length_bytes); - - __m64 *ptr_output=(__m64*) output; - unsigned char cur_s1, cur_s2; - int code_rate; - - for ( state0=state1=i=0 ; i<input_length_bytes; i++ ) { - cur_s1=input[i]; - cur_s2=systematic2[i]; - - for ( code_rate=0; code_rate<3; code_rate++) { - *ptr_output++ = _mm_add_pi8(all_treillis[state0][cur_s1].systematic_64[code_rate], - _mm_add_pi8(all_treillis[state0][cur_s1].parity1_64[code_rate], - 
all_treillis[state1][cur_s2].parity2_64[code_rate])); - } - - state0=all_treillis[state0][cur_s1].exit_state; - state1=all_treillis[state1][cur_s2].exit_state; - } - - x=output+(input_length_bits*3); - - // Trellis termination - threegpplte_rsc_termination(&x[0],&x[1],&state0); -#ifdef DEBUG_TURBO_ENCODER - printf("term: x0 %d, x1 %d, state0 %d\n",x[0],x[1],state0); -#endif //DEBUG_TURBO_ENCODER - - threegpplte_rsc_termination(&x[2],&x[3],&state0); -#ifdef DEBUG_TURBO_ENCODER - printf("term: x0 %d, x1 %d, state0 %d\n",x[2],x[3],state0); -#endif //DEBUG_TURBO_ENCODER - - threegpplte_rsc_termination(&x[4],&x[5],&state0); -#ifdef DEBUG_TURBO_ENCODER - printf("term: x0 %d, x1 %d, state0 %d\n",x[4],x[5],state0); -#endif //DEBUG_TURBO_ENCODER - - threegpplte_rsc_termination(&x[6],&x[7],&state1); - -#ifdef DEBUG_TURBO_ENCODER - printf("term: x0 %d, x1 %d, state1 %d\n",x[6],x[7],state1); -#endif //DEBUG_TURBO_ENCODER - threegpplte_rsc_termination(&x[8],&x[9],&state1); -#ifdef DEBUG_TURBO_ENCODER - printf("term: x0 %d, x1 %d, state1 %d\n",x[8],x[9],state1); -#endif //DEBUG_TURBO_ENCODER - threegpplte_rsc_termination(&x[10],&x[11],&state1); - -#ifdef DEBUG_TURBO_ENCODER - printf("term: x0 %d, x1 %d, state1 %d\n",x[10],x[11],state1); -#endif //DEBUG_TURBO_ENCODER - - _mm_empty(); - _m_empty(); -} - - - -#ifdef MAIN - -#define INPUT_LENGTH 5 -#define F1 3 -#define F2 10 - -int main(int argc,char **argv) -{ - - unsigned char input[INPUT_LENGTH],state,state2; - unsigned char output[12+(3*(INPUT_LENGTH<<3))],x,z; - int i; - unsigned char out; - - for (state=0; state<8; state++) { - for (i=0; i<2; i++) { - state2=state; - out = threegpplte_rsc(i,&state2); - printf("State (%d->%d) : (%d,%d)\n",state,state2,i,out); - } - } - - printf("\n"); - - for (state=0; state<8; state++) { - - state2=state; - threegpplte_rsc_termination(&x,&z,&state2); - printf("Termination: (%d->%d) : (%d,%d)\n",state,state2,x,z); - } - - for (i=0; i<5; i++) { - input[i] = i*219; - printf("Input %d : %x\n",i,input[i]); - } - - threegpplte_turbo_encoder(&input[0], - 5, - &output[0], - F1, - F2); - return(0); -} - -#endif // MAIN +/******************************************************************************* + OpenAirInterface + Copyright(c) 1999 - 2014 Eurecom + + OpenAirInterface is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + + OpenAirInterface is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OpenAirInterface.The full GNU General Public License is + included in this distribution in the file called "COPYING". If not, + see <http://www.gnu.org/licenses/>. 
+ + Contact Information + OpenAirInterface Admin: openair_admin@eurecom.fr + OpenAirInterface Tech : openair_tech@eurecom.fr + OpenAirInterface Dev : openair4g-devel@eurecom.fr + + Address : Eurecom, Campus SophiaTech, 450 Route des Chappes, CS 50193 - 06904 Biot Sophia Antipolis cedex, FRANCE + + *******************************************************************************/ +/* file: 3gpplte_sse.c + purpose: Encoding routines for implementing Turbo-coded (DLSCH) transport channels from 36-212, V8.6 2009-03 + author: Laurent Thomas + maintainer: raymond.knopp@eurecom.fr + date: 09.2012 +*/ +#ifndef TC_MAIN +#include "defs.h" +#include "extern_3GPPinterleaver.h" +#else +#include "vars.h" +#endif +#include <stdio.h> +#include <string.h> +#include <stdlib.h> + +#include "PHY/sse_intrin.h" + +#define print_bytes(s,x) printf("%s %x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x\n",s,(x)[0],(x)[1],(x)[2],(x)[3],(x)[4],(x)[5],(x)[6],(x)[7],(x)[8],(x)[9],(x)[10],(x)[11],(x)[12],(x)[13],(x)[14],(x)[15]) +#define print_shorts(s,x) printf("%s %x,%x,%x,%x,%x,%x,%x,%x\n",s,(x)[0],(x)[1],(x)[2],(x)[3],(x)[4],(x)[5],(x)[6],(x)[7]) +#define print_ints(s,x) printf("%s %x %x %x %x\n",s,(x)[0],(x)[1],(x)[2],(x)[3]) + + +//#define DEBUG_TURBO_ENCODER 1 +#define CALLGRIND 1 +unsigned short threegpplte_interleaver_output; +unsigned long long threegpplte_interleaver_tmp; + +#if defined(__x86_64__) || defined(__i386__) +struct treillis { + union { + __m64 systematic_64[3]; + char systematic_8[24]; + }; + union { + __m64 parity1_64[3]; + char parity1_8[24]; + }; + union { + __m64 parity2_64[3]; + char parity2_8[24]; + }; + int exit_state; +} __attribute__ ((aligned(64))); + +#elif defined(__arm__) + +struct treillis { + union { + uint8x8_t systematic_64[3]; + char systematic_8[24]; + }__attribute__((aligned(64))); + union { + uint8x8_t parity1_64[3]; + char parity1_8[24]; + }__attribute__((aligned(64))); + union { + uint8x8_t parity2_64[3]; + char parity2_8[24]; + }__attribute__((aligned(64))); + int exit_state; +}; +#endif + +struct treillis all_treillis[8][256]; +int all_treillis_initialized=0; + +static inline unsigned char threegpplte_rsc(unsigned char input,unsigned char *state) +{ + unsigned char output; + output = (input ^ (*state>>2) ^ (*state>>1))&1; + *state = (((input<<2)^(*state>>1))^((*state>>1)<<2)^((*state)<<2))&7; + return(output); +} + +static inline void threegpplte_rsc_termination(unsigned char *x,unsigned char *z,unsigned char *state) +{ + *z = ((*state>>2) ^ (*state)) &1; + *x = ((*state) ^ (*state>>1)) &1; + *state = (*state)>>1; +} + +void treillis_table_init(void) +{ + //struct treillis t[][]=all_treillis; + //t=memalign(16,sizeof(struct treillis)*8*256); + int i, j,b; + unsigned char v, current_state; + + // clear all_treillis + for (i=0; i<8; i++) + bzero( all_treillis[i], sizeof(all_treillis[0]) ); + + for (i=0; i<8; i++) { //all possible initial states + for (j=0; j<=255; j++) { // all possible values of a byte + current_state=i; + + for (b=0; b<8 ; b++ ) { // pre-compute the image of the byte j in _m128i vector right place + all_treillis[i][j].systematic_8[b*3]= (j&(1<<(7-b)))>>(7-b); + v=threegpplte_rsc( all_treillis[i][j].systematic_8[b*3] , + ¤t_state); + all_treillis[i][j].parity1_8[b*3+1]=v; // for the yparity1 + all_treillis[i][j].parity2_8[b*3+2]=v; // for the yparity2 + } + + all_treillis[i][j].exit_state=current_state; + } + } + + all_treillis_initialized=1; + return ; +} + + +char interleave_compact_byte(short * base_interleaver,unsigned char * input, unsigned char * output, 
int n) +{ + + char expandInput[768*8] __attribute__((aligned(16))); + int i,loop=n>>4; +#if defined(__x86_64__) || defined(__i386__) + __m128i *i_128=(__m128i *)input, *o_128=(__m128i*)expandInput; + __m128i tmp1, tmp2, tmp3, tmp4; + __m128i BIT_MASK = _mm_set_epi8( 0b00000001, + 0b00000010, + 0b00000100, + 0b00001000, + 0b00010000, + 0b00100000, + 0b01000000, + 0b10000000, + 0b00000001, + 0b00000010, + 0b00000100, + 0b00001000, + 0b00010000, + 0b00100000, + 0b01000000, + 0b10000000); +#elif defined(__arm__) + uint8x16_t *i_128=(uint8x16_t *)input, *o_128=(uint8x16_t *)expandInput; + uint8x16_t tmp1,tmp2; + uint16x8_t tmp3; + uint32x4_t tmp4; + uint8x16_t and_tmp; + uint8x16_t BIT_MASK = { 0b10000000, + 0b01000000, + 0b00100000, + 0b00010000, + 0b00001000, + 0b00000100, + 0b00000010, + 0b00000001, + 0b10000000, + 0b01000000, + 0b00100000, + 0b00010000, + 0b00001000, + 0b00000100, + 0b00000010, + 0b00000001}; +#endif + if ((n&15) > 0) + loop++; + + for (i=0; i<loop ; i++ ) { + /* int cur_byte=i<<3; */ + /* for (b=0;b<8;b++) */ + /* expandInput[cur_byte+b] = (input[i]&(1<<(7-b)))>>(7-b); */ + +#if defined(__x86_64__) || defined(__i386__) + tmp1=_mm_load_si128(i_128++); + tmp2=_mm_unpacklo_epi8(tmp1,tmp1); + tmp3=_mm_unpacklo_epi16(tmp2,tmp2); + tmp4=_mm_unpacklo_epi32(tmp3,tmp3); + *o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK); + + tmp4=_mm_unpackhi_epi32(tmp3,tmp3); + *o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);; + + tmp3=_mm_unpackhi_epi16(tmp2,tmp2); + tmp4=_mm_unpacklo_epi32(tmp3,tmp3); + *o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);; + + tmp4=_mm_unpackhi_epi32(tmp3,tmp3); + *o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);; + + tmp2=_mm_unpackhi_epi8(tmp1,tmp1); + tmp3=_mm_unpacklo_epi16(tmp2,tmp2); + tmp4=_mm_unpacklo_epi32(tmp3,tmp3); + *o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);; + + tmp4=_mm_unpackhi_epi32(tmp3,tmp3); + *o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);; + + tmp3=_mm_unpackhi_epi16(tmp2,tmp2); + tmp4=_mm_unpacklo_epi32(tmp3,tmp3); + *o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);; + + tmp4=_mm_unpackhi_epi32(tmp3,tmp3); + *o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);; + +#elif defined(__arm__) + tmp1=vld1q_u8((uint8_t*)i_128); + //print_bytes("tmp1:",(uint8_t*)&tmp1); + + uint8x16x2_t temp1 = vzipq_u8(tmp1,tmp1); + tmp2 = temp1.val[0]; + + uint16x8x2_t temp2 = vzipq_u16((uint16x8_t)tmp2,(uint16x8_t)tmp2); + tmp3 = temp2.val[0]; + + uint32x4x2_t temp3 = vzipq_u32((uint32x4_t)tmp3,(uint32x4_t)tmp3); + tmp4 = temp3.val[0]; + //print_bytes("tmp4:",(uint8_t*)&tmp4); + + *o_128++=vceqq_u8(vandq_u8((uint8x16_t)tmp4,BIT_MASK),BIT_MASK); //1 + //print_bytes("o:",(uint8_t*)(o_128-1)); + + tmp4 = temp3.val[1]; + //print_bytes("tmp4:",(uint8_t*)&tmp4); + + *o_128++=vceqq_u8(vandq_u8((uint8x16_t)tmp4,BIT_MASK),BIT_MASK); //2 + //print_bytes("o:",(uint8_t*)(o_128-1)); + + tmp3 = temp2.val[1]; + temp3 = vzipq_u32((uint32x4_t)tmp3,(uint32x4_t)tmp3); + tmp4 = temp3.val[0]; + //print_bytes("tmp4:",(uint8_t*)&tmp4); + + *o_128++=vceqq_u8(vandq_u8((uint8x16_t)tmp4,BIT_MASK),BIT_MASK); //3 + //print_bytes("o:",(uint8_t*)(o_128-1)); + + tmp4 = temp3.val[1]; + //print_bytes("tmp4:",(uint8_t*)&tmp4); + + *o_128++=vceqq_u8(vandq_u8((uint8x16_t)tmp4,BIT_MASK),BIT_MASK); //4 + //and_tmp = vandq_u8((uint8x16_t)tmp4,BIT_MASK); print_bytes("and:",and_tmp); + //print_bytes("o:",(uint8_t*)(o_128-1)); + + + temp1 = vzipq_u8(tmp1,tmp1); + tmp2 = temp1.val[1]; + 
temp2 = vzipq_u16((uint16x8_t)tmp2,(uint16x8_t)tmp2); + tmp3 = temp2.val[0]; + temp3 = vzipq_u32((uint32x4_t)tmp3,(uint32x4_t)tmp3); + tmp4 = temp3.val[0]; + //print_bytes("tmp4:",(uint8_t*)&tmp4); + + *o_128++=vceqq_u8(vandq_u8((uint8x16_t)tmp4,BIT_MASK),BIT_MASK); //5 + //print_bytes("o:",(uint8_t*)(o_128-1)); + + tmp4 = temp3.val[1]; + //print_bytes("tmp4:",(uint8_t*)&tmp4); + + *o_128++=vceqq_u8(vandq_u8((uint8x16_t)tmp4,BIT_MASK),BIT_MASK); //6 + //print_bytes("o:",(uint8_t*)(o_128-1)); + + + temp2 = vzipq_u16((uint16x8_t)tmp2,(uint16x8_t)tmp2); + tmp3 = temp2.val[1]; + temp3 = vzipq_u32((uint32x4_t)tmp3,(uint32x4_t)tmp3); + tmp4 = temp3.val[0]; + //print_bytes("tmp4:",(uint8_t*)&tmp4); + + *o_128++=vceqq_u8(vandq_u8((uint8x16_t)tmp4,BIT_MASK),BIT_MASK); //7 + //print_bytes("o:",(uint8_t*)(o_128-1)); + + tmp4 = temp3.val[1]; + //print_bytes("tmp4:",(uint8_t*)&tmp4); + + *o_128++=vceqq_u8(vandq_u8((uint8x16_t)tmp4,BIT_MASK),BIT_MASK); //7 + //print_bytes("o:",(uint8_t*)(o_128-1)); + + i_128++; +#endif + } + + short * ptr_intl=base_interleaver; +#if defined(__x86_64) || defined(__i386__) + __m128i tmp; + uint16_t *systematic2_ptr=(unsigned short *) output; +#elif defined(__arm__) + uint8x16_t tmp; + const uint8_t __attribute__ ((aligned (16))) _Powers[16]= + { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 }; + +// Set the powers of 2 (do it once for all, if applicable) + uint8x16_t Powers= vld1q_u8(_Powers); + uint8_t *systematic2_ptr=(uint8_t *) output; +#endif + int input_length_words=n>>1; + + for ( i=0; i< input_length_words ; i ++ ) { + +#if defined(__x86_64__) || defined(__i386__) + tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],7); + tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],6); + tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],5); + tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],4); + tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],3); + tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],2); + tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],1); + tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],0); + tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],8+7); + tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],8+6); + tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],8+5); + tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],8+4); + tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],8+3); + tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],8+2); + tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],8+1); + tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],8+0); + *systematic2_ptr++=(unsigned short)_mm_movemask_epi8(tmp); +#elif defined(__arm__) + tmp=vsetq_lane_u8(expandInput[*ptr_intl++],tmp,7); + tmp=vsetq_lane_u8(expandInput[*ptr_intl++],tmp,6); + tmp=vsetq_lane_u8(expandInput[*ptr_intl++],tmp,5); + tmp=vsetq_lane_u8(expandInput[*ptr_intl++],tmp,4); + tmp=vsetq_lane_u8(expandInput[*ptr_intl++],tmp,3); + tmp=vsetq_lane_u8(expandInput[*ptr_intl++],tmp,2); + tmp=vsetq_lane_u8(expandInput[*ptr_intl++],tmp,1); + tmp=vsetq_lane_u8(expandInput[*ptr_intl++],tmp,0); + tmp=vsetq_lane_u8(expandInput[*ptr_intl++],tmp,8+7); + tmp=vsetq_lane_u8(expandInput[*ptr_intl++],tmp,8+6); + tmp=vsetq_lane_u8(expandInput[*ptr_intl++],tmp,8+5); + tmp=vsetq_lane_u8(expandInput[*ptr_intl++],tmp,8+4); + tmp=vsetq_lane_u8(expandInput[*ptr_intl++],tmp,8+3); + tmp=vsetq_lane_u8(expandInput[*ptr_intl++],tmp,8+2); + tmp=vsetq_lane_u8(expandInput[*ptr_intl++],tmp,8+1); + tmp=vsetq_lane_u8(expandInput[*ptr_intl++],tmp,8+0); +// Compute the mask from the input + uint64x2_t Mask= 
vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8(tmp, Powers)))); + vst1q_lane_u8(systematic2_ptr++, (uint8x16_t)Mask, 0); + vst1q_lane_u8(systematic2_ptr++, (uint8x16_t)Mask, 8); + +#endif + } + + return n; +} + + +/* +#define _mm_expand_si128(xmmx, out, bit_mask) \ + { \ + __m128i loc_mm; \ + loc_mm=(xmmx); \ + loc_mm=_mm_and_si128(loc_mm,bit_mask); \ + out=_mm_cmpeq_epi8(loc_mm,bit_mask); \ + } +*/ + +void threegpplte_turbo_encoder(unsigned char *input, + unsigned short input_length_bytes, + unsigned char *output, + unsigned char F, + unsigned short interleaver_f1, + unsigned short interleaver_f2) +{ + + int i; + unsigned char *x; + unsigned char state0=0,state1=0; + unsigned short input_length_bits = input_length_bytes<<3; + short * base_interleaver; + + if ( all_treillis_initialized == 0 ) + treillis_table_init(); + + // look for f1 and f2 precomputed interleaver values + for (i=0; i < 188 && f1f2mat[i].nb_bits != input_length_bits; i++); + + if ( i == 188 ) { + printf("Illegal frame length!\n"); + return; + } else { + base_interleaver=il_tb+f1f2mat[i].beg_index; + } + + + unsigned char systematic2[768]; + interleave_compact_byte(base_interleaver,input,systematic2,input_length_bytes); + +#if defined(__x86_64__) || defined(__i386__) + __m64 *ptr_output=(__m64*) output; +#elif defined(__arm__) + uint8x8_t *ptr_output=(uint8x8_t*)output; +#endif + unsigned char cur_s1, cur_s2; + int code_rate; + + for ( state0=state1=i=0 ; i<input_length_bytes; i++ ) { + cur_s1=input[i]; + cur_s2=systematic2[i]; + + for ( code_rate=0; code_rate<3; code_rate++) { +#if defined(__x86_64__) || defined(__i386__) + *ptr_output++ = _mm_add_pi8(all_treillis[state0][cur_s1].systematic_64[code_rate], + _mm_add_pi8(all_treillis[state0][cur_s1].parity1_64[code_rate], + all_treillis[state1][cur_s2].parity2_64[code_rate])); +#elif defined(__arm__) + uint8x8_t ptmp = vadd_u8(all_treillis[state0][cur_s1].parity1_64[code_rate], + all_treillis[state1][cur_s2].parity2_64[code_rate]); + *ptr_output++ = vadd_u8(all_treillis[state0][cur_s1].systematic_64[code_rate], + ptmp); +#endif + } + + state0=all_treillis[state0][cur_s1].exit_state; + state1=all_treillis[state1][cur_s2].exit_state; + } + + x=output+(input_length_bits*3); + + // Trellis termination + threegpplte_rsc_termination(&x[0],&x[1],&state0); +#ifdef DEBUG_TURBO_ENCODER + printf("term: x0 %d, x1 %d, state0 %d\n",x[0],x[1],state0); +#endif //DEBUG_TURBO_ENCODER + + threegpplte_rsc_termination(&x[2],&x[3],&state0); +#ifdef DEBUG_TURBO_ENCODER + printf("term: x0 %d, x1 %d, state0 %d\n",x[2],x[3],state0); +#endif //DEBUG_TURBO_ENCODER + + threegpplte_rsc_termination(&x[4],&x[5],&state0); +#ifdef DEBUG_TURBO_ENCODER + printf("term: x0 %d, x1 %d, state0 %d\n",x[4],x[5],state0); +#endif //DEBUG_TURBO_ENCODER + + threegpplte_rsc_termination(&x[6],&x[7],&state1); + +#ifdef DEBUG_TURBO_ENCODER + printf("term: x0 %d, x1 %d, state1 %d\n",x[6],x[7],state1); +#endif //DEBUG_TURBO_ENCODER + threegpplte_rsc_termination(&x[8],&x[9],&state1); +#ifdef DEBUG_TURBO_ENCODER + printf("term: x0 %d, x1 %d, state1 %d\n",x[8],x[9],state1); +#endif //DEBUG_TURBO_ENCODER + threegpplte_rsc_termination(&x[10],&x[11],&state1); + +#ifdef DEBUG_TURBO_ENCODER + printf("term: x0 %d, x1 %d, state1 %d\n",x[10],x[11],state1); +#endif //DEBUG_TURBO_ENCODER +#if defined(__x86_64__) || defined(__i386__) + _mm_empty(); + _m_empty(); +#endif +} + + + +#ifdef TC_MAIN +#define INPUT_LENGTH 20 +#define F1 21 +#define F2 120 + +int main(int argc,char **argv) +{ + + unsigned char input[INPUT_LENGTH+16],state,state2; + 
unsigned char output[12+(3*(INPUT_LENGTH<<3))],x,z; + int i; + unsigned char out; + + for (state=0; state<8; state++) { + for (i=0; i<2; i++) { + state2=state; + out = threegpplte_rsc(i,&state2); + printf("State (%d->%d) : (%d,%d)\n",state,state2,i,out); + } + } + + printf("\n"); + + for (state=0; state<8; state++) { + + state2=state; + threegpplte_rsc_termination(&x,&z,&state2); + printf("Termination: (%d->%d) : (%d,%d)\n",state,state2,x,z); + } + + memset((void*)input,0,INPUT_LENGTH+16); + for (i=0; i<INPUT_LENGTH; i++) { + input[i] = i*219; + printf("Input %d : %x\n",i,input[i]); + } + + threegpplte_turbo_encoder(&input[0], + INPUT_LENGTH, + &output[0], + 0, + F1, + F2); + + + for (i=0;i<12+(INPUT_LENGTH*24);i++) + printf("%d",output[i]); + printf("\n"); + + return(0); +} + +#endif // MAIN diff --git a/openair1/PHY/CODING/3gpplte_turbo_decoder_sse_16bit.c b/openair1/PHY/CODING/3gpplte_turbo_decoder_sse_16bit.c index 66c5e9ace7..96813c1072 100644 --- a/openair1/PHY/CODING/3gpplte_turbo_decoder_sse_16bit.c +++ b/openair1/PHY/CODING/3gpplte_turbo_decoder_sse_16bit.c @@ -64,6 +64,8 @@ #endif +#define print_shorts(s,x) printf("%s %d,%d,%d,%d,%d,%d,%d,%d\n",s,(x)[0],(x)[1],(x)[2],(x)[3],(x)[4],(x)[5],(x)[6],(x)[7]) + //#define DEBUG_LOGMAP @@ -120,10 +122,17 @@ void compute_gamma16(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity unsigned short frame_length,unsigned char term_flag) { int k,K1; +#if defined(__x86_64__)||defined(__i386__) __m128i *systematic128 = (__m128i *)systematic; __m128i *y_parity128 = (__m128i *)y_parity; __m128i *m10_128 = (__m128i *)m10; __m128i *m11_128 = (__m128i *)m11; +#elif defined(__arm__) + int16x8_t *systematic128 = (int16x8_t *)systematic; + int16x8_t *y_parity128 = (int16x8_t *)y_parity; + int16x8_t *m10_128 = (int16x8_t *)m10; + int16x8_t *m11_128 = (int16x8_t *)m11; +#endif #ifdef DEBUG_LOGMAP msg("compute_gamma, %p,%p,%p,%p,framelength %d\n",m11,m10,systematic,y_parity,frame_length); @@ -132,61 +141,31 @@ void compute_gamma16(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity K1=frame_length>>3; for (k=0; k<K1; k++) { - +#if defined(__x86_64__) || defined(__i386__) m11_128[k] = _mm_srai_epi16(_mm_adds_epi16(systematic128[k],y_parity128[k]),1); m10_128[k] = _mm_srai_epi16(_mm_subs_epi16(systematic128[k],y_parity128[k]),1); - /* - printf("gamma %d: s %d,%d,%d,%d,%d,%d,%d,%d\n", - k, - (int16_t)_mm_extract_epi16(systematic128[k],0), - (int16_t)_mm_extract_epi16(systematic128[k],1), - (int16_t)_mm_extract_epi16(systematic128[k],2), - (int16_t)_mm_extract_epi16(systematic128[k],3), - (int16_t)_mm_extract_epi16(systematic128[k],4), - (int16_t)_mm_extract_epi16(systematic128[k],5), - (int16_t)_mm_extract_epi16(systematic128[k],6), - (int16_t)_mm_extract_epi16(systematic128[k],7)); - - printf("gamma %d: yp %d,%d,%d,%d,%d,%d,%d,%d\n", - k, - (int16_t)_mm_extract_epi16(y_parity128[k],0), - (int16_t)_mm_extract_epi16(y_parity128[k],1), - (int16_t)_mm_extract_epi16(y_parity128[k],2), - (int16_t)_mm_extract_epi16(y_parity128[k],3), - (int16_t)_mm_extract_epi16(y_parity128[k],4), - (int16_t)_mm_extract_epi16(y_parity128[k],5), - (int16_t)_mm_extract_epi16(y_parity128[k],6), - (int16_t)_mm_extract_epi16(y_parity128[k],7)); - - printf("gamma %d: m11 %d,%d,%d,%d,%d,%d,%d,%d\n", - k, - (int16_t)_mm_extract_epi16(m11_128[k],0), - (int16_t)_mm_extract_epi16(m11_128[k],1), - (int16_t)_mm_extract_epi16(m11_128[k],2), - (int16_t)_mm_extract_epi16(m11_128[k],3), - (int16_t)_mm_extract_epi16(m11_128[k],4), - (int16_t)_mm_extract_epi16(m11_128[k],5), - 
(int16_t)_mm_extract_epi16(m11_128[k],6), - (int16_t)_mm_extract_epi16(m11_128[k],7)); - printf("gamma %d: m10 %d,%d,%d,%d,%d,%d,%d,%d\n", - k, - (int16_t)_mm_extract_epi16(m10_128[k],0), - (int16_t)_mm_extract_epi16(m10_128[k],1), - (int16_t)_mm_extract_epi16(m10_128[k],2), - (int16_t)_mm_extract_epi16(m10_128[k],3), - (int16_t)_mm_extract_epi16(m10_128[k],4), - (int16_t)_mm_extract_epi16(m10_128[k],5), - (int16_t)_mm_extract_epi16(m10_128[k],6), - (int16_t)_mm_extract_epi16(m10_128[k],7)); - */ +#elif defined(__arm__) + m11_128[k] = vhaddq_s16(systematic128[k],y_parity128[k]); + m10_128[k] = vhsubq_s16(systematic128[k],y_parity128[k]); +#endif +#ifdef DEBUG_LOGMAP + printf("Loop index k, m11,m10\n"); + print_shorts("sys",(int16_t*)&systematic128[k]); + print_shorts("yp",(int16_t*)&y_parity128[k]); + print_shorts("m11",(int16_t*)&m11_128[k]); + print_shorts("m10",(int16_t*)&m10_128[k]); +#endif } // Termination +#if defined(__x86_64__) || defined(__i386__) m11_128[k] = _mm_srai_epi16(_mm_adds_epi16(systematic128[k+term_flag],y_parity128[k]),1); m10_128[k] = _mm_srai_epi16(_mm_subs_epi16(systematic128[k+term_flag],y_parity128[k]),1); - - // printf("gamma (term): %d,%d, %d,%d, %d,%d\n",m11[k<<3],m10[k<<3],m11[1+(k<<3)],m10[1+(k<<3)],m11[2+(k<<3)],m10[2+(k<<3)]); +#elif defined(__arm__) + m11_128[k] = vhaddq_s16(systematic128[k+term_flag],y_parity128[k]); + m10_128[k] = vhsubq_s16(systematic128[k+term_flag],y_parity128[k]); +#endif } #define L 40 @@ -194,19 +173,31 @@ void compute_gamma16(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned short frame_length,unsigned char F) { int k,l,l2,K1,rerun_flag=0; +#if defined(__x86_64__) || defined(__i386__) __m128i *alpha128=(__m128i *)alpha,*alpha_ptr; __m128i a0,a1,a2,a3,a4,a5,a6,a7,*m11p,*m10p; __m128i m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7; __m128i new0,new1,new2,new3,new4,new5,new6,new7; __m128i alpha_max; - +#elif defined(__arm__) + int16x8_t *alpha128=(int16x8_t *)alpha,*alpha_ptr; + int16x8_t a0,a1,a2,a3,a4,a5,a6,a7,*m11p,*m10p; + int16x8_t m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7; + int16x8_t new0,new1,new2,new3,new4,new5,new6,new7; + int16x8_t alpha_max; +#endif l2 = L>>3; K1 = (frame_length>>3); for (l=K1;; l=l2,rerun_flag=1) { +#if defined(__x86_64__) || defined(__i386__) alpha128 = (__m128i *)alpha; +#elif defined(__arm__) + alpha128 = (int16x8_t *)alpha; +#endif if (rerun_flag == 0) { +#if defined(__x86_64__) || defined(__i386__) alpha128[0] = _mm_set_epi16(-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,0); alpha128[1] = _mm_set_epi16(-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2); alpha128[2] = _mm_set_epi16(-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2); @@ -215,8 +206,31 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s alpha128[5] = _mm_set_epi16(-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2); alpha128[6] = _mm_set_epi16(-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2); alpha128[7] = _mm_set_epi16(-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2); +#elif defined(__arm__) + alpha128[0] = vdupq_n_s16(-MAX/2); + alpha128[0] = vsetq_lane_s16(0,alpha128[0],0); + alpha128[1] = vdupq_n_s16(-MAX/2); + alpha128[2] = vdupq_n_s16(-MAX/2); + alpha128[3] = vdupq_n_s16(-MAX/2); + alpha128[4] = vdupq_n_s16(-MAX/2); + alpha128[5] = vdupq_n_s16(-MAX/2); + alpha128[6] = vdupq_n_s16(-MAX/2); + alpha128[7] = vdupq_n_s16(-MAX/2); +#endif +#ifdef DEBUG_LOGMAP + printf("Initial alpha\n"); 
+ print_shorts("a0",(int16_t*)&alpha128[0]); + print_shorts("a1",(int16_t*)&alpha128[1]); + print_shorts("a2",(int16_t*)&alpha128[2]); + print_shorts("a3",(int16_t*)&alpha128[3]); + print_shorts("a4",(int16_t*)&alpha128[4]); + print_shorts("a5",(int16_t*)&alpha128[5]); + print_shorts("a6",(int16_t*)&alpha128[6]); + print_shorts("a7",(int16_t*)&alpha128[7]); +#endif } else { //set initial alpha in columns 1-7 from final alpha from last run in columns 0-6 +#if defined(__x86_64__) || defined(__i386__) alpha128[0] = _mm_slli_si128(alpha128[frame_length],2); alpha128[1] = _mm_slli_si128(alpha128[1+frame_length],2); alpha128[2] = _mm_slli_si128(alpha128[2+frame_length],2); @@ -225,6 +239,16 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s alpha128[5] = _mm_slli_si128(alpha128[5+frame_length],2); alpha128[6] = _mm_slli_si128(alpha128[6+frame_length],2); alpha128[7] = _mm_slli_si128(alpha128[7+frame_length],2); +#elif defined(__arm__) + alpha128[0] = (int16x8_t)vshlq_n_s64((int64x2_t)alpha128[frame_length],16); alpha128[0] = vsetq_lane_s16(alpha[8],alpha128[0],3); + alpha128[1] = (int16x8_t)vshlq_n_s64((int64x2_t)alpha128[1+frame_length],16); alpha128[1] = vsetq_lane_s16(alpha[24],alpha128[0],3); + alpha128[2] = (int16x8_t)vshlq_n_s64((int64x2_t)alpha128[2+frame_length],16); alpha128[2] = vsetq_lane_s16(alpha[40],alpha128[0],3); + alpha128[3] = (int16x8_t)vshlq_n_s64((int64x2_t)alpha128[3+frame_length],16); alpha128[3] = vsetq_lane_s16(alpha[56],alpha128[0],3); + alpha128[4] = (int16x8_t)vshlq_n_s64((int64x2_t)alpha128[4+frame_length],16); alpha128[4] = vsetq_lane_s16(alpha[72],alpha128[0],3); + alpha128[5] = (int16x8_t)vshlq_n_s64((int64x2_t)alpha128[5+frame_length],16); alpha128[5] = vsetq_lane_s16(alpha[88],alpha128[0],3); + alpha128[6] = (int16x8_t)vshlq_n_s64((int64x2_t)alpha128[6+frame_length],16); alpha128[6] = vsetq_lane_s16(alpha[104],alpha128[0],3); + alpha128[7] = (int16x8_t)vshlq_n_s64((int64x2_t)alpha128[7+frame_length],16); alpha128[7] = vsetq_lane_s16(alpha[120],alpha128[0],3); +#endif // set initial alpha in column 0 to (0,-MAX/2,...,-MAX/2) alpha[8] = -MAX/2; alpha[16] = -MAX/2; @@ -233,17 +257,33 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s alpha[40] = -MAX/2; alpha[48] = -MAX/2; alpha[56] = -MAX/2; +#ifdef DEBUG_LOGMAP + printf("Second run\n"); + print_shorts("a0",(int16_t*)&alpha128[0]); + print_shorts("a1",(int16_t*)&alpha128[1]); + print_shorts("a2",(int16_t*)&alpha128[2]); + print_shorts("a3",(int16_t*)&alpha128[3]); + print_shorts("a4",(int16_t*)&alpha128[4]); + print_shorts("a5",(int16_t*)&alpha128[5]); + print_shorts("a6",(int16_t*)&alpha128[6]); + print_shorts("a7",(int16_t*)&alpha128[7]); +#endif + } alpha_ptr = &alpha128[0]; - +#if defined(__x86_64__) || defined(__i386__) m11p = (__m128i*)m_11; m10p = (__m128i*)m_10; - +#elif defined(__arm__) + m11p = (int16x8_t*)m_11; + m10p = (int16x8_t*)m_10; +#endif for (k=0; k<l; k++) { +#if defined(__x86_64__) || defined(__i386__) a1=_mm_load_si128(&alpha_ptr[1]); a3=_mm_load_si128(&alpha_ptr[3]); a5=_mm_load_si128(&alpha_ptr[5]); @@ -288,10 +328,48 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s alpha_max = _mm_max_epi16(alpha_max,a5); alpha_max = _mm_max_epi16(alpha_max,a6); alpha_max = _mm_max_epi16(alpha_max,a7); +#elif defined(__arm__) + m_b0 = vqaddq_s16(alpha_ptr[1],*m11p); // m11 + m_b4 = vqsubq_s16(alpha_ptr[1],*m11p); // m00=-m11 + m_b1 = vqsubq_s16(alpha_ptr[3],*m10p); // m01=-m10 + m_b5 = 
vqaddq_s16(alpha_ptr[3],*m10p); // m10 + m_b2 = vqaddq_s16(alpha_ptr[5],*m10p); // m10 + m_b6 = vqsubq_s16(alpha_ptr[5],*m10p); // m01=-m10 + m_b3 = vqsubq_s16(alpha_ptr[7],*m11p); // m00=-m11 + m_b7 = vqaddq_s16(alpha_ptr[7],*m11p); // m11 + + new0 = vqsubq_s16(alpha_ptr[0],*m11p); // m00=-m11 + new4 = vqaddq_s16(alpha_ptr[0],*m11p); // m11 + new1 = vqaddq_s16(alpha_ptr[2],*m10p); // m10 + new5 = vqsubq_s16(alpha_ptr[2],*m10p); // m01=-m10 + new2 = vqsubq_s16(alpha_ptr[4],*m10p); // m01=-m10 + new6 = vqaddq_s16(alpha_ptr[4],*m10p); // m10 + new3 = vqaddq_s16(alpha_ptr[6],*m11p); // m11 + new7 = vqsubq_s16(alpha_ptr[6],*m11p); // m00=-m11 + a0 = vmaxq_s16(m_b0,new0); + a1 = vmaxq_s16(m_b1,new1); + a2 = vmaxq_s16(m_b2,new2); + a3 = vmaxq_s16(m_b3,new3); + a4 = vmaxq_s16(m_b4,new4); + a5 = vmaxq_s16(m_b5,new5); + a6 = vmaxq_s16(m_b6,new6); + a7 = vmaxq_s16(m_b7,new7); + + // compute and subtract maxima + alpha_max = vmaxq_s16(a0,a1); + alpha_max = vmaxq_s16(alpha_max,a2); + alpha_max = vmaxq_s16(alpha_max,a3); + alpha_max = vmaxq_s16(alpha_max,a4); + alpha_max = vmaxq_s16(alpha_max,a5); + alpha_max = vmaxq_s16(alpha_max,a6); + alpha_max = vmaxq_s16(alpha_max,a7); + +#endif alpha_ptr+=8; m11p++; m10p++; +#if defined(__x86_64__) || defined(__i386__) alpha_ptr[0] = _mm_subs_epi16(a0,alpha_max); alpha_ptr[1] = _mm_subs_epi16(a1,alpha_max); alpha_ptr[2] = _mm_subs_epi16(a2,alpha_max); @@ -300,6 +378,58 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s alpha_ptr[5] = _mm_subs_epi16(a5,alpha_max); alpha_ptr[6] = _mm_subs_epi16(a6,alpha_max); alpha_ptr[7] = _mm_subs_epi16(a7,alpha_max); +#elif defined(__arm__) + alpha_ptr[0] = vqsubq_s16(a0,alpha_max); + alpha_ptr[1] = vqsubq_s16(a1,alpha_max); + alpha_ptr[2] = vqsubq_s16(a2,alpha_max); + alpha_ptr[3] = vqsubq_s16(a3,alpha_max); + alpha_ptr[4] = vqsubq_s16(a4,alpha_max); + alpha_ptr[5] = vqsubq_s16(a5,alpha_max); + alpha_ptr[6] = vqsubq_s16(a6,alpha_max); + alpha_ptr[7] = vqsubq_s16(a7,alpha_max); +#endif + +#ifdef DEBUG_LOGMAP + printf("Loop index %d, mb\n",k); + print_shorts("mb0",(int16_t*)&m_b0); + print_shorts("mb1",(int16_t*)&m_b1); + print_shorts("mb2",(int16_t*)&m_b2); + print_shorts("mb3",(int16_t*)&m_b3); + print_shorts("mb4",(int16_t*)&m_b4); + print_shorts("mb5",(int16_t*)&m_b5); + print_shorts("mb6",(int16_t*)&m_b6); + print_shorts("mb7",(int16_t*)&m_b7); + + printf("Loop index %d, new\n",k); + print_shorts("new0",(int16_t*)&new0); + print_shorts("new1",(int16_t*)&new1); + print_shorts("new2",(int16_t*)&new2); + print_shorts("new3",(int16_t*)&new3); + print_shorts("new4",(int16_t*)&new4); + print_shorts("new5",(int16_t*)&new5); + print_shorts("new6",(int16_t*)&new6); + print_shorts("new7",(int16_t*)&new7); + + printf("Loop index %d, after max\n",k); + print_shorts("a0",(int16_t*)&a0); + print_shorts("a1",(int16_t*)&a1); + print_shorts("a2",(int16_t*)&a2); + print_shorts("a3",(int16_t*)&a3); + print_shorts("a4",(int16_t*)&a4); + print_shorts("a5",(int16_t*)&a5); + print_shorts("a6",(int16_t*)&a6); + print_shorts("a7",(int16_t*)&a7); + + printf("Loop index %d\n",k); + print_shorts("a0",(int16_t*)&alpha_ptr[0]); + print_shorts("a1",(int16_t*)&alpha_ptr[1]); + print_shorts("a2",(int16_t*)&alpha_ptr[2]); + print_shorts("a3",(int16_t*)&alpha_ptr[3]); + print_shorts("a4",(int16_t*)&alpha_ptr[4]); + print_shorts("a5",(int16_t*)&alpha_ptr[5]); + print_shorts("a6",(int16_t*)&alpha_ptr[6]); + print_shorts("a7",(int16_t*)&alpha_ptr[7]); +#endif } @@ -313,12 +443,22 @@ void compute_beta16(llr_t* alpha,llr_t* 
beta,llr_t *m_11,llr_t* m_10,unsigned sh { int k,rerun_flag=0; +#if defined(__x86_64__) || defined(__i386__) __m128i m11_128,m10_128; __m128i m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7; __m128i new0,new1,new2,new3,new4,new5,new6,new7; __m128i *beta128,*alpha128,*beta_ptr; __m128i beta_max; +#elif defined(__arm__) + int16x8_t m11_128,m10_128; + int16x8_t m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7; + int16x8_t new0,new1,new2,new3,new4,new5,new6,new7; + + int16x8_t *beta128,*alpha128,*beta_ptr; + int16x8_t beta_max; +#endif + int16_t m11,m10,beta0_16,beta1_16,beta2_16,beta3_16,beta4_16,beta5_16,beta6_16,beta7_16,beta0_2,beta1_2,beta2_2,beta3_2,beta_m; llr_t beta0,beta1; @@ -380,9 +520,13 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh beta7_16=beta7_16-beta_m; for (rerun_flag=0;; rerun_flag=1) { +#if defined(__x86_64__) || defined(__i386__) beta_ptr = (__m128i*)&beta[frame_length<<3]; alpha128 = (__m128i*)&alpha[0]; - +#elif defined(__arm__) + beta_ptr = (int16x8_t*)&beta[frame_length<<3]; + alpha128 = (int16x8_t*)&alpha[0]; +#endif if (rerun_flag == 0) { beta_ptr[0] = alpha128[(frame_length)]; beta_ptr[1] = alpha128[1+(frame_length)]; @@ -393,6 +537,7 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh beta_ptr[6] = alpha128[6+(frame_length)]; beta_ptr[7] = alpha128[7+(frame_length)]; } else { +#if defined(__x86_64__) || defined(__i386__) beta128 = (__m128i*)&beta[0]; beta_ptr[0] = _mm_srli_si128(beta128[0],2); beta_ptr[1] = _mm_srli_si128(beta128[1],2); @@ -402,9 +547,22 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh beta_ptr[5] = _mm_srli_si128(beta128[5],2); beta_ptr[6] = _mm_srli_si128(beta128[6],2); beta_ptr[7] = _mm_srli_si128(beta128[7],2); +#elif defined(__arm__) + beta128 = (int16x8_t*)&beta[0]; + beta_ptr = (int16x8_t*)&beta[frame_length<<3]; + beta_ptr[0] = (int16x8_t)vshrq_n_s64((int64x2_t)beta128[0],16); beta_ptr[0] = vsetq_lane_s16(beta[3],beta_ptr[0],4); + beta_ptr[1] = (int16x8_t)vshrq_n_s64((int64x2_t)beta128[1],16); beta_ptr[1] = vsetq_lane_s16(beta[11],beta_ptr[1],4); + beta_ptr[2] = (int16x8_t)vshrq_n_s64((int64x2_t)beta128[2],16); beta_ptr[2] = vsetq_lane_s16(beta[19],beta_ptr[2],4); + beta_ptr[3] = (int16x8_t)vshrq_n_s64((int64x2_t)beta128[3],16); beta_ptr[3] = vsetq_lane_s16(beta[27],beta_ptr[3],4); + beta_ptr[4] = (int16x8_t)vshrq_n_s64((int64x2_t)beta128[4],16); beta_ptr[4] = vsetq_lane_s16(beta[35],beta_ptr[4],4); + beta_ptr[5] = (int16x8_t)vshrq_n_s64((int64x2_t)beta128[5],16); beta_ptr[5] = vsetq_lane_s16(beta[43],beta_ptr[5],4); + beta_ptr[6] = (int16x8_t)vshrq_n_s64((int64x2_t)beta128[6],16); beta_ptr[6] = vsetq_lane_s16(beta[51],beta_ptr[6],4); + beta_ptr[7] = (int16x8_t)vshrq_n_s64((int64x2_t)beta128[7],16); beta_ptr[7] = vsetq_lane_s16(beta[59],beta_ptr[7],4); +#endif } +#if defined(__x86_64__) || defined(__i386__) beta_ptr[0] = _mm_insert_epi16(beta_ptr[0],beta0_16,7); beta_ptr[1] = _mm_insert_epi16(beta_ptr[1],beta1_16,7); beta_ptr[2] = _mm_insert_epi16(beta_ptr[2],beta2_16,7); @@ -413,10 +571,21 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh beta_ptr[5] = _mm_insert_epi16(beta_ptr[5],beta5_16,7); beta_ptr[6] = _mm_insert_epi16(beta_ptr[6],beta6_16,7); beta_ptr[7] = _mm_insert_epi16(beta_ptr[7],beta7_16,7); +#elif defined(__arm__) + beta_ptr[0] = vsetq_lane_s16(beta0_16,beta_ptr[0],7); + beta_ptr[1] = vsetq_lane_s16(beta1_16,beta_ptr[1],7); + beta_ptr[2] = vsetq_lane_s16(beta2_16,beta_ptr[2],7); + beta_ptr[3] = 
vsetq_lane_s16(beta3_16,beta_ptr[3],7); + beta_ptr[4] = vsetq_lane_s16(beta4_16,beta_ptr[4],7); + beta_ptr[5] = vsetq_lane_s16(beta5_16,beta_ptr[5],7); + beta_ptr[6] = vsetq_lane_s16(beta6_16,beta_ptr[6],7); + beta_ptr[7] = vsetq_lane_s16(beta7_16,beta_ptr[7],7); +#endif int loopval=((rerun_flag==0)?0:((frame_length-L)>>3)); for (k=(frame_length>>3)-1; k>=loopval; k--) { +#if defined(__x86_64__) || defined(__i386__) m11_128=((__m128i*)m_11)[k]; m10_128=((__m128i*)m_10)[k]; @@ -465,7 +634,55 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh beta_ptr[5] = _mm_subs_epi16(beta_ptr[5],beta_max); beta_ptr[6] = _mm_subs_epi16(beta_ptr[6],beta_max); beta_ptr[7] = _mm_subs_epi16(beta_ptr[7],beta_max); +#elif defined(__arm__) + m11_128=((int16x8_t*)m_11)[k]; + m10_128=((int16x8_t*)m_10)[k]; + m_b0 = vqaddq_s16(beta_ptr[4],m11_128); //m11 + m_b1 = vqsubq_s16(beta_ptr[4],m11_128); //m00 + m_b2 = vqsubq_s16(beta_ptr[5],m10_128); //m01 + m_b3 = vqaddq_s16(beta_ptr[5],m10_128); //m10 + m_b4 = vqaddq_s16(beta_ptr[6],m10_128); //m10 + m_b5 = vqsubq_s16(beta_ptr[6],m10_128); //m01 + m_b6 = vqsubq_s16(beta_ptr[7],m11_128); //m00 + m_b7 = vqaddq_s16(beta_ptr[7],m11_128); //m11 + + new0 = vqsubq_s16(beta_ptr[0],m11_128); //m00 + new1 = vqaddq_s16(beta_ptr[0],m11_128); //m11 + new2 = vqaddq_s16(beta_ptr[1],m10_128); //m10 + new3 = vqsubq_s16(beta_ptr[1],m10_128); //m01 + new4 = vqsubq_s16(beta_ptr[2],m10_128); //m01 + new5 = vqaddq_s16(beta_ptr[2],m10_128); //m10 + new6 = vqaddq_s16(beta_ptr[3],m11_128); //m11 + new7 = vqsubq_s16(beta_ptr[3],m11_128); //m00 + + beta_ptr-=8; + beta_ptr[0] = vmaxq_s16(m_b0,new0); + beta_ptr[1] = vmaxq_s16(m_b1,new1); + beta_ptr[2] = vmaxq_s16(m_b2,new2); + beta_ptr[3] = vmaxq_s16(m_b3,new3); + beta_ptr[4] = vmaxq_s16(m_b4,new4); + beta_ptr[5] = vmaxq_s16(m_b5,new5); + beta_ptr[6] = vmaxq_s16(m_b6,new6); + beta_ptr[7] = vmaxq_s16(m_b7,new7); + + beta_max = vmaxq_s16(beta_ptr[0],beta_ptr[1]); + beta_max = vmaxq_s16(beta_max ,beta_ptr[2]); + beta_max = vmaxq_s16(beta_max ,beta_ptr[3]); + beta_max = vmaxq_s16(beta_max ,beta_ptr[4]); + beta_max = vmaxq_s16(beta_max ,beta_ptr[5]); + beta_max = vmaxq_s16(beta_max ,beta_ptr[6]); + beta_max = vmaxq_s16(beta_max ,beta_ptr[7]); + + beta_ptr[0] = vqsubq_s16(beta_ptr[0],beta_max); + beta_ptr[1] = vqsubq_s16(beta_ptr[1],beta_max); + beta_ptr[2] = vqsubq_s16(beta_ptr[2],beta_max); + beta_ptr[3] = vqsubq_s16(beta_ptr[3],beta_max); + beta_ptr[4] = vqsubq_s16(beta_ptr[4],beta_max); + beta_ptr[5] = vqsubq_s16(beta_ptr[5],beta_max); + beta_ptr[6] = vqsubq_s16(beta_ptr[6],beta_max); + beta_ptr[7] = vqsubq_s16(beta_ptr[7],beta_max); +#endif } @@ -477,6 +694,7 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh void compute_ext16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext, llr_t* systematic,unsigned short frame_length) { +#if defined(__x86_64__) || defined(__i386__) __m128i *alpha128=(__m128i *)alpha; __m128i *beta128=(__m128i *)beta; __m128i *m11_128,*m10_128,*ext_128; @@ -485,6 +703,17 @@ void compute_ext16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext, __m128i m01_1,m01_2,m01_3,m01_4; __m128i m10_1,m10_2,m10_3,m10_4; __m128i m11_1,m11_2,m11_3,m11_4; +#elif defined(__arm__) + int16x8_t *alpha128=(int16x8_t *)alpha; + int16x8_t *beta128=(int16x8_t *)beta; + int16x8_t *m11_128,*m10_128,*ext_128; + int16x8_t *alpha_ptr,*beta_ptr; + int16x8_t m00_1,m00_2,m00_3,m00_4; + int16x8_t m01_1,m01_2,m01_3,m01_4; + int16x8_t m10_1,m10_2,m10_3,m10_4; + int16x8_t 
m11_1,m11_2,m11_3,m11_4; +#endif + int k; // @@ -501,9 +730,11 @@ void compute_ext16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext, for (k=0; k<(frame_length>>3); k++) { +#if defined(__x86_64__) || defined(__i386__) m11_128 = (__m128i*)&m_11[k<<3]; m10_128 = (__m128i*)&m_10[k<<3]; ext_128 = (__m128i*)&ext[k<<3]; + /* printf("EXT %03d\n",k); print_shorts("a0:",&alpha_ptr[0]); @@ -594,7 +825,54 @@ void compute_ext16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext, print_shorts("m01_1:",&m01_1); print_shorts("syst:",systematic_128); */ - +#elif defined(__arm__) + m11_128 = (int16x8_t*)&m_11[k<<3]; + m10_128 = (int16x8_t*)&m_10[k<<3]; + ext_128 = (int16x8_t*)&ext[k<<3]; + + m00_4 = vqaddq_s16(alpha_ptr[7],beta_ptr[3]); //ALPHA_BETA_4m00; + m11_4 = vqaddq_s16(alpha_ptr[7],beta_ptr[7]); //ALPHA_BETA_4m11; + m00_3 = vqaddq_s16(alpha_ptr[6],beta_ptr[7]); //ALPHA_BETA_3m00; + m11_3 = vqaddq_s16(alpha_ptr[6],beta_ptr[3]); //ALPHA_BETA_3m11; + m00_2 = vqaddq_s16(alpha_ptr[1],beta_ptr[4]); //ALPHA_BETA_2m00; + m11_2 = vqaddq_s16(alpha_ptr[1],beta_ptr[0]); //ALPHA_BETA_2m11; + m11_1 = vqaddq_s16(alpha_ptr[0],beta_ptr[4]); //ALPHA_BETA_1m11; + m00_1 = vqaddq_s16(alpha_ptr[0],beta_ptr[0]); //ALPHA_BETA_1m00; + m01_4 = vqaddq_s16(alpha_ptr[5],beta_ptr[6]); //ALPHA_BETA_4m01; + m10_4 = vqaddq_s16(alpha_ptr[5],beta_ptr[2]); //ALPHA_BETA_4m10; + m01_3 = vqaddq_s16(alpha_ptr[4],beta_ptr[2]); //ALPHA_BETA_3m01; + m10_3 = vqaddq_s16(alpha_ptr[4],beta_ptr[6]); //ALPHA_BETA_3m10; + m01_2 = vqaddq_s16(alpha_ptr[3],beta_ptr[1]); //ALPHA_BETA_2m01; + m10_2 = vqaddq_s16(alpha_ptr[3],beta_ptr[5]); //ALPHA_BETA_2m10; + m10_1 = vqaddq_s16(alpha_ptr[2],beta_ptr[1]); //ALPHA_BETA_1m10; + m01_1 = vqaddq_s16(alpha_ptr[2],beta_ptr[5]); //ALPHA_BETA_1m01; + + m01_1 = vmaxq_s16(m01_1,m01_2); + m01_1 = vmaxq_s16(m01_1,m01_3); + m01_1 = vmaxq_s16(m01_1,m01_4); + m00_1 = vmaxq_s16(m00_1,m00_2); + m00_1 = vmaxq_s16(m00_1,m00_3); + m00_1 = vmaxq_s16(m00_1,m00_4); + m10_1 = vmaxq_s16(m10_1,m10_2); + m10_1 = vmaxq_s16(m10_1,m10_3); + m10_1 = vmaxq_s16(m10_1,m10_4); + m11_1 = vmaxq_s16(m11_1,m11_2); + m11_1 = vmaxq_s16(m11_1,m11_3); + m11_1 = vmaxq_s16(m11_1,m11_4); + + + m01_1 = vqsubq_s16(m01_1,*m10_128); + m00_1 = vqsubq_s16(m00_1,*m11_128); + m10_1 = vqaddq_s16(m10_1,*m10_128); + m11_1 = vqaddq_s16(m11_1,*m11_128); + + + m01_1 = vmaxq_s16(m01_1,m00_1); + m10_1 = vmaxq_s16(m10_1,m11_1); + + + *ext_128 = vqsubq_s16(m10_1,m01_1); +#endif alpha_ptr+=8; beta_ptr+=8; } @@ -703,15 +981,23 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y, int *pi2_p,*pi4_p,*pi5_p,*pi6_p; llr_t *s,*s1,*s2,*yp1,*yp2,*yp; - __m128i *yp128; unsigned int i,j,iind;//,pi; unsigned char iteration_cnt=0; unsigned int crc,oldcrc,crc_len; uint8_t temp; +#if defined(__x86_64__) || defined(__i386__) + __m128i *yp128; __m128i tmp, zeros=_mm_setzero_si128(); - register __m128i tmpe; +#elif defined(__arm__) + int16x8_t *yp128; +// int16x8_t tmp128[(n+8)>>3]; + int16x8_t tmp, zeros=vdupq_n_s16(0); + const uint16_t __attribute__ ((aligned (16))) _Powers[8]= + { 1, 2, 4, 8, 16, 32, 64, 128}; + uint16x8_t Powers= vld1q_u16(_Powers); +#endif int offset8_flag=0; if (crc_type > 3) { @@ -749,7 +1035,11 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y, crc_len=3; } +#if defined(__x86_64__) || defined(__i386__) yp128 = (__m128i*)y; +#elif defined(__arm__) + yp128 = (int16x8_t*)y; +#endif @@ -767,7 +1057,7 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y, j=pi2_p[0]; - +#if defined(__x86_64__) || defined(__i386__) tmpe = 
_mm_load_si128(yp128); s[j] = _mm_extract_epi16(tmpe,0); @@ -826,6 +1116,46 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y, yp2[j] = _mm_extract_epi16(tmpe,7); // printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]); +#elif defined(__arm__) + s[j] = vgetq_lane_s16(yp128[0],0); + yp1[j] = vgetq_lane_s16(yp128[0],1); + yp2[j] = vgetq_lane_s16(yp128[0],2); + + j=pi2_p[1]; + s[j] = vgetq_lane_s16(yp128[0],3); + yp1[j] = vgetq_lane_s16(yp128[0],4); + yp2[j] = vgetq_lane_s16(yp128[0],5); + + j=pi2_p[2]; + s[j] = vgetq_lane_s16(yp128[0],6); + yp1[j] = vgetq_lane_s16(yp128[0],7); + yp2[j] = vgetq_lane_s16(yp128[1],0); + + j=pi2_p[3]; + s[j] = vgetq_lane_s16(yp128[1],1); + yp1[j] = vgetq_lane_s16(yp128[1],2); + yp2[j] = vgetq_lane_s16(yp128[1],3); + + j=pi2_p[4]; + s[j] = vgetq_lane_s16(yp128[1],4); + yp1[j] = vgetq_lane_s16(yp128[1],5); + yp2[j] = vgetq_lane_s16(yp128[1],6); + + j=pi2_p[5]; + s[j] = vgetq_lane_s16(yp128[1],7); + yp1[j] = vgetq_lane_s16(yp128[2],0); + yp2[j] = vgetq_lane_s16(yp128[2],1); + + j=pi2_p[6]; + s[j] = vgetq_lane_s16(yp128[2],2); + yp1[j] = vgetq_lane_s16(yp128[2],3); + yp2[j] = vgetq_lane_s16(yp128[2],4); + + j=pi2_p[7]; + s[j] = vgetq_lane_s16(yp128[2],5); + yp1[j] = vgetq_lane_s16(yp128[2],6); + yp2[j] = vgetq_lane_s16(yp128[2],7); +#endif yp128+=3; } @@ -879,7 +1209,7 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y, pi4_p=pi4tab16[iind]; for (i=0; i<(n>>3); i++) { // steady-state portion - +#if defined(__x86_64__) || defined(__i386__) ((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],((llr_t*)ext)[*pi4_p++],0); ((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],((llr_t*)ext)[*pi4_p++],1); ((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],((llr_t*)ext)[*pi4_p++],2); @@ -888,6 +1218,16 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y, ((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],((llr_t*)ext)[*pi4_p++],5); ((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],((llr_t*)ext)[*pi4_p++],6); ((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],((llr_t*)ext)[*pi4_p++],7); +#elif defined(__arm__) + ((int16x8_t*)systematic2)[i]=vsetq_lane_s16(((llr_t*)ext)[*pi4_p++],((int16x8_t*)systematic2)[i],0); + ((int16x8_t*)systematic2)[i]=vsetq_lane_s16(((llr_t*)ext)[*pi4_p++],((int16x8_t*)systematic2)[i],1); + ((int16x8_t*)systematic2)[i]=vsetq_lane_s16(((llr_t*)ext)[*pi4_p++],((int16x8_t*)systematic2)[i],2); + ((int16x8_t*)systematic2)[i]=vsetq_lane_s16(((llr_t*)ext)[*pi4_p++],((int16x8_t*)systematic2)[i],3); + ((int16x8_t*)systematic2)[i]=vsetq_lane_s16(((llr_t*)ext)[*pi4_p++],((int16x8_t*)systematic2)[i],4); + ((int16x8_t*)systematic2)[i]=vsetq_lane_s16(((llr_t*)ext)[*pi4_p++],((int16x8_t*)systematic2)[i],5); + ((int16x8_t*)systematic2)[i]=vsetq_lane_s16(((llr_t*)ext)[*pi4_p++],((int16x8_t*)systematic2)[i],6); + ((int16x8_t*)systematic2)[i]=vsetq_lane_s16(((llr_t*)ext)[*pi4_p++],((int16x8_t*)systematic2)[i],7); +#endif } stop_meas(intl1_stats); @@ -901,6 +1241,7 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y, pi5_p=pi5tab16[iind]; for (i=0; i<(n>>3); i++) { +#if defined(__x86_64__) || defined(__i386__) tmp=_mm_insert_epi16(tmp,ext2[*pi5_p++],0); tmp=_mm_insert_epi16(tmp,ext2[*pi5_p++],1); tmp=_mm_insert_epi16(tmp,ext2[*pi5_p++],2); @@ -910,6 +1251,17 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y, tmp=_mm_insert_epi16(tmp,ext2[*pi5_p++],6); 
tmp=_mm_insert_epi16(tmp,ext2[*pi5_p++],7); ((__m128i *)systematic1)[i] = _mm_adds_epi16(_mm_subs_epi16(tmp,((__m128i*)ext)[i]),((__m128i *)systematic0)[i]); +#elif defined(__arm__) + tmp=vsetq_lane_s16(ext2[*pi5_p++],tmp,0); + tmp=vsetq_lane_s16(ext2[*pi5_p++],tmp,1); + tmp=vsetq_lane_s16(ext2[*pi5_p++],tmp,2); + tmp=vsetq_lane_s16(ext2[*pi5_p++],tmp,3); + tmp=vsetq_lane_s16(ext2[*pi5_p++],tmp,4); + tmp=vsetq_lane_s16(ext2[*pi5_p++],tmp,5); + tmp=vsetq_lane_s16(ext2[*pi5_p++],tmp,6); + tmp=vsetq_lane_s16(ext2[*pi5_p++],tmp,7); + ((int16x8_t *)systematic1)[i] = vqaddq_s16(vqsubq_s16(tmp,((int16x8_t*)ext)[i]),((int16x8_t *)systematic0)[i]); +#endif } if (iteration_cnt>1) { @@ -917,6 +1269,7 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y, pi6_p=pi6tab16[iind]; for (i=0; i<(n>>3); i++) { +#if defined(__x86_64__) || defined(__i386__) tmp=_mm_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p++],7); tmp=_mm_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p++],6); tmp=_mm_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p++],5); @@ -927,7 +1280,24 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y, tmp=_mm_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p++],0); tmp=_mm_cmpgt_epi8(_mm_packs_epi16(tmp,zeros),zeros); decoded_bytes[i]=(unsigned char)_mm_movemask_epi8(tmp); - +#elif defined(__arm__) + tmp=vsetq_lane_s16(ext2[*pi6_p++],tmp,7); + tmp=vsetq_lane_s16(ext2[*pi6_p++],tmp,6); + tmp=vsetq_lane_s16(ext2[*pi6_p++],tmp,5); + tmp=vsetq_lane_s16(ext2[*pi6_p++],tmp,4); + tmp=vsetq_lane_s16(ext2[*pi6_p++],tmp,3); + tmp=vsetq_lane_s16(ext2[*pi6_p++],tmp,2); + tmp=vsetq_lane_s16(ext2[*pi6_p++],tmp,1); + tmp=vsetq_lane_s16(ext2[*pi6_p++],tmp,0); +// This does: +// [1 2 4 8 16 32 64 128] .* I(ext_i > 0) = 2.^[b0 b1 b2 b3 b4 b5 b6 b7], where bi =I(ext_i > 0) +// [2^b0 + 2^b1 2^b2 + 2^b3 2^b4 + 2^b5 2^b6 + 2^b7] +// [2^b0 + 2^b1 + 2^b2 + 2^b3 2^b4 + 2^b5 + 2^b6 + 2^b7] +// Mask64 = 2^b0 + 2^b1 + 2^b2 + 2^b3 + 2^b4 + 2^b5 + 2^b6 + 2^b7 + uint64x2_t Mask = vpaddlq_u32(vpaddlq_u16(vandq_u16(vcgtq_s16(tmp,zeros), Powers))); + uint64x1_t Mask64 = vget_high_u64(Mask)+vget_low_u64(Mask); + decoded_bytes[i] = (uint8_t)Mask64; +#endif } } @@ -983,14 +1353,23 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y, // do log_map from first parity bit if (iteration_cnt < max_iterations) { log_map16(systematic1,yparity1,m11,m10,alpha,beta,ext,n,0,F,offset8_flag,alpha_stats,beta_stats,gamma_stats,ext_stats); +#if defined(__x86_64__) || defined(__i386__) __m128i* ext_128=(__m128i*) ext; __m128i* s1_128=(__m128i*) systematic1; __m128i* s0_128=(__m128i*) systematic0; - +#elif defined(__arm__) + int16x8_t* ext_128=(int16x8_t*) ext; + int16x8_t* s1_128=(int16x8_t*) systematic1; + int16x8_t* s0_128=(int16x8_t*) systematic0; +#endif int myloop=n>>3; for (i=0; i<myloop; i++) { +#if defined(__x86_64__) || defined(__i386__) *ext_128=_mm_adds_epi16(_mm_subs_epi16(*ext_128,*s1_128++),*s0_128++); +#elif defined(__arm__) + *ext_128=vqaddq_s16(vqsubq_s16(*ext_128,*s1_128++),*s0_128++); +#endif ext_128++; } } @@ -998,8 +1377,10 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y, // printf("crc %x, oldcrc %x\n",crc,oldcrc); +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); +#endif return(iteration_cnt); } diff --git a/openair1/PHY/CODING/3gpplte_turbo_decoder_sse_8bit.c b/openair1/PHY/CODING/3gpplte_turbo_decoder_sse_8bit.c index fecfe8fa7f..806af15086 100644 --- a/openair1/PHY/CODING/3gpplte_turbo_decoder_sse_8bit.c +++ b/openair1/PHY/CODING/3gpplte_turbo_decoder_sse_8bit.c @@ -100,14 +100,13 @@ void 
compute_beta8(llr_t*alpha, llr_t* beta,llr_t* m11,llr_t* m10, unsigned shor void compute_ext8(llr_t* alpha,llr_t* beta,llr_t* m11,llr_t* m10,llr_t* extrinsic, llr_t* ap, unsigned short frame_length); -void print_bytes(char *s, __m128i *x) +void print_bytes(char *s, int8_t *x) { - int8_t *tempb = (int8_t *)x; printf("%s : %d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d\n",s, - tempb[0],tempb[1],tempb[2],tempb[3],tempb[4],tempb[5],tempb[6],tempb[7], - tempb[8],tempb[9],tempb[10],tempb[11],tempb[12],tempb[13],tempb[14],tempb[15]); + x[0],x[1],x[2],x[3],x[4],x[5],x[6],x[7], + x[8],x[9],x[10],x[11],x[12],x[13],x[14],x[15]); } @@ -153,32 +152,47 @@ void compute_gamma8(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity, unsigned short frame_length,unsigned char term_flag) { int k,K1; +#if defined(__x86_64__)||defined(__i386__) __m128i *systematic128 = (__m128i *)systematic; __m128i *y_parity128 = (__m128i *)y_parity; __m128i *m10_128 = (__m128i *)m10; __m128i *m11_128 = (__m128i *)m11; +#elif defined(__arm__) + int8x16_t *systematic128 = (int8x16_t *)systematic; + int8x16_t *y_parity128 = (int8x16_t *)y_parity; + int8x16_t *m10_128 = (int8x16_t *)m10; + int8x16_t *m11_128 = (int8x16_t *)m11; +#endif #ifdef DEBUG_LOGMAP msg("compute_gamma, %p,%p,%p,%p,framelength %d\n",m11,m10,systematic,y_parity,frame_length); #endif +#if defined(__x86_64__) || defined(__i386__) register __m128i sl,sh,ypl,yph; //K128=_mm_set1_epi8(-128); +#endif K1 = (frame_length>>4); for (k=0; k<K1; k++) { +#if defined(__x86_64__) || defined(__i386__) sl = _mm_cvtepi8_epi16(systematic128[k]); - sh = _mm_cvtepi8_epi16(_mm_srli_si128(systematic128[k],8)); + sh = _mm_cvtepi8_epi16(_mm_srli_si128(systematic128[k],8)); ypl = _mm_cvtepi8_epi16(y_parity128[k]); yph = _mm_cvtepi8_epi16(_mm_srli_si128(y_parity128[k],8)); m11_128[k] = _mm_packs_epi16(_mm_srai_epi16(_mm_adds_epi16(sl,ypl),1), _mm_srai_epi16(_mm_adds_epi16(sh,yph),1)); m10_128[k] = _mm_packs_epi16(_mm_srai_epi16(_mm_subs_epi16(sl,ypl),1), _mm_srai_epi16(_mm_subs_epi16(sh,yph),1)); +#elif defined(__arm__) + m11_128[k] = vhaddq_s8(systematic128[k],y_parity128[k]); + m10_128[k] = vhsubq_s8(systematic128[k],y_parity128[k]); +#endif } // Termination +#if defined(__x86_64__) || defined(__i386__) sl = _mm_cvtepi8_epi16(systematic128[k+term_flag]); sh = _mm_cvtepi8_epi16(_mm_srli_si128(systematic128[k],8)); ypl = _mm_cvtepi8_epi16(y_parity128[k+term_flag]); @@ -187,7 +201,10 @@ void compute_gamma8(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity, _mm_srai_epi16(_mm_adds_epi16(sh,yph),1)); m10_128[k] = _mm_packs_epi16(_mm_srai_epi16(_mm_subs_epi16(sl,ypl),1), _mm_srai_epi16(_mm_subs_epi16(sh,yph),1)); - +#elif defined(__arm__) + m11_128[k] = vhaddq_s8(systematic128[k+term_flag],y_parity128[k]); + m10_128[k] = vhsubq_s8(systematic128[k+term_flag],y_parity128[k]); +#endif } @@ -196,14 +213,24 @@ void compute_gamma8(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity, void compute_alpha8(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned short frame_length,unsigned char F) { int k,loopval,rerun_flag; + +#if defined(__x86_64__) || defined(__i386__) __m128i *alpha128=(__m128i *)alpha,*alpha_ptr; __m128i *m11p,*m10p; __m128i m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7; __m128i new0,new1,new2,new3,new4,new5,new6,new7; __m128i alpha_max; - +#elif defined(__arm__) + int8x16_t *alpha128=(int8x16_t *)alpha,*alpha_ptr; + int8x16_t *m11p,*m10p; + int8x16_t m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7; + int8x16_t new0,new1,new2,new3,new4,new5,new6,new7; + int8x16_t 
alpha_max; +#endif // Set initial state: first colum is known // the other columns are unknown, so all states are set to same value + +#if defined(__x86_64__) || defined(__i386__) alpha128[0] = _mm_set_epi8(-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,0); alpha128[1] = _mm_set_epi8(-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2); alpha128[2] = _mm_set_epi8(-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2); @@ -212,10 +239,10 @@ void compute_alpha8(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned sh alpha128[5] = _mm_set_epi8(-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2); alpha128[6] = _mm_set_epi8(-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2); alpha128[7] = _mm_set_epi8(-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2); - for (loopval=frame_length>>4, rerun_flag=0; rerun_flag<2; loopval=L, rerun_flag++) { alpha_ptr = &alpha128[0]; + m11p = (__m128i*)m_11; m10p = (__m128i*)m_10; @@ -289,6 +316,95 @@ void compute_alpha8(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned sh alpha[112] = -MAX8/2; } +#elif defined(__arm__) + alpha128[0] = vdupq_n_s8(-MAX8/2); + alpha128[0] = vsetq_lane_s8(0,alpha128[0],0); + alpha128[1] = vdupq_n_s8(-MAX8/2); + alpha128[2] = vdupq_n_s8(-MAX8/2); + alpha128[3] = vdupq_n_s8(-MAX8/2); + alpha128[4] = vdupq_n_s8(-MAX8/2); + alpha128[5] = vdupq_n_s8(-MAX8/2); + alpha128[6] = vdupq_n_s8(-MAX8/2); + alpha128[7] = vdupq_n_s8(-MAX8/2); + for (loopval=frame_length>>4, rerun_flag=0; rerun_flag<2; loopval=L, rerun_flag++) { + + alpha_ptr = &alpha128[0]; + + m11p = (int8x16_t*)m_11; + m10p = (int8x16_t*)m_10; + + for (k=0; k<loopval; k++) { + m_b0 = vqaddq_s8(alpha_ptr[1],*m11p); // m11 + m_b4 = vqsubq_s8(alpha_ptr[1],*m11p); // m00=-m11 + m_b1 = vqsubq_s8(alpha_ptr[3],*m10p); // m01=-m10 + m_b5 = vqaddq_s8(alpha_ptr[3],*m10p); // m10 + m_b2 = vqaddq_s8(alpha_ptr[5],*m10p); // m10 + m_b6 = vqsubq_s8(alpha_ptr[5],*m10p); // m01=-m10 + m_b3 = vqsubq_s8(alpha_ptr[7],*m11p); // m00=-m11 + m_b7 = vqaddq_s8(alpha_ptr[7],*m11p); // m11 + + new0 = vqsubq_s8(alpha_ptr[0],*m11p); // m00=-m11 + new4 = vqaddq_s8(alpha_ptr[0],*m11p); // m11 + new1 = vqaddq_s8(alpha_ptr[2],*m10p); // m10 + new5 = vqsubq_s8(alpha_ptr[2],*m10p); // m01=-m10 + new2 = vqsubq_s8(alpha_ptr[4],*m10p); // m01=-m10 + new6 = vqaddq_s8(alpha_ptr[4],*m10p); // m10 + new3 = vqaddq_s8(alpha_ptr[6],*m11p); // m11 + new7 = vqsubq_s8(alpha_ptr[6],*m11p); // m00=-m11 + + alpha_ptr += 8; + m11p++; + m10p++; + alpha_ptr[0] = vmaxq_s8(m_b0,new0); + alpha_ptr[1] = vmaxq_s8(m_b1,new1); + alpha_ptr[2] = vmaxq_s8(m_b2,new2); + alpha_ptr[3] = vmaxq_s8(m_b3,new3); + alpha_ptr[4] = vmaxq_s8(m_b4,new4); + alpha_ptr[5] = vmaxq_s8(m_b5,new5); + alpha_ptr[6] = vmaxq_s8(m_b6,new6); + alpha_ptr[7] = vmaxq_s8(m_b7,new7); + + // compute and subtract maxima + alpha_max = vmaxq_s8(alpha_ptr[0],alpha_ptr[1]); + alpha_max = vmaxq_s8(alpha_max,alpha_ptr[2]); + alpha_max = vmaxq_s8(alpha_max,alpha_ptr[3]); + alpha_max = vmaxq_s8(alpha_max,alpha_ptr[4]); + alpha_max = vmaxq_s8(alpha_max,alpha_ptr[5]); + alpha_max = vmaxq_s8(alpha_max,alpha_ptr[6]); + alpha_max 
= vmaxq_s8(alpha_max,alpha_ptr[7]);
+
+      alpha_ptr[0] = vqsubq_s8(alpha_ptr[0],alpha_max);
+      alpha_ptr[1] = vqsubq_s8(alpha_ptr[1],alpha_max);
+      alpha_ptr[2] = vqsubq_s8(alpha_ptr[2],alpha_max);
+      alpha_ptr[3] = vqsubq_s8(alpha_ptr[3],alpha_max);
+      alpha_ptr[4] = vqsubq_s8(alpha_ptr[4],alpha_max);
+      alpha_ptr[5] = vqsubq_s8(alpha_ptr[5],alpha_max);
+      alpha_ptr[6] = vqsubq_s8(alpha_ptr[6],alpha_max);
+      alpha_ptr[7] = vqsubq_s8(alpha_ptr[7],alpha_max);
+    }
+
+    // Set initial state for the next iteration from the last state,
+    // as a column's end states are the first states of the next column
+    int K1= frame_length>>1;
+    alpha128[0] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[K1],8);   alpha128[0] = vsetq_lane_s8(alpha[8],alpha128[0],7);
+    alpha128[1] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[1+K1],8); alpha128[1] = vsetq_lane_s8(alpha[24],alpha128[1],7);
+    alpha128[2] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[2+K1],8); alpha128[2] = vsetq_lane_s8(alpha[40],alpha128[2],7);
+    alpha128[3] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[3+K1],8); alpha128[3] = vsetq_lane_s8(alpha[56],alpha128[3],7);
+    alpha128[4] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[4+K1],8); alpha128[4] = vsetq_lane_s8(alpha[72],alpha128[4],7);
+    alpha128[5] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[5+K1],8); alpha128[5] = vsetq_lane_s8(alpha[88],alpha128[5],7);
+    alpha128[6] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[6+K1],8); alpha128[6] = vsetq_lane_s8(alpha[104],alpha128[6],7);
+    alpha128[7] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[7+K1],8); alpha128[7] = vsetq_lane_s8(alpha[120],alpha128[7],7);
+    alpha[16] = -MAX8/2;
+    alpha[32] = -MAX8/2;
+    alpha[48] = -MAX8/2;
+    alpha[64] = -MAX8/2;
+    alpha[80] = -MAX8/2;
+    alpha[96] = -MAX8/2;
+    alpha[112] = -MAX8/2;
+
+  }
+#endif
+
 }
@@ -297,13 +413,21 @@ void compute_beta8(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sho
 {
   int k,rerun_flag, loopval;
+#if defined(__x86_64__) || defined(__i386__)
   __m128i m11_128,m10_128;
   __m128i m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7;
   __m128i new0,new1,new2,new3,new4,new5,new6,new7;
   __m128i *beta128,*alpha128,*beta_ptr;
   __m128i beta_max;
+#elif defined(__arm__)
+  int8x16_t m11_128,m10_128;
+  int8x16_t m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7;
+  int8x16_t new0,new1,new2,new3,new4,new5,new6,new7;
+  int8x16_t *beta128,*alpha128,*beta_ptr;
+  int8x16_t beta_max;
+#endif
   llr_t beta0,beta1;
   llr_t beta2,beta3,beta4,beta5,beta6,beta7;
@@ -371,8 +495,14 @@ void compute_beta8(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sho
   // we are supposed to run compute_alpha just before compute_beta
   // so the initial states of backward computation can be set from last value of alpha states (forward computation)
+
+#if defined(__x86_64__) || defined(__i386__)
   beta_ptr = (__m128i*)&beta[frame_length<<3];
   alpha128 = (__m128i*)&alpha[0];
+#elif defined(__arm__)
+  beta_ptr = (int8x16_t*)&beta[frame_length<<3];
+  alpha128 = (int8x16_t*)&alpha[0];
+#endif
   beta_ptr[0] = alpha128[(frame_length>>1)];
   beta_ptr[1] = alpha128[1+(frame_length>>1)];
   beta_ptr[2] = alpha128[2+(frame_length>>1)];
@@ -391,6 +521,7 @@
     // workaround: init with 0
     beta0 = beta1 = beta2 = beta3 = beta4 = beta5 = beta6 = beta7 = 0;
+#if defined(__x86_64__) || defined(__i386__)
     beta_ptr[0] = _mm_insert_epi8(beta_ptr[0],beta0,15);
     beta_ptr[1] = _mm_insert_epi8(beta_ptr[1],beta1,15);
     beta_ptr[2] = _mm_insert_epi8(beta_ptr[2],beta2,15);
@@ -399,12 +530,27 @@ void compute_beta8(llr_t* alpha,llr_t*
beta,llr_t *m_11,llr_t* m_10,unsigned sho beta_ptr[5] = _mm_insert_epi8(beta_ptr[5],beta5,15); beta_ptr[6] = _mm_insert_epi8(beta_ptr[6],beta6,15); beta_ptr[7] = _mm_insert_epi8(beta_ptr[7],beta7,15); +#elif defined(__arm__) + beta_ptr[0] = vsetq_lane_s8(beta0,beta_ptr[0],15); + beta_ptr[1] = vsetq_lane_s8(beta1,beta_ptr[1],15); + beta_ptr[2] = vsetq_lane_s8(beta2,beta_ptr[2],15); + beta_ptr[3] = vsetq_lane_s8(beta3,beta_ptr[3],15); + beta_ptr[4] = vsetq_lane_s8(beta4,beta_ptr[4],15); + beta_ptr[5] = vsetq_lane_s8(beta5,beta_ptr[5],15); + beta_ptr[6] = vsetq_lane_s8(beta6,beta_ptr[6],15); + beta_ptr[7] = vsetq_lane_s8(beta7,beta_ptr[7],15); +#endif } - for (k=(frame_length>>4)-1, beta_ptr = (__m128i*)&beta[frame_length<<3] ; +#if defined(__x86_64__) || defined(__i386__) + beta_ptr = (__m128i*)&beta[frame_length<<3]; +#elif defined(__arm__) + beta_ptr = (int8x16_t*)&beta[frame_length<<3]; +#endif + for (k=(frame_length>>4)-1; k>=loopval; k--) { - +#if defined(__x86_64__) || defined(__i386__) m11_128=((__m128i*)m_11)[k]; m10_128=((__m128i*)m_10)[k]; m_b0 = _mm_adds_epi8(beta_ptr[4],m11_128); //m11 @@ -452,12 +598,62 @@ void compute_beta8(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sho beta_ptr[5] = _mm_subs_epi8(beta_ptr[5],beta_max); beta_ptr[6] = _mm_subs_epi8(beta_ptr[6],beta_max); beta_ptr[7] = _mm_subs_epi8(beta_ptr[7],beta_max); +#elif defined(__arm__) + m11_128=((int8x16_t*)m_11)[k]; + m10_128=((int8x16_t*)m_10)[k]; + m_b0 = vqaddq_s8(beta_ptr[4],m11_128); //m11 + m_b1 = vqsubq_s8(beta_ptr[4],m11_128); //m00 + m_b2 = vqsubq_s8(beta_ptr[5],m10_128); //m01 + m_b3 = vqaddq_s8(beta_ptr[5],m10_128); //m10 + m_b4 = vqaddq_s8(beta_ptr[6],m10_128); //m10 + m_b5 = vqsubq_s8(beta_ptr[6],m10_128); //m01 + m_b6 = vqsubq_s8(beta_ptr[7],m11_128); //m00 + m_b7 = vqaddq_s8(beta_ptr[7],m11_128); //m11 + + new0 = vqsubq_s8(beta_ptr[0],m11_128); //m00 + new1 = vqaddq_s8(beta_ptr[0],m11_128); //m11 + new2 = vqaddq_s8(beta_ptr[1],m10_128); //m10 + new3 = vqsubq_s8(beta_ptr[1],m10_128); //m01 + new4 = vqsubq_s8(beta_ptr[2],m10_128); //m01 + new5 = vqaddq_s8(beta_ptr[2],m10_128); //m10 + new6 = vqaddq_s8(beta_ptr[3],m11_128); //m11 + new7 = vqsubq_s8(beta_ptr[3],m11_128); //m00 + + beta_ptr-=8; + beta_ptr[0] = vmaxq_s8(m_b0,new0); + beta_ptr[1] = vmaxq_s8(m_b1,new1); + beta_ptr[2] = vmaxq_s8(m_b2,new2); + beta_ptr[3] = vmaxq_s8(m_b3,new3); + beta_ptr[4] = vmaxq_s8(m_b4,new4); + beta_ptr[5] = vmaxq_s8(m_b5,new5); + beta_ptr[6] = vmaxq_s8(m_b6,new6); + beta_ptr[7] = vmaxq_s8(m_b7,new7); + + beta_max = vmaxq_s8(beta_ptr[0],beta_ptr[1]); + beta_max = vmaxq_s8(beta_max ,beta_ptr[2]); + beta_max = vmaxq_s8(beta_max ,beta_ptr[3]); + beta_max = vmaxq_s8(beta_max ,beta_ptr[4]); + beta_max = vmaxq_s8(beta_max ,beta_ptr[5]); + beta_max = vmaxq_s8(beta_max ,beta_ptr[6]); + beta_max = vmaxq_s8(beta_max ,beta_ptr[7]); + + beta_ptr[0] = vqsubq_s8(beta_ptr[0],beta_max); + beta_ptr[1] = vqsubq_s8(beta_ptr[1],beta_max); + beta_ptr[2] = vqsubq_s8(beta_ptr[2],beta_max); + beta_ptr[3] = vqsubq_s8(beta_ptr[3],beta_max); + beta_ptr[4] = vqsubq_s8(beta_ptr[4],beta_max); + beta_ptr[5] = vqsubq_s8(beta_ptr[5],beta_max); + beta_ptr[6] = vqsubq_s8(beta_ptr[6],beta_max); + beta_ptr[7] = vqsubq_s8(beta_ptr[7],beta_max); +#endif } // Set intial state for next iteration from the last state // as column last states are the first states of the next column - // The initial state of colum 0 is coming from tail bits (to be computed) + // The initial state of column 0 is coming from tail bits (to be computed) + +#if 
defined(__x86_64__) || defined(__i386__) beta128 = (__m128i*)&beta[0]; beta_ptr = (__m128i*)&beta[frame_length<<3]; beta_ptr[0] = _mm_srli_si128(beta128[0],1); @@ -468,12 +664,25 @@ void compute_beta8(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sho beta_ptr[5] = _mm_srli_si128(beta128[5],1); beta_ptr[6] = _mm_srli_si128(beta128[6],1); beta_ptr[7] = _mm_srli_si128(beta128[7],1); +#elif defined(__arm__) + beta128 = (int8x16_t*)&beta[0]; + beta_ptr = (int8x16_t*)&beta[frame_length<<3]; + beta_ptr[0] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[0],8); beta_ptr[0] = vsetq_lane_s8(beta[7],beta_ptr[0],8); + beta_ptr[1] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[1],8); beta_ptr[1] = vsetq_lane_s8(beta[23],beta_ptr[1],8); + beta_ptr[2] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[2],8); beta_ptr[2] = vsetq_lane_s8(beta[39],beta_ptr[2],8); + beta_ptr[3] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[3],8); beta_ptr[3] = vsetq_lane_s8(beta[55],beta_ptr[3],8); + beta_ptr[4] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[4],8); beta_ptr[4] = vsetq_lane_s8(beta[71],beta_ptr[4],8); + beta_ptr[5] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[5],8); beta_ptr[5] = vsetq_lane_s8(beta[87],beta_ptr[5],8); + beta_ptr[6] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[6],8); beta_ptr[6] = vsetq_lane_s8(beta[103],beta_ptr[6],8); + beta_ptr[7] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[7],8); beta_ptr[7] = vsetq_lane_s8(beta[119],beta_ptr[7],8); +#endif } } void compute_ext8(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext, llr_t* systematic,unsigned short frame_length) { +#if defined(__x86_64__) || defined(__i386__) __m128i *alpha128=(__m128i *)alpha; __m128i *beta128=(__m128i *)beta; __m128i *m11_128,*m10_128,*ext_128; @@ -482,6 +691,16 @@ void compute_ext8(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext, l __m128i m01_1,m01_2,m01_3,m01_4; __m128i m10_1,m10_2,m10_3,m10_4; __m128i m11_1,m11_2,m11_3,m11_4; +#elif defined(__arm__) + int8x16_t *alpha128=(int8x16_t *)alpha; + int8x16_t *beta128=(int8x16_t *)beta; + int8x16_t *m11_128,*m10_128,*ext_128; + int8x16_t *alpha_ptr,*beta_ptr; + int8x16_t m00_1,m00_2,m00_3,m00_4; + int8x16_t m01_1,m01_2,m01_3,m01_4; + int8x16_t m10_1,m10_2,m10_3,m10_4; + int8x16_t m11_1,m11_2,m11_3,m11_4; +#endif int k; // @@ -498,6 +717,8 @@ void compute_ext8(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext, l for (k=0; k<(frame_length>>4); k++) { +#if defined(__x86_64__) || defined(__i386__) + m11_128 = (__m128i*)&m_11[k<<4]; m10_128 = (__m128i*)&m_10[k<<4]; ext_128 = (__m128i*)&ext[k<<4]; @@ -547,6 +768,59 @@ void compute_ext8(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext, l alpha_ptr+=8; beta_ptr+=8; +#elif defined(__arm__) + + m11_128 = (int8x16_t*)&m_11[k<<4]; + m10_128 = (int8x16_t*)&m_10[k<<4]; + ext_128 = (int8x16_t*)&ext[k<<4]; + + m00_4 = vqaddq_s8(alpha_ptr[7],beta_ptr[3]); //ALPHA_BETA_4m00; + m11_4 = vqaddq_s8(alpha_ptr[7],beta_ptr[7]); //ALPHA_BETA_4m11; + m00_3 = vqaddq_s8(alpha_ptr[6],beta_ptr[7]); //ALPHA_BETA_3m00; + m11_3 = vqaddq_s8(alpha_ptr[6],beta_ptr[3]); //ALPHA_BETA_3m11; + m00_2 = vqaddq_s8(alpha_ptr[1],beta_ptr[4]); //ALPHA_BETA_2m00; + m11_2 = vqaddq_s8(alpha_ptr[1],beta_ptr[0]); //ALPHA_BETA_2m11; + m11_1 = vqaddq_s8(alpha_ptr[0],beta_ptr[4]); //ALPHA_BETA_1m11; + m00_1 = vqaddq_s8(alpha_ptr[0],beta_ptr[0]); //ALPHA_BETA_1m00; + m01_4 = vqaddq_s8(alpha_ptr[5],beta_ptr[6]); //ALPHA_BETA_4m01; + m10_4 = vqaddq_s8(alpha_ptr[5],beta_ptr[2]); //ALPHA_BETA_4m10; + m01_3 = vqaddq_s8(alpha_ptr[4],beta_ptr[2]); 
//ALPHA_BETA_3m01; + m10_3 = vqaddq_s8(alpha_ptr[4],beta_ptr[6]); //ALPHA_BETA_3m10; + m01_2 = vqaddq_s8(alpha_ptr[3],beta_ptr[1]); //ALPHA_BETA_2m01; + m10_2 = vqaddq_s8(alpha_ptr[3],beta_ptr[5]); //ALPHA_BETA_2m10; + m10_1 = vqaddq_s8(alpha_ptr[2],beta_ptr[1]); //ALPHA_BETA_1m10; + m01_1 = vqaddq_s8(alpha_ptr[2],beta_ptr[5]); //ALPHA_BETA_1m01; + + m01_1 = vmaxq_s8(m01_1,m01_2); + m01_1 = vmaxq_s8(m01_1,m01_3); + m01_1 = vmaxq_s8(m01_1,m01_4); + m00_1 = vmaxq_s8(m00_1,m00_2); + m00_1 = vmaxq_s8(m00_1,m00_3); + m00_1 = vmaxq_s8(m00_1,m00_4); + m10_1 = vmaxq_s8(m10_1,m10_2); + m10_1 = vmaxq_s8(m10_1,m10_3); + m10_1 = vmaxq_s8(m10_1,m10_4); + m11_1 = vmaxq_s8(m11_1,m11_2); + m11_1 = vmaxq_s8(m11_1,m11_3); + m11_1 = vmaxq_s8(m11_1,m11_4); + + + m01_1 = vqsubq_s8(m01_1,*m10_128); + m00_1 = vqsubq_s8(m00_1,*m11_128); + m10_1 = vqaddq_s8(m10_1,*m10_128); + m11_1 = vqaddq_s8(m11_1,*m11_128); + + + m01_1 = vmaxq_s8(m01_1,m00_1); + m10_1 = vmaxq_s8(m10_1,m11_1); + + + *ext_128 = vqsubq_s8(m10_1,m01_1); + + alpha_ptr+=8; + beta_ptr+=8; + +#endif } @@ -661,14 +935,25 @@ unsigned char phy_threegpplte_turbo_decoder8(short *y, // int *pi2_p,*pi4_p,*pi5_p,*pi6_p; int *pi4_p,*pi5_p,*pi6_p; llr_t *s,*s1,*s2,*yp1,*yp2,*yp; - __m128i *yp128; + unsigned int i,j,iind;//,pi; unsigned char iteration_cnt=0; unsigned int crc,oldcrc,crc_len; uint8_t temp; +#if defined(__x86_64__) || defined(__i386__) + __m128i *yp128; __m128i tmp128[(n+8)>>3]; __m128i tmp, zeros=_mm_setzero_si128(); - +#elif defined(__arm__) + int8x16_t *yp128; + int8x16_t tmp128[(n+8)>>3]; + int8x16_t tmp, zeros=vdupq_n_s8(0); + const uint8_t __attribute__ ((aligned (16))) _Powers[16]= + { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 }; + + // Set the powers of 2 (do it once for all, if applicable) + uint8x16_t Powers= vld1q_u8(_Powers); +#endif int offset8_flag=0; @@ -713,6 +998,8 @@ unsigned char phy_threegpplte_turbo_decoder8(short *y, crc_len=3; } +#if defined(__x86_64__) || defined(__i386__) + __m128i avg=_mm_set1_epi32(0); for (i=0; i<(3*(n>>4))+1; i++) { @@ -721,7 +1008,7 @@ unsigned char phy_threegpplte_turbo_decoder8(short *y, avg=_mm_add_epi32(_mm_cvtepi16_epi32(tmp),avg); } - int round_avg=(_mm_extract_epi32(avg,0)+_mm_extract_epi32(avg,1)+_mm_extract_epi32(avg,2)+_mm_extract_epi32(avg,3))/(n*3); + int32_t round_avg=(_mm_extract_epi32(avg,0)+_mm_extract_epi32(avg,1)+_mm_extract_epi32(avg,2)+_mm_extract_epi32(avg,3))/(n*3); //printf("avg input turbo: %d sum %d taille bloc %d\n",round_avg,round_sum,n); @@ -740,6 +1027,35 @@ unsigned char phy_threegpplte_turbo_decoder8(short *y, yp128 = (__m128i*)y8; +#elif defined(__arm__) + + int32x4_t avg=vdupq_n_s32(0); + + for (i=0; i<(3*(n>>4))+1; i++) { + int16x8_t tmp=vabsq_s16(((int16x8_t*)y)[i]); + avg = vqaddq_s32(avg,vaddl_s16(((int16x4_t*)&tmp)[0],((int16x4_t*)&tmp)[1])); + } + + int32_t round_avg=(vgetq_lane_s32(avg,0)+vgetq_lane_s32(avg,1)+vgetq_lane_s32(avg,2)+vgetq_lane_s32(avg,3))/(n*3); + + //printf("avg input turbo: %d sum %d taille bloc %d\n",round_avg,round_sum,n); + + if (round_avg < 16 ) + for (i=0,j=0; i<(3*(n2>>3))+1; i++,j+=2) + ((int8x8_t *)y8)[i] = vqmovn_s16(((int16x8_t *)y)[j]); + else if (round_avg < 32) + for (i=0,j=0; i<(3*(n2>>3))+1; i++,j+=2) + ((int8x8_t *)y8)[i] = vqmovn_s16(vshrq_n_s16(((int16x8_t *)y)[j],1)); + else if (round_avg < 64 ) + for (i=0,j=0; i<(3*(n2>>3))+1; i++,j+=2) + ((int8x8_t *)y8)[i] = vqmovn_s16(vshrq_n_s16(((int16x8_t *)y)[j],2)); + else + for (i=0,j=0; i<(3*(n2>>3))+1; i++,j+=2) + ((int8x8_t *)y8)[i] = 
vqmovn_s16(vshrq_n_s16(((int16x8_t *)y)[j],3)); + + yp128 = (int8x16_t*)y8; + +#endif s = systematic0; s1 = systematic1; @@ -764,101 +1080,198 @@ unsigned char phy_threegpplte_turbo_decoder8(short *y, pi2_p = &pi2tab8[iind][i]; j=pi2_p[0]; +#if defined(__x86_64__) || defined(__i386__) s[j] = _mm_extract_epi8(yp128[0],0); yp1[j] = _mm_extract_epi8(yp128[0],1); yp2[j] = _mm_extract_epi8(yp128[0],2); - // printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]); + j=pi2_p[1]; s[j] = _mm_extract_epi8(yp128[0],3); yp1[j] = _mm_extract_epi8(yp128[0],4); yp2[j] = _mm_extract_epi8(yp128[0],5); - // printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]); + j=pi2_p[2]; s[j] = _mm_extract_epi8(yp128[0],6); yp1[j] = _mm_extract_epi8(yp128[0],7); yp2[j] = _mm_extract_epi8(yp128[0],8); - // printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]); + j=pi2_p[3]; s[j] = _mm_extract_epi8(yp128[0],9); yp1[j] = _mm_extract_epi8(yp128[0],10); yp2[j] = _mm_extract_epi8(yp128[0],11); - // printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]); + j=pi2_p[4]; s[j] = _mm_extract_epi8(yp128[0],12); yp1[j] = _mm_extract_epi8(yp128[0],13); yp2[j] = _mm_extract_epi8(yp128[0],14); - // printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]); + j=pi2_p[5]; s[j] = _mm_extract_epi8(yp128[0],15); yp1[j] = _mm_extract_epi8(yp128[1],0); yp2[j] = _mm_extract_epi8(yp128[1],1); - // printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]); + j=pi2_p[6]; s[j] = _mm_extract_epi8(yp128[1],2); yp1[j] = _mm_extract_epi8(yp128[1],3); yp2[j] = _mm_extract_epi8(yp128[1],4); - // printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]); + j=pi2_p[7]; s[j] = _mm_extract_epi8(yp128[1],5); yp1[j] = _mm_extract_epi8(yp128[1],6); yp2[j] = _mm_extract_epi8(yp128[1],7); - // printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]); + j=pi2_p[8]; s[j] = _mm_extract_epi8(yp128[1],8); yp1[j] = _mm_extract_epi8(yp128[1],9); yp2[j] = _mm_extract_epi8(yp128[1],10); - // printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]); + j=pi2_p[9]; s[j] = _mm_extract_epi8(yp128[1],11); yp1[j] = _mm_extract_epi8(yp128[1],12); yp2[j] = _mm_extract_epi8(yp128[1],13); - // printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]); + j=pi2_p[10]; s[j] = _mm_extract_epi8(yp128[1],14); yp1[j] = _mm_extract_epi8(yp128[1],15); yp2[j] = _mm_extract_epi8(yp128[2],0); - // printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]); + j=pi2_p[11]; s[j] = _mm_extract_epi8(yp128[2],1); yp1[j] = _mm_extract_epi8(yp128[2],2); yp2[j] = _mm_extract_epi8(yp128[2],3); - // printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]); + j=pi2_p[12]; s[j] = _mm_extract_epi8(yp128[2],4); yp1[j] = _mm_extract_epi8(yp128[2],5); yp2[j] = _mm_extract_epi8(yp128[2],6); - // printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]); + j=pi2_p[13]; s[j] = _mm_extract_epi8(yp128[2],7); yp1[j] = _mm_extract_epi8(yp128[2],8); yp2[j] = _mm_extract_epi8(yp128[2],9); - // printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]); + j=pi2_p[14]; s[j] = _mm_extract_epi8(yp128[2],10); yp1[j] = _mm_extract_epi8(yp128[2],11); yp2[j] = _mm_extract_epi8(yp128[2],12); - // printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]); + j=pi2_p[15]; s[j] = _mm_extract_epi8(yp128[2],13); yp1[j] = _mm_extract_epi8(yp128[2],14); yp2[j] = _mm_extract_epi8(yp128[2],15); - // 
printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]); + +#elif defined(__arm__) + s[j] = vgetq_lane_s8(yp128[0],0); + yp1[j] = vgetq_lane_s8(yp128[0],1); + yp2[j] = vgetq_lane_s8(yp128[0],2); + + + j=pi2_p[1]; + s[j] = vgetq_lane_s8(yp128[0],3); + yp1[j] = vgetq_lane_s8(yp128[0],4); + yp2[j] = vgetq_lane_s8(yp128[0],5); + + + j=pi2_p[2]; + s[j] = vgetq_lane_s8(yp128[0],6); + yp1[j] = vgetq_lane_s8(yp128[0],7); + yp2[j] = vgetq_lane_s8(yp128[0],8); + + + j=pi2_p[3]; + s[j] = vgetq_lane_s8(yp128[0],9); + yp1[j] = vgetq_lane_s8(yp128[0],10); + yp2[j] = vgetq_lane_s8(yp128[0],11); + + + j=pi2_p[4]; + s[j] = vgetq_lane_s8(yp128[0],12); + yp1[j] = vgetq_lane_s8(yp128[0],13); + yp2[j] = vgetq_lane_s8(yp128[0],14); + + + j=pi2_p[5]; + s[j] = vgetq_lane_s8(yp128[0],15); + yp1[j] = vgetq_lane_s8(yp128[1],0); + yp2[j] = vgetq_lane_s8(yp128[1],1); + + + j=pi2_p[6]; + s[j] = vgetq_lane_s8(yp128[1],2); + yp1[j] = vgetq_lane_s8(yp128[1],3); + yp2[j] = vgetq_lane_s8(yp128[1],4); + + + j=pi2_p[7]; + s[j] = vgetq_lane_s8(yp128[1],5); + yp1[j] = vgetq_lane_s8(yp128[1],6); + yp2[j] = vgetq_lane_s8(yp128[1],7); + + + j=pi2_p[8]; + s[j] = vgetq_lane_s8(yp128[1],8); + yp1[j] = vgetq_lane_s8(yp128[1],9); + yp2[j] = vgetq_lane_s8(yp128[1],10); + + + j=pi2_p[9]; + s[j] = vgetq_lane_s8(yp128[1],11); + yp1[j] = vgetq_lane_s8(yp128[1],12); + yp2[j] = vgetq_lane_s8(yp128[1],13); + + + j=pi2_p[10]; + s[j] = vgetq_lane_s8(yp128[1],14); + yp1[j] = vgetq_lane_s8(yp128[1],15); + yp2[j] = vgetq_lane_s8(yp128[2],0); + + + j=pi2_p[11]; + s[j] = vgetq_lane_s8(yp128[2],1); + yp1[j] = vgetq_lane_s8(yp128[2],2); + yp2[j] = vgetq_lane_s8(yp128[2],3); + + + j=pi2_p[12]; + s[j] = vgetq_lane_s8(yp128[2],4); + yp1[j] = vgetq_lane_s8(yp128[2],5); + yp2[j] = vgetq_lane_s8(yp128[2],6); + + + j=pi2_p[13]; + s[j] = vgetq_lane_s8(yp128[2],7); + yp1[j] = vgetq_lane_s8(yp128[2],8); + yp2[j] = vgetq_lane_s8(yp128[2],9); + + + j=pi2_p[14]; + s[j] = vgetq_lane_s8(yp128[2],10); + yp1[j] = vgetq_lane_s8(yp128[2],11); + yp2[j] = vgetq_lane_s8(yp128[2],12); + + + j=pi2_p[15]; + s[j] = vgetq_lane_s8(yp128[2],13); + yp1[j] = vgetq_lane_s8(yp128[2],14); + yp2[j] = vgetq_lane_s8(yp128[2],15); + +#endif yp128+=3; } @@ -925,6 +1338,7 @@ unsigned char phy_threegpplte_turbo_decoder8(short *y, pi4_p=pi4tab8[iind]; for (i=0; i<(n2>>4); i++) { // steady-state portion +#if defined(__x86_64__) || defined(__i386__) tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],0); tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],1); tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],2); @@ -941,6 +1355,24 @@ unsigned char phy_threegpplte_turbo_decoder8(short *y, tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],13); tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],14); ((__m128i *)systematic2)[i]=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],15); +#elif defined(__arm__) + tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,0); + tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,1); + tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,2); + tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,3); + tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,4); + tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,5); + tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,6); + tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,7); + tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,8); + tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,9); + tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,10); + tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,11); + tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,12); + 
tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,13); + tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,14); + ((int8x16_t *)systematic2)[i]=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,15); +#endif } stop_meas(intl1_stats); @@ -956,6 +1388,7 @@ unsigned char phy_threegpplte_turbo_decoder8(short *y, if ((n2&0x7f) == 0) { // n2 is a multiple of 128 bits for (i=0; i<(n2>>4); i++) { +#if defined(__x86_64__) || defined(__i386__) tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],0); tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],1); tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],2); @@ -974,9 +1407,32 @@ unsigned char phy_threegpplte_turbo_decoder8(short *y, tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],15); decoded_bytes_interl[i]=(uint16_t) _mm_movemask_epi8(_mm_cmpgt_epi8(tmp,zeros)); ((__m128i *)systematic1)[i] = _mm_adds_epi8(_mm_subs_epi8(tmp,((__m128i*)ext)[i]),((__m128i *)systematic0)[i]); +#elif defined(__arm__) + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,0); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,1); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,2); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,3); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,4); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,5); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,6); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,7); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,8); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,9); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,10); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,11); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,12); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,13); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,14); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,15); + uint64x2_t Mask= vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8(vcgtq_s8(tmp,zeros), Powers)))); + vst1q_lane_u8(&((uint8_t*)&decoded_bytes[i])[0], (uint8x16_t)Mask, 0); + vst1q_lane_u8(&((uint8_t*)&decoded_bytes[i])[1], (uint8x16_t)Mask, 8); + ((int8x16_t *)systematic1)[i] = vqaddq_s8(vqsubq_s8(tmp,((int8x16_t*)ext)[i]),((int8x16_t *)systematic0)[i]); +#endif } } else { for (i=0; i<(n2>>4); i++) { +#if defined(__x86_64__) || defined(__i386__) tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],0); tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],1); tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],2); @@ -996,7 +1452,29 @@ unsigned char phy_threegpplte_turbo_decoder8(short *y, tmp128[i] = _mm_adds_epi8(((__m128i *)ext2)[i],((__m128i *)systematic2)[i]); ((__m128i *)systematic1)[i] = _mm_adds_epi8(_mm_subs_epi8(tmp,((__m128i*)ext)[i]),((__m128i *)systematic0)[i]); - } +#elif defined(__arm__) + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,0); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,1); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,2); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,3); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,4); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,5); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,6); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,7); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,8); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,9); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,10); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,11); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,12); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,13); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,14); + tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,15); + tmp128[i] = vqaddq_s8(((int8x16_t *)ext2)[i],((int8x16_t *)systematic2)[i]); + + ((int8x16_t *)systematic1)[i] = vqaddq_s8(vqsubq_s8(tmp,((int8x16_t*)ext)[i]),((int8x16_t *)systematic0)[i]); + +#endif + } } // Check if we decoded the block @@ -1007,6 +1485,7 @@ unsigned char phy_threegpplte_turbo_decoder8(short *y, // re-order the decoded bits in theregular order // as it is presently ordered as 16 
sequential columns
+#if defined(__x86_64__) || defined(__i386__)
      __m128i* dbytes=(__m128i*)decoded_bytes_interl;
      __m128i shuffle=SHUFFLE16(7,6,5,4,3,2,1,0);
      __m128i mask  __attribute__((aligned(16)));
@@ -1031,10 +1510,31 @@ unsigned char phy_threegpplte_turbo_decoder8(short *y,
          decoded_bytes[n_128*j +i]=(uint8_t) _mm_movemask_epi8(_mm_packs_epi16(tmp2,zeros));
        }
      }
+#elif defined(__arm__)
+      uint8x16_t* dbytes=(uint8x16_t*)decoded_bytes_interl;
+      uint16x8_t mask  __attribute__((aligned(16)));
+      int n_128=n2>>7;
+
+      for (i=0; i<n_128; i++) {
+        mask=vdupq_n_u16(1);
+        uint8x16_t tmp __attribute__((aligned(16)));
+        tmp=vcombine_u8(vrev64_u8(((uint8x8_t*)&dbytes[i])[1]),vrev64_u8(((uint8x8_t*)&dbytes[i])[0]));
+        vst1q_lane_u8(&decoded_bytes[n_128*0+i],(uint8x16_t)vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8(tmp, Powers)))),0);
+
+        int j;
+
+        for (j=1; j<16; j++) {
+          mask=vshlq_n_u16(mask,1);
+          vst1q_lane_u8(&decoded_bytes[n_128*j+i],(uint8x16_t)vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8(tmp, Powers)))),0);
+        }
+      }
+
+#endif
    } else {
      pi6_p=pi6tab8[iind];

      for (i=0; i<(n2>>4); i++) {
+#if defined(__x86_64__) || defined(__i386__)
        tmp=_mm_insert_epi8(tmp, ((llr_t *)tmp128)[*pi6_p++],7);
        tmp=_mm_insert_epi8(tmp, ((llr_t *)tmp128)[*pi6_p++],6);
        tmp=_mm_insert_epi8(tmp, ((llr_t *)tmp128)[*pi6_p++],5);
@@ -1053,6 +1553,27 @@ unsigned char phy_threegpplte_turbo_decoder8(short *y,
        tmp=_mm_insert_epi8(tmp, ((llr_t *)tmp128)[*pi6_p++],8);
        tmp=_mm_cmpgt_epi8(tmp,zeros);
        ((uint16_t *)decoded_bytes)[i]=(uint16_t)_mm_movemask_epi8(tmp);
+#elif defined(__arm__)
+        tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,7);
+        tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,6);
+        tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,5);
+        tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,4);
+        tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,3);
+        tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,2);
+        tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,1);
+        tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,0);
+        tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,15);
+        tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,14);
+        tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,13);
+        tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,12);
+        tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,11);
+        tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,10);
+        tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,9);
+        tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,8);
+        uint64x2_t Mask= vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8(vcgtq_s8(tmp,zeros), Powers))));
+        vst1q_lane_u8(&((uint8_t*)&decoded_bytes[i])[0], (uint8x16_t)Mask, 0);
+        vst1q_lane_u8(&((uint8_t*)&decoded_bytes[i])[1], (uint8x16_t)Mask, 8);
+#endif
      }
    }
@@ -1107,17 +1628,28 @@ unsigned char phy_threegpplte_turbo_decoder8(short *y,
    // do a new iteration if it is not yet decoded
    if (iteration_cnt < max_iterations) {
      log_map8(systematic1,yparity1,m11,m10,alpha,beta,ext,n2,0,F,offset8_flag,alpha_stats,beta_stats,gamma_stats,ext_stats);
+#if defined(__x86_64__) || defined(__i386__)
      __m128i* ext_128=(__m128i*) ext;
      __m128i* s1_128=(__m128i*) systematic1;
      __m128i* s0_128=(__m128i*) systematic0;
+#elif defined(__arm__)
+      int8x16_t* ext_128=(int8x16_t*) ext;
+      int8x16_t* s1_128=(int8x16_t*) systematic1;
+      int8x16_t* s0_128=(int8x16_t*) systematic0;
+#endif
      int myloop=n2>>4;

      for (i=0; i<myloop; i++) {
+#if defined(__x86_64__) || defined(__i386__)
        *ext_128=_mm_adds_epi8(_mm_subs_epi8(*ext_128,*s1_128++),*s0_128++);
+#elif defined(__arm__)
+
*ext_128=vqaddq_s8(vqsubq_s8(*ext_128,*s1_128++),*s0_128++); +#endif ext_128++; } } } return(iteration_cnt); + } diff --git a/openair1/PHY/CODING/Makefile b/openair1/PHY/CODING/Makefile index f9b15ebd27..b323c479d3 100644 --- a/openair1/PHY/CODING/Makefile +++ b/openair1/PHY/CODING/Makefile @@ -1,29 +1,13 @@ -TURBO_SRC = 3gpplte.c 3gpplte_turbo_decoder_sse.c crc_byte.c +TURBO_SRC = 3gpplte_sse.c 3gpplte_turbo_decoder_sse.c crc_byte.c RATE13CC_SRC = ccoding_byte_lte.c viterbi_lte.c crc_byte.c RATE12CC_SRC = ccoding_byte.c viterbi.c crc_byte.c -all: turbolte_test rate13cc_test rate12cc_test run_turbo run_rate13cc run_rate13ccdab run_rate12cc +all: 3gpplte_sse -turbolte_test: $(TURBO_SRC) - gcc -o turbo_test $(TURBO_SRC) -DTEST_DEBUG -DUSER_MODE -msse2 -mssse3 -Wall +3gpplte_sse: $(TURBO_SRC) + gcc -o 3gpplte_sse 3gpplte_sse.c -msse4 -Wall -g -ggdb -DMAIN -rate13cc_test: $(RATE13CC_SRC) - gcc -o rate13cc_test $(RATE13CC_SRC) -DTEST_DEBUG -DUSER_MODE -msse2 -mssse3 -Wall -rate12cc_test: $(RATE12CC_SRC) - gcc -o rate12cc_test $(RATE12CC_SRC) -DTEST_DEBUG -DUSER_MODE -msse2 -mssse3 -Wall - -run_turbo: turbolte_test - ./turbo_test - -run_rate13cc: rate13cc_test - ./rate13cc_test - -run_rate13ccdab: rate13cc_test - ./rate13cc_test -d - -run_rate12cc: rate12cc_test - ./rate12cc_test clean: rm *.o diff --git a/openair1/PHY/CODING/ccoding_byte_lte.c b/openair1/PHY/CODING/ccoding_byte_lte.c index d6f31b1ab4..b399d0186b 100644 --- a/openair1/PHY/CODING/ccoding_byte_lte.c +++ b/openair1/PHY/CODING/ccoding_byte_lte.c @@ -55,22 +55,22 @@ unsigned char ccodelte_table_rev[128]; // for receiver void -ccodelte_encode (unsigned int numbits, - unsigned char add_crc, - unsigned char *inPtr, - unsigned char *outPtr, - unsigned short rnti) +ccodelte_encode (int32_t numbits, + uint8_t add_crc, + uint8_t *inPtr, + uint8_t *outPtr, + uint16_t rnti) { - unsigned int state; + uint32_t state; - unsigned char c, out, first_bit; - char shiftbit=0; - unsigned short c16; - unsigned short next_last_byte=0; - unsigned int crc=0; + uint8_t c, out, first_bit; + int8_t shiftbit=0; + uint16_t c16; + uint16_t next_last_byte=0; + uint32_t crc=0; #ifdef DEBUG_CCODE - unsigned int dummy=0; + uint32_t dummy=0; #endif //DEBUG_CCODE /* The input bit is shifted in position 8 of the state. 
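For reference, this table-driven encoder implements the 36.212 rate-1/3, constraint-length-7 tail-biting convolutional code. A minimal scalar sketch of one encoding step, assuming the standard generator polynomials G0=0133, G1=0171, G2=0165 (octal) and a newest-bit-in-bit-6 register layout; this is illustrative only and does not mirror the byte-table layout used by ccodelte_encode:

    #include <stdint.h>

    /* One step of the LTE rate-1/3, K=7 convolutional code (36.212 5.1.3.1).
       's' is a 7-bit window, newest bit in bit 6, oldest in bit 0; for
       tail-biting it is pre-loaded from the last 6 information bits.
       Returns the three coded bits packed as d0 | d1<<1 | d2<<2. */
    static inline uint8_t conv_step(uint8_t *s, uint8_t bit)
    {
      *s = (uint8_t)(((bit & 1) << 6) | (*s >> 1));
      uint8_t d0 = (uint8_t)__builtin_parity(*s & 0133); /* 1+D^2+D^3+D^5+D^6 */
      uint8_t d1 = (uint8_t)__builtin_parity(*s & 0171); /* 1+D+D^2+D^3+D^6   */
      uint8_t d2 = (uint8_t)__builtin_parity(*s & 0165); /* 1+D+D^2+D^4+D^6   */
      return (uint8_t)(d0 | (d1 << 1) | (d2 << 2));
    }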
@@ -80,20 +80,19 @@ ccodelte_encode (unsigned int numbits, if (add_crc == 1) { crc = crc8(inPtr,numbits); first_bit = 2; - c = (unsigned char)(crc>>24); + c = (uint8_t)(crc>>24); } else if (add_crc == 2) { crc = crc16(inPtr,numbits); #ifdef DEBUG_CCODE printf("ccode_lte : crc %x\n",crc); #endif // scramble with RNTI - crc ^= (((unsigned int)rnti)<<16); + crc ^= (((uint32_t)rnti)<<16); #ifdef DEBUG_CCODE printf("ccode_lte : crc %x (rnti %x)\n",crc,rnti); #endif first_bit = 2; - // c = (unsigned char)(crc>>24); - c = (unsigned char)((crc>>16)&0xff); + c = (uint8_t)((crc>>16)&0xff); } else { next_last_byte = numbits>>3; first_bit = (numbits-6)&7; @@ -182,7 +181,7 @@ ccodelte_encode (unsigned int numbits, // now code 8-bit CRC for UCI if (add_crc == 1) { - c = (unsigned char)(crc>>24); + c = (uint8_t)(crc>>24); // for (shiftbit = 0; (shiftbit<8);shiftbit++) { for (shiftbit = 7; (shiftbit>=0); shiftbit--) { @@ -209,7 +208,7 @@ ccodelte_encode (unsigned int numbits, // now code 16-bit CRC for DCI if (add_crc == 2) { - c16 = (unsigned short)(crc>>16); + c16 = (uint16_t)(crc>>16); // for (shiftbit = 0; (shiftbit<16);shiftbit++) { for (shiftbit = 15; (shiftbit>=0); shiftbit--) { diff --git a/openair1/PHY/CODING/defs.h b/openair1/PHY/CODING/defs.h index dbd2e4790d..21767a1f33 100644 --- a/openair1/PHY/CODING/defs.h +++ b/openair1/PHY/CODING/defs.h @@ -320,7 +320,7 @@ void threegpplte_turbo_encoder(uint8_t *input, uint16_t interleaver_f2); -/** \fn void ccodelte_encode(uint32_t numbits,uint8_t add_crc, uint8_t *inPtr,uint8_t *outPtr,uint16_t rnti) +/** \fn void ccodelte_encode(int32_t numbits,uint8_t add_crc, uint8_t *inPtr,uint8_t *outPtr,uint16_t rnti) \brief This function implements the LTE convolutional code of rate 1/3 with a constraint length of 7 bits. The inputs are bit packed in octets (from MSB to LSB). Trellis tail-biting is included here. 
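A usage sketch for the prototype documented above, with hypothetical sizes; the one-coded-bit-per-output-byte convention for outPtr is an assumption based on how the d-sequence is consumed downstream, not something this header states:

    uint8_t a[8];               /* 57-bit DCI payload, packed MSB-first      */
    uint8_t d[3 * (57 + 16)];   /* rate-1/3 output: payload plus 16-bit CRC  */
    uint16_t rnti = 0x1234;     /* the 16-bit CRC is scrambled with the RNTI */

    ccodelte_encode(57, 2, a, d, rnti);  /* add_crc==2 selects the DCI CRC16 */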
@@ -331,7 +331,7 @@ void threegpplte_turbo_encoder(uint8_t *input, @param rnti RNTI for CRC scrambling */ void -ccodelte_encode (uint32_t numbits, +ccodelte_encode (int32_t numbits, uint8_t add_crc, uint8_t *inPtr, uint8_t *outPtr, diff --git a/openair1/PHY/CODING/viterbi.c b/openair1/PHY/CODING/viterbi.c index 4b0fb0c70a..118e0ef2d8 100755 --- a/openair1/PHY/CODING/viterbi.c +++ b/openair1/PHY/CODING/viterbi.c @@ -33,9 +33,8 @@ */ -#ifndef EXPRESSMIMO_TARGET + #include "PHY/sse_intrin.h" -#endif //EXPRESSMIMO_TARGET extern unsigned char ccodedot11_table[128],ccodedot11_table_rev[128]; @@ -46,12 +45,6 @@ static unsigned char inputs[64][2048]; static unsigned short survivors[64][2048]; static short partial_metrics[64],partial_metrics_new[64]; -#ifdef __KERNEL__ -#define printf rt_printk -#endif - -#ifndef EXPRESSMIMO_TARGET - void phy_viterbi_dot11(char *y,unsigned char *decoded_bytes,unsigned short n) { @@ -191,22 +184,34 @@ void phy_generate_viterbi_tables(void) #define INIT0 0x00000080 -#define RESCALE 0x00000040 - - -static __m128i __attribute__((aligned(16))) TB[4*4095*8]; -static __m128i metrics0_15,metrics16_31,metrics32_47,metrics48_63,even0_30a,even0_30b,even32_62a,even32_62b,odd1_31a,odd1_31b,odd33_63a,odd33_63b,TBeven0_30,TBeven32_62,TBodd1_31, - TBodd33_63 __attribute__((aligned(16))); -static __m128i rescale,min_state,min_state2 __attribute__((aligned(16))); void phy_viterbi_dot11_sse2(char *y,unsigned char *decoded_bytes,unsigned short n,int offset, int traceback ) { +#if defined(__x86_64__) || defined(__i386__) + __m128i TB[4*4095*8]; // 4 __m128i per input bit (64 states, 8-bits per state = 16-way), 4095 is largest packet size in bytes, 8 bits/byte + + __m128i metrics0_15,metrics16_31,metrics32_47,metrics48_63,even0_30a,even0_30b,even32_62a,even32_62b,odd1_31a,odd1_31b,odd33_63a,odd33_63b,TBeven0_30,TBeven32_62,TBodd1_31,TBodd33_63; + + __m128i min_state,min_state2; + __m128i *m0_ptr,*m1_ptr,*TB_ptr = &TB[offset<<2]; +#elif defined(__arm__) + uint8x16x2_t TB[2*4095*8]; // 2 int8x16_t per input bit, 8 bits / byte, 4095 is largest packet size in bytes + + uint8x16_t even0_30a,even0_30b,even32_62a,even32_62b,odd1_31a,odd1_31b,odd33_63a,odd33_63b,TBeven0_30,TBeven32_62,TBodd1_31,TBodd33_63; + uint8x16x2_t metrics0_31,metrics32_63; + + uint8x16_t min_state; + + uint8x16_t *m0_ptr,*m1_ptr; + uint8x16x2_t *TB_ptr = &TB[offset<<1]; + +#endif char *in = y; unsigned char prev_state0; @@ -216,6 +221,7 @@ void phy_viterbi_dot11_sse2(char *y,unsigned char *decoded_bytes,unsigned short short position; // printf("offset %d, TB_ptr %p\n",offset,TB_ptr); +#if defined(__x86_64__) || defined(__i386__) if (offset == 0) { // set initial metrics @@ -225,129 +231,64 @@ void phy_viterbi_dot11_sse2(char *y,unsigned char *decoded_bytes,unsigned short metrics48_63 = _mm_setzero_si128(); } - rescale = _mm_cvtsi32_si128(RESCALE); +#elif defined(__arm__) + if (offset == 0) { + // set initial metrics - /* - print_bytes(metrics0_15,"metrics0_15"); - print_bytes(metrics16_31,"metrics16_31"); - print_bytes(metrics32_47,"metrics32_47"); - print_bytes(metrics48_63,"metrics48_63"); - */ + metrics0_31.val[0] = vdupq_n_u8(0); metrics0_31.val[0] = vsetq_lane_u8(INIT0,metrics0_31.val[0],0); + metrics0_31.val[1] = vdupq_n_u8(0); + metrics32_63.val[0] = vdupq_n_u8(0); + metrics32_63.val[1] = vdupq_n_u8(0); + } - for (position=offset; position<(offset+n); position++) { +#endif + for (position=offset; position<(offset+n); position++) { //printf("%d : (%d,%d)\n",position,in[0],in[1]); // get branch metric offsets for 
the 64 states table_offset = (in[0]+8 + ((in[1]+8)<<4))<<6; - // printf("Table_offset = %u (in[0]=%d,in[1]=%d)\n",table_offset,in[0],in[1]); - +#if defined(__x86_64__) || defined(__i386__) m0_ptr = (__m128i *)&m0_table[table_offset]; m1_ptr = (__m128i *)&m1_table[table_offset]; - // printf("\n"); // even states even0_30a = _mm_adds_epu8(metrics0_15,m0_ptr[0]); - // print_bytes(even0_30a,"even0_30a"); - even32_62a = _mm_adds_epu8(metrics16_31,m0_ptr[1]); - // print_bytes(even32_62a,"even32_62a"); - even0_30b = _mm_adds_epu8(metrics32_47,m0_ptr[2]); - // print_bytes(even0_30b,"even0_30b"); - even32_62b = _mm_adds_epu8(metrics48_63,m0_ptr[3]); - // print_bytes(even32_62b,"even32_62b"); - - // printf("\n"); // odd states odd1_31a = _mm_adds_epu8(metrics0_15,m1_ptr[0]); - - // print_bytes(odd1_31a,"odd1_31a"); - odd33_63a = _mm_adds_epu8(metrics16_31,m1_ptr[1]); - - // print_bytes(odd33_63a,"odd33_63a"); - odd1_31b = _mm_adds_epu8(metrics32_47,m1_ptr[2]); - - // print_bytes(odd1_31b,"odd1_31b"); - odd33_63b = _mm_adds_epu8(metrics48_63,m1_ptr[3]); - - // print_bytes(odd33_63b,"odd33_63b"); - - - - // select maxima - // printf("\n"); - even0_30a = _mm_max_epu8(even0_30a,even0_30b); - - // print_bytes(even0_30a,"even0_30a"); - even32_62a = _mm_max_epu8(even32_62a,even32_62b); - - // print_bytes(even32_62a,"even32_62a"); - odd1_31a = _mm_max_epu8(odd1_31a,odd1_31b); - - // print_bytes(odd1_31a,"odd1_31a"); - odd33_63a = _mm_max_epu8(odd33_63a,odd33_63b); - // print_bytes(odd33_63a,"odd33_63a"); - - // printf("\n"); // Traceback information - TBeven0_30 = _mm_cmpeq_epi8(even0_30a,even0_30b); - - TBeven32_62 = _mm_cmpeq_epi8(even32_62a,even32_62b); - - TBodd1_31 = _mm_cmpeq_epi8(odd1_31a,odd1_31b); - - TBodd33_63 = _mm_cmpeq_epi8(odd33_63a,odd33_63b); - metrics0_15 = _mm_unpacklo_epi8(even0_30a ,odd1_31a); metrics16_31 = _mm_unpackhi_epi8(even0_30a ,odd1_31a); metrics32_47 = _mm_unpacklo_epi8(even32_62a,odd33_63a); metrics48_63 = _mm_unpackhi_epi8(even32_62a,odd33_63a); - - //print_bytes(metrics0_15,"metrics0_15"); - //print_bytes(metrics16_31,"metrics16_31"); - //print_bytes(metrics32_47,"metrics32_47"); - //print_bytes(metrics48_63,"metrics48_63"); - - - - TB_ptr[0] = _mm_unpacklo_epi8(TBeven0_30,TBodd1_31); - - // print_bytes(TB_ptr[0],"TB0_15"); - + TB_ptr[0] = _mm_unpacklo_epi8(TBeven0_30,TBodd1_31); TB_ptr[1] = _mm_unpackhi_epi8(TBeven0_30,TBodd1_31); - - // print_bytes(TB_ptr[1],"TB16_31"); - TB_ptr[2] = _mm_unpacklo_epi8(TBeven32_62,TBodd33_63); - - // print_bytes(TB_ptr[2],"TB32_47"); - TB_ptr[3] = _mm_unpackhi_epi8(TBeven32_62,TBodd33_63); - // print_bytes(TB_ptr[3],"TB48_63"); - in+=2; TB_ptr += 4; @@ -359,50 +300,92 @@ void phy_viterbi_dot11_sse2(char *y,unsigned char *decoded_bytes,unsigned short min_state =_mm_min_epu8(min_state,metrics32_47); min_state =_mm_min_epu8(min_state,metrics48_63); - // print_bytes(min_state,"min_state"); min_state2 = min_state; min_state = _mm_unpacklo_epi8(min_state,min_state); min_state2 = _mm_unpackhi_epi8(min_state2,min_state2); min_state = _mm_min_epu8(min_state,min_state2); - // print_bytes(min_state,"min_state"); - min_state2 = min_state; min_state = _mm_unpacklo_epi8(min_state,min_state); min_state2 = _mm_unpackhi_epi8(min_state2,min_state2); min_state = _mm_min_epu8(min_state,min_state2); - // print_bytes(min_state,"min_state"); - min_state2 = min_state; min_state = _mm_unpacklo_epi8(min_state,min_state); min_state2 = _mm_unpackhi_epi8(min_state2,min_state2); min_state = _mm_min_epu8(min_state,min_state2); - // print_bytes(min_state,"min_state"); - 
min_state2 = min_state; min_state = _mm_unpacklo_epi8(min_state,min_state); min_state2 = _mm_unpackhi_epi8(min_state2,min_state2); min_state = _mm_min_epu8(min_state,min_state2); - // print_bytes(min_state,"min_state"); - metrics0_15 = _mm_subs_epu8(metrics0_15,min_state); metrics16_31 = _mm_subs_epu8(metrics16_31,min_state); metrics32_47 = _mm_subs_epu8(metrics32_47,min_state); metrics48_63 = _mm_subs_epu8(metrics48_63,min_state); +#elif defined(__arm__) + m0_ptr = (uint8x16_t *)&m0_table[table_offset]; + m1_ptr = (uint8x16_t *)&m1_table[table_offset]; + + + // even states + even0_30a = vqaddq_u8(metrics0_31.val[0],m0_ptr[0]); + even32_62a = vqaddq_u8(metrics0_31.val[1],m0_ptr[1]); + even0_30b = vqaddq_u8(metrics32_63.val[0],m0_ptr[2]); + even32_62b = vqaddq_u8(metrics32_63.val[1],m0_ptr[3]); + + // odd states + odd1_31a = vqaddq_u8(metrics0_31.val[0],m1_ptr[0]); + odd33_63a = vqaddq_u8(metrics0_31.val[1],m1_ptr[1]); + odd1_31b = vqaddq_u8(metrics32_63.val[0],m1_ptr[2]); + odd33_63b = vqaddq_u8(metrics32_63.val[1],m1_ptr[3]); + // select maxima + even0_30a = vmaxq_u8(even0_30a,even0_30b); + even32_62a = vmaxq_u8(even32_62a,even32_62b); + odd1_31a = vmaxq_u8(odd1_31a,odd1_31b); + odd33_63a = vmaxq_u8(odd33_63a,odd33_63b); + + // Traceback information + TBeven0_30 = vceqq_u8(even0_30a,even0_30b); + TBeven32_62 = vceqq_u8(even32_62a,even32_62b); + TBodd1_31 = vceqq_u8(odd1_31a,odd1_31b); + TBodd33_63 = vceqq_u8(odd33_63a,odd33_63b); - /* - print_bytes(metrics0_15,"metrics0_15"); - print_bytes(metrics16_31,"metrics16_31"); - print_bytes(metrics32_47,"metrics32_47"); - print_bytes(metrics48_63,"metrics48_63"); - */ + metrics0_31 = vzipq_u8(even0_30a,odd1_31a); + metrics32_63 = vzipq_u8(even32_62a,odd33_63a); + TB_ptr[0] = vzipq_u8(TBeven0_30,TBodd1_31); + TB_ptr[1] = vzipq_u8(TBeven32_62,TBodd33_63); + in+=2; + TB_ptr += 2; + + // rescale by subtracting minimum + /**************************************************** + USE SSSE instruction phminpos!!!!!!! 
+ ****************************************************/ + min_state =vminq_u8(metrics0_31.val[0],metrics0_31.val[1]); + min_state =vminq_u8(min_state,metrics32_63.val[0]); + min_state =vminq_u8(min_state,metrics32_63.val[1]); + // here we have 16 maximum metrics from the 64 states + uint8x8_t min_state2 = vpmin_u8(((uint8x8_t*)&min_state)[0],((uint8x8_t*)&min_state)[0]); + // now the 8 maximum in min_state2 + min_state2 = vpmin_u8(min_state2,min_state2); + // now the 4 maximum in min_state2, repeated twice + min_state2 = vpmin_u8(min_state2,min_state2); + // now the 2 maximum in min_state2, repeated 4 times + min_state2 = vpmin_u8(min_state2,min_state2); + // now the 1 maximum in min_state2, repeated 8 times + min_state = vcombine_u8(min_state2,min_state2); + // now the 1 maximum in min_state, repeated 16 times + metrics0_31.val[0] = vqsubq_u8(metrics0_31.val[0],min_state); + metrics0_31.val[1] = vqsubq_u8(metrics0_31.val[1],min_state); + metrics32_63.val[0] = vqsubq_u8(metrics32_63.val[0],min_state); + metrics32_63.val[1] = vqsubq_u8(metrics32_63.val[1],min_state); +#endif } // Traceback @@ -429,29 +412,10 @@ void phy_viterbi_dot11_sse2(char *y,unsigned char *decoded_bytes,unsigned short } } +#if defined(__x86_64) || defined(__i386__) _mm_empty(); - -} - -#else //EXPRESSMIMO_TARGET - -void phy_viterbi_dot11(char *y,unsigned char *decoded_bytes,unsigned short n) -{ -} - -#endif //EXPRESSMIMO_TARGET - -/* -void print_bytes(__m128i x,char *s) { - - unsigned char *tempb = (unsigned char *)&x; - - printf("%s : %u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u\n",s, - tempb[0],tempb[1],tempb[2],tempb[3],tempb[4],tempb[5],tempb[6],tempb[7], - tempb[8],tempb[9],tempb[10],tempb[11],tempb[12],tempb[13],tempb[14],tempb[15]); - +#endif } -*/ #ifdef TEST_DEBUG #include <stdio.h> diff --git a/openair1/PHY/CODING/viterbi_lte.c b/openair1/PHY/CODING/viterbi_lte.c index 019cf0fc5c..230b933f79 100644 --- a/openair1/PHY/CODING/viterbi_lte.c +++ b/openair1/PHY/CODING/viterbi_lte.c @@ -49,21 +49,14 @@ #define msg printf #endif -#ifndef EXPRESSMIMO_TARGET + #include "PHY/sse_intrin.h" -#endif //EXPRESSMIMO_TARGET extern uint8_t ccodelte_table[128],ccodelte_table_rev[128]; -#ifdef __KERNEL__ -#define printf rt_printk -#endif - -#ifndef EXPRESSMIMO_TARGET - static int8_t m0_table[64*16*16*16] __attribute__ ((aligned(16))); static int8_t m1_table[64*16*16*16] __attribute__ ((aligned(16))); @@ -143,20 +136,33 @@ void print_shorts(__m128i x,char *s) { #endif // USER_MODE -static __m128i TB[4*8192]; - -static __m128i metrics0_15,metrics16_31,metrics32_47,metrics48_63,even0_30a,even0_30b,even32_62a,even32_62b,odd1_31a,odd1_31b,odd33_63a,odd33_63b,TBeven0_30,TBeven32_62,TBodd1_31, - TBodd33_63;// __attribute__((aligned(16))); - -static __m128i min_state,min_state2;// __attribute__((aligned(16))); void phy_viterbi_lte_sse2(int8_t *y,uint8_t *decoded_bytes,uint16_t n) { - static __m128i *m0_ptr,*m1_ptr,*TB_ptr = &TB[0]; +#if defined(__x86_64__) || defined(__i386__) + __m128i TB[4*8192]; + __m128i *m0_ptr,*m1_ptr,*TB_ptr = &TB[0]; + + __m128i metrics0_15,metrics16_31,metrics32_47,metrics48_63,even0_30a,even0_30b,even32_62a,even32_62b,odd1_31a,odd1_31b,odd33_63a,odd33_63b,TBeven0_30,TBeven32_62,TBodd1_31, + TBodd33_63; + + __m128i min_state,min_state2; +#elif defined(__arm__) + uint8x16x2_t TB[2*8192]; // 2 int8x16_t per input bit, 8 bits / byte, 8192 is largest packet size in bits + uint8x16_t even0_30a,even0_30b,even32_62a,even32_62b,odd1_31a,odd1_31b,odd33_63a,odd33_63b,TBeven0_30,TBeven32_62,TBodd1_31,TBodd33_63; + 
uint8x16x2_t metrics0_31,metrics32_63; + + uint8x16_t min_state; + + uint8x16_t *m0_ptr,*m1_ptr; + uint8x16x2_t *TB_ptr = &TB[0]; + + +#endif int8_t *in = y; uint8_t prev_state0,maxm,s; static uint8_t *TB_ptr2; @@ -167,140 +173,70 @@ void phy_viterbi_lte_sse2(int8_t *y,uint8_t *decoded_bytes,uint16_t n) // set initial metrics //debug_msg("Doing viterbi\n"); - metrics0_15 = _mm_setzero_si128(); +#if defined(__x86_64__) || defined(__i386__) + + metrics0_15 = _mm_setzero_si128(); metrics16_31 = _mm_setzero_si128(); metrics32_47 = _mm_setzero_si128(); metrics48_63 = _mm_setzero_si128(); -#ifndef USER_MODE - //debug_msg("Doing viterbi 2\n"); +#elif defined(__arm__) + metrics0_31.val[0] = vdupq_n_u8(0); + metrics0_31.val[1] = vdupq_n_u8(0); + metrics32_63.val[0] = vdupq_n_u8(0); + metrics32_63.val[1] = vdupq_n_u8(0); #endif - /* - print_bytes(metrics0_15,"metrics0_15"); - print_bytes(metrics16_31,"metrics16_31"); - print_bytes(metrics32_47,"metrics32_47"); - print_bytes(metrics48_63,"metrics48_63"); - */ for (iter=0; iter<2; iter++) { in = y; TB_ptr=&TB[0]; - // printf("Iteration %d\n",iter); for (position=0; position<n; position++) { - // printf("%d/%d : (%d,%d,%d)\n",position,n-1,in[0],in[1],in[2]); - - // get branch metric offsets for the 64 states table_offset = (in[0]+8 + ((in[1]+8)<<4) + ((in[2]+8)<<8))<<6; - /* - printf("Table_offset = %u (in[0]=%d,in[1]=%d,in[2]=%d)\n",table_offset,in[0],in[1],in[2]); - print_bytes("m0",&m0_table[table_offset]); - print_bytes("m1",&m1_table[table_offset]); - */ + +#if defined(__x86_64__) || defined(__i386__) m0_ptr = (__m128i *)&m0_table[table_offset]; m1_ptr = (__m128i *)&m1_table[table_offset]; - // printf("\n"); - // even states even0_30a = _mm_adds_epu8(metrics0_15,m0_ptr[0]); - // print_bytes(even0_30a,"even0_30a"); - even32_62a = _mm_adds_epu8(metrics16_31,m0_ptr[1]); - // print_bytes(even32_62a,"even32_62a"); - even0_30b = _mm_adds_epu8(metrics32_47,m0_ptr[2]); - // print_bytes(even0_30b,"even0_30b"); - even32_62b = _mm_adds_epu8(metrics48_63,m0_ptr[3]); - // print_bytes(even32_62b,"even32_62b"); - // printf("\n"); // odd states odd1_31a = _mm_adds_epu8(metrics0_15,m1_ptr[0]); - - // print_bytes(odd1_31a,"odd1_31a"); - odd33_63a = _mm_adds_epu8(metrics16_31,m1_ptr[1]); - - // print_bytes(odd33_63a,"odd33_63a"); - odd1_31b = _mm_adds_epu8(metrics32_47,m1_ptr[2]); - - // print_bytes(odd1_31b,"odd1_31b"); - odd33_63b = _mm_adds_epu8(metrics48_63,m1_ptr[3]); - // print_bytes(odd33_63b,"odd33_63b"); - - - - // select maxima - // printf("\n"); even0_30a = _mm_max_epu8(even0_30a,even0_30b); - - // print_bytes(even0_30a,"even0_30a"); - even32_62a = _mm_max_epu8(even32_62a,even32_62b); - - // print_bytes(even32_62a,"even32_62a"); - odd1_31a = _mm_max_epu8(odd1_31a,odd1_31b); - - // print_bytes(odd1_31a,"odd1_31a"); - odd33_63a = _mm_max_epu8(odd33_63a,odd33_63b); - // print_bytes(odd33_63a,"odd33_63a"); - - // printf("\n"); // Traceback information TBeven0_30 = _mm_cmpeq_epi8(even0_30a,even0_30b); - - TBeven32_62 = _mm_cmpeq_epi8(even32_62a,even32_62b); - - TBodd1_31 = _mm_cmpeq_epi8(odd1_31a,odd1_31b); - - TBodd33_63 = _mm_cmpeq_epi8(odd33_63a,odd33_63b); - metrics0_15 = _mm_unpacklo_epi8(even0_30a ,odd1_31a); metrics16_31 = _mm_unpackhi_epi8(even0_30a ,odd1_31a); metrics32_47 = _mm_unpacklo_epi8(even32_62a,odd33_63a); metrics48_63 = _mm_unpackhi_epi8(even32_62a,odd33_63a); - /* - print_bytes(metrics0_15,"metrics0_15"); - print_bytes(metrics16_31,"metrics16_31"); - print_bytes(metrics32_47,"metrics32_47"); - print_bytes(metrics48_63,"metrics48_63"); - */ 
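Both the SSE code here and the NEON block added below implement the same add-compare-select butterfly per trellis stage. A scalar sketch of what each vector lane computes, with a saturating add standing in for _mm_adds_epu8/vqaddq_u8 (illustrative helper, not part of the patch):

    #include <stdint.h>

    static uint8_t sat_add_u8(uint8_t a, uint8_t b)
    {
      unsigned s = (unsigned)a + b;          /* _mm_adds_epu8 / vqaddq_u8 */
      return (uint8_t)(s > 255 ? 255 : s);
    }

    /* One add-compare-select: the new state metric is the better of two
       candidate path metrics; the traceback byte records whether the "b"
       predecessor won or tied (0xff), matching the _mm_cmpeq_epi8/vceqq_u8
       applied to the selected maximum. */
    static uint8_t acs(uint8_t m_a, uint8_t bm_a,
                       uint8_t m_b, uint8_t bm_b, uint8_t *tb)
    {
      uint8_t cand_a = sat_add_u8(m_a, bm_a);
      uint8_t cand_b = sat_add_u8(m_b, bm_b);
      uint8_t best   = cand_a > cand_b ? cand_a : cand_b;  /* _mm_max_epu8 */
      *tb = (best == cand_b) ? 0xff : 0x00;
      return best;
    }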
- - TB_ptr[0] = _mm_unpacklo_epi8(TBeven0_30,TBodd1_31); - - // print_bytes(TB_ptr[0],"TB0_15"); - TB_ptr[1] = _mm_unpackhi_epi8(TBeven0_30,TBodd1_31); - - // print_bytes(TB_ptr[1],"TB16_31"); - TB_ptr[2] = _mm_unpacklo_epi8(TBeven32_62,TBodd33_63); - - // print_bytes(TB_ptr[2],"TB32_47"); - TB_ptr[3] = _mm_unpackhi_epi8(TBeven32_62,TBodd33_63); - // print_bytes(TB_ptr[3],"TB48_63"); in+=3; TB_ptr += 4; @@ -313,50 +249,90 @@ void phy_viterbi_lte_sse2(int8_t *y,uint8_t *decoded_bytes,uint16_t n) min_state =_mm_min_epu8(min_state,metrics32_47); min_state =_mm_min_epu8(min_state,metrics48_63); - // print_bytes(min_state,"min_state"); - min_state2 = min_state; min_state = _mm_unpacklo_epi8(min_state,min_state); min_state2 = _mm_unpackhi_epi8(min_state2,min_state2); min_state = _mm_min_epu8(min_state,min_state2); - // print_bytes(min_state,"min_state"); - min_state2 = min_state; min_state = _mm_unpacklo_epi8(min_state,min_state); min_state2 = _mm_unpackhi_epi8(min_state2,min_state2); min_state = _mm_min_epu8(min_state,min_state2); - // print_bytes(min_state,"min_state"); - min_state2 = min_state; min_state = _mm_unpacklo_epi8(min_state,min_state); min_state2 = _mm_unpackhi_epi8(min_state2,min_state2); min_state = _mm_min_epu8(min_state,min_state2); - // print_bytes(min_state,"min_state"); - min_state2 = min_state; min_state = _mm_unpacklo_epi8(min_state,min_state); min_state2 = _mm_unpackhi_epi8(min_state2,min_state2); min_state = _mm_min_epu8(min_state,min_state2); - // print_bytes(min_state,"min_state"); - metrics0_15 = _mm_subs_epu8(metrics0_15,min_state); metrics16_31 = _mm_subs_epu8(metrics16_31,min_state); metrics32_47 = _mm_subs_epu8(metrics32_47,min_state); metrics48_63 = _mm_subs_epu8(metrics48_63,min_state); - - /* - print_bytes("metrics0_15",&metrics0_15); - print_bytes("metrics16_31",&metrics16_31); - print_bytes("metrics32_47",&metrics32_47); - print_bytes("metrics48_63",&metrics48_63); - - printf("\n"); - */ - +#elif defined(__arm__) + m0_ptr = (uint8x16_t *)&m0_table[table_offset]; + m1_ptr = (uint8x16_t *)&m1_table[table_offset]; + + + // even states + even0_30a = vqaddq_u8(metrics0_31.val[0],m0_ptr[0]); + even32_62a = vqaddq_u8(metrics0_31.val[1],m0_ptr[1]); + even0_30b = vqaddq_u8(metrics32_63.val[0],m0_ptr[2]); + even32_62b = vqaddq_u8(metrics32_63.val[1],m0_ptr[3]); + + // odd states + odd1_31a = vqaddq_u8(metrics0_31.val[0],m1_ptr[0]); + odd33_63a = vqaddq_u8(metrics0_31.val[1],m1_ptr[1]); + odd1_31b = vqaddq_u8(metrics32_63.val[0],m1_ptr[2]); + odd33_63b = vqaddq_u8(metrics32_63.val[1],m1_ptr[3]); + // select maxima + even0_30a = vmaxq_u8(even0_30a,even0_30b); + even32_62a = vmaxq_u8(even32_62a,even32_62b); + odd1_31a = vmaxq_u8(odd1_31a,odd1_31b); + odd33_63a = vmaxq_u8(odd33_63a,odd33_63b); + + // Traceback information + TBeven0_30 = vceqq_u8(even0_30a,even0_30b); + TBeven32_62 = vceqq_u8(even32_62a,even32_62b); + TBodd1_31 = vceqq_u8(odd1_31a,odd1_31b); + TBodd33_63 = vceqq_u8(odd33_63a,odd33_63b); + + metrics0_31 = vzipq_u8(even0_30a,odd1_31a); + metrics32_63 = vzipq_u8(even32_62a,odd33_63a); + + TB_ptr[0] = vzipq_u8(TBeven0_30,TBodd1_31); + TB_ptr[1] = vzipq_u8(TBeven32_62,TBodd33_63); + + in+=3; // rate-1/3 code: 3 soft inputs consumed per trellis step, as in the x86 path + TB_ptr += 2; + + // rescale by subtracting minimum + /**************************************************** + TODO: the x86 path above could use the SSE4.1 phminposuw instruction here
+ ****************/ + min_state =vminq_u8(metrics0_31.val[0],metrics0_31.val[1]); + min_state =vminq_u8(min_state,metrics32_63.val[0]); + min_state =vminq_u8(min_state,metrics32_63.val[1]); + // here we have 16 minima of the 64 state metrics + uint8x8_t min_state2 = vpmin_u8(((uint8x8_t*)&min_state)[0],((uint8x8_t*)&min_state)[0]); + // now the 8 minima in min_state2 + min_state2 = vpmin_u8(min_state2,min_state2); + // now the 4 minima in min_state2, repeated twice + min_state2 = vpmin_u8(min_state2,min_state2); + // now the 2 minima in min_state2, repeated 4 times + min_state2 = vpmin_u8(min_state2,min_state2); + // now the single minimum in min_state2, repeated 8 times + min_state = vcombine_u8(min_state2,min_state2); + // now the single minimum in min_state, repeated 16 times + metrics0_31.val[0] = vqsubq_u8(metrics0_31.val[0],min_state); + metrics0_31.val[1] = vqsubq_u8(metrics0_31.val[1],min_state); + metrics32_63.val[0] = vqsubq_u8(metrics32_63.val[0],min_state); + metrics32_63.val[1] = vqsubq_u8(metrics32_63.val[1],min_state); +#endif } } // iteration @@ -365,6 +341,7 @@ void phy_viterbi_lte_sse2(int8_t *y,uint8_t *decoded_bytes,uint16_t n) prev_state0 = 0; maxm = 0; +#if defined(__x86_64__) || defined(__i386__) for (s=0; s<16; s++) if (((uint8_t *)&metrics0_15)[s] > maxm) { maxm = ((uint8_t *)&metrics0_15)[s]; @@ -389,17 +366,39 @@ void phy_viterbi_lte_sse2(int8_t *y,uint8_t *decoded_bytes,uint16_t n) prev_state0 = s+48; } - // printf("Max state %d\n",prev_state0); + +#elif defined(__arm__) + for (s=0; s<16; s++) + if (((uint8_t *)&metrics0_31.val[0])[s] > maxm) { + maxm = ((uint8_t *)&metrics0_31.val[0])[s]; + prev_state0 = s; + } + + for (s=0; s<16; s++) + if (((uint8_t *)&metrics0_31.val[1])[s] > maxm) { + maxm = ((uint8_t *)&metrics0_31.val[1])[s]; + prev_state0 = s+16; + } + + for (s=0; s<16; s++) + if (((uint8_t *)&metrics32_63.val[0])[s] > maxm) { + maxm = ((uint8_t *)&metrics32_63.val[0])[s]; + prev_state0 = s+32; + } + + for (s=0; s<16; s++) + if (((uint8_t *)&metrics32_63.val[1])[s] > maxm) { + maxm = ((uint8_t *)&metrics32_63.val[1])[s]; + prev_state0 = s+48; + } +#endif + TB_ptr2 = (uint8_t *)&TB[(n-1)*4]; for (position = n-1 ; position>-1; position--) { - // if ((position%8) == 0) - // printf("%d: %x\n",1+(position>>3),decoded_bytes[1+(position>>3)]); - decoded_bytes[(position)>>3] += (prev_state0 & 0x1)<<(7-(position & 0x7)); - // printf("pos %d : ps = %d -> %d\n",position,prev_state0,TB_ptr2[prev_state0]); if (TB_ptr2[prev_state0] == 0) prev_state0 = (prev_state0 >> 1); @@ -409,31 +408,12 @@ void phy_viterbi_lte_sse2(int8_t *y,uint8_t *decoded_bytes,uint16_t n) TB_ptr2-=64; } - // printf("Max state %d\n",prev_state0); + +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); - -} - -#else //EXPRESSMIMO_TARGET - -void phy_viterbi_lte(int8_t *y,uint8_t *decoded_bytes,uint16_t n) -{ -} - -#endif //EXPRESSMIMO_TARGET - -/* -void print_bytes(__m128i x,int8_t *s) { - - uint8_t *tempb = (uint8_t *)&x; - - printf("%s : %u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u\n",s, - tempb[0],tempb[1],tempb[2],tempb[3],tempb[4],tempb[5],tempb[6],tempb[7], - tempb[8],tempb[9],tempb[10],tempb[11],tempb[12],tempb[13],tempb[14],tempb[15]); - +#endif } -*/ #ifdef TEST_DEBUG int test_viterbi(uint8_t dabflag) diff --git a/openair1/PHY/LTE_ESTIMATION/filt96_32.h b/openair1/PHY/LTE_ESTIMATION/filt96_32.h index 36eac7ea30..c92be225bd 100644 --- a/openair1/PHY/LTE_ESTIMATION/filt96_32.h +++ b/openair1/PHY/LTE_ESTIMATION/filt96_32.h @@ -26,187
+26,187 @@ Address : Eurecom, Campus SophiaTech, 450 Route des Chappes, CS 50193 - 06904 Biot Sophia Antipolis cedex, FRANCE *******************************************************************************/ -short filt24_0[24] = { +short filt24_0[24] __attribute__((aligned(16))) ={ 2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0,0,0 }; -short filt24_0_dcl[24] = { +short filt24_0_dcl[24] __attribute__((aligned(16))) ={ 2341,4681,7022,9362,11703,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0,0,0 }; -short filt24_0_dcr[24] = { +short filt24_0_dcr[24] __attribute__((aligned(16))) ={ 2730,5461,8192,10922,13653,16384,14043,11703,9362,7022,4681,0,0,0,0,0,0,0,0,0,0,0,0 }; -short filt24_1[24] = { +short filt24_1[24] __attribute__((aligned(16))) ={ 0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0,0 }; -short filt24_1_dcl[24] = { +short filt24_1_dcl[24] __attribute__((aligned(16))) ={ 0,4681,7022,9362,11703,14043,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0,0 }; -short filt24_1_dcr[24] = { +short filt24_1_dcr[24] __attribute__((aligned(16))) ={ 0,2730,5461,8192,10922,13653,16384,14043,11703,9362,7022,4681,0,0,0,0,0,0,0,0,0,0,0,0 }; -short filt24_2[24] = { +short filt24_2[24] __attribute__((aligned(16))) ={ 0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0 }; -short filt24_2_dcl[24] = { +short filt24_2_dcl[24] __attribute__((aligned(16))) ={ 0,0,2341,4681,7022,9362, 11703,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0 }; -short filt24_2_dcr[24] = { +short filt24_2_dcr[24] __attribute__((aligned(16))) ={ 0,0,2730,5461,8192,10922,13653,16384,14043,11703,9362,4681,2341,0,0,0,0,0,0,0,0,0,0,0 }; // X X X Y | X X X X | X Y X X -short filt24_3[24] = { +short filt24_3[24] __attribute__((aligned(16))) ={ 0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0 }; -short filt24_3_dcl[24] = { +short filt24_3_dcl[24] __attribute__((aligned(16))) ={ 0,0,0,2341,4681,7022,9362,14043,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0 }; // X X X Y | X X DC X X | X Y X X -short filt24_3_dcr[24] = { +short filt24_3_dcr[24] __attribute__((aligned(16))) ={ 0,0,0,2730,5461,8192,10922,13653,16384,14043,11703,7022,4681,2341,0,0,0,0,0,0,0,0,0,0 }; -short filt24_4[24] = { +short filt24_4[24] __attribute__((aligned(16))) ={ 0,0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0 }; -short filt24_4_dcl[24] = { +short filt24_4_dcl[24] __attribute__((aligned(16))) ={ 0,0,0,0,2341,7022,9362,11703,14043,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0 }; -short filt24_4_dcr[24] = { +short filt24_4_dcr[24] __attribute__((aligned(16))) ={ 0,0,0,0,2730,5461,8192,10922,13653,16384,14043,11703,7022,4681,2341,0,0,0,0,0,0,0,0,0 }; -short filt24_5[24] = { +short filt24_5[24] __attribute__((aligned(16))) ={ 0,0,0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0 }; // X X X Y | X X DC X X | X Y X X -short filt24_5_dcl[24] = { +short filt24_5_dcl[24] __attribute__((aligned(16))) ={ 0,0,0,0,0,2341,4681,9362,11703,14043,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0 }; -short filt24_5_dcr[24] = { +short filt24_5_dcr[24] __attribute__((aligned(16))) ={ 0,0,0,0,0,2730,5461,8192,10922,13653,16384,11703,9362,7022,4681,2730,0,0,0,0,0,0,0,0 }; -short filt24_6[24] = { +short filt24_6[24] __attribute__((aligned(16))) ={ 0,0,0,0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0 }; -short filt24_6_dcl[24] 
= { +short filt24_6_dcl[24] __attribute__((aligned(16))) ={ 0,0,0,0,0,0,4681,7022,9362,11703,14043,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0 }; -short filt24_6_dcr[24] = { +short filt24_6_dcr[24] __attribute__((aligned(16))) ={ 0,0,0,0,0,0,2730,5461,8192,10922,13653,16384,14043,11703,9362,7022,4681,0,0,0,0,0,0,0 }; -short filt24_7[24] = { +short filt24_7[24] __attribute__((aligned(16))) ={ 0,0,0,0,0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0 }; -short filt24_7_dcl[24] = { +short filt24_7_dcl[24] __attribute__((aligned(16))) ={ 0,0,0,0,0,0,0,4681,7022,9362,11703,14043,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0 }; -short filt24_7_dcr[24] = { +short filt24_7_dcr[24] __attribute__((aligned(16))) ={ 0,0,0,0,0,0,0,2730,5461,8192,10922,13653,16384,14043,11703,9362,7022,4681,0,0,0,0,0,0 }; -short filt24_0l[24] = { +short filt24_0l[24] __attribute__((aligned(16))) ={ 30037,27306,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0,0,0 }; -short filt24_1l[24] = { +short filt24_1l[24] __attribute__((aligned(16))) ={ 0,30037,27306,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0,0 }; -short filt24_2l[24] = { +short filt24_2l[24] __attribute__((aligned(16))) ={ 0,0,30037,27306,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0 }; -short filt24_3l[24] = { +short filt24_3l[24] __attribute__((aligned(16))) ={ //0,0,0,30037,27306,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0}; 0,0,0,0,0,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0 }; -short filt24_4l[24] = { +short filt24_4l[24] __attribute__((aligned(16))) ={ 0,0,0,0,30037,27306,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0 }; -short filt24_5l[24] = { +short filt24_5l[24] __attribute__((aligned(16))) ={ 0,0,0,0,0,30037,27306,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0 }; -short filt24_6l[24] = { +short filt24_6l[24] __attribute__((aligned(16))) ={ 0,0,0,0,0,0,30037,27306,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0 }; -short filt24_7l[24] = { +short filt24_7l[24] __attribute__((aligned(16))) ={ 0,0,0,0,0,0,0,30037,27306,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0 }; -short filt24_0l2[24] = { +short filt24_0l2[24] __attribute__((aligned(16))) ={ 2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0,0,0 }; -short filt24_1l2[24] = { +short filt24_1l2[24] __attribute__((aligned(16))) ={ 0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0,0 }; -short filt24_2l2[24] = { +short filt24_2l2[24] __attribute__((aligned(16))) ={ -2730,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0 }; -short filt24_3l2[24] = { +short filt24_3l2[24] __attribute__((aligned(16))) ={ -5461,-2730,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0 }; -short filt24_4l2[24] = { +short filt24_4l2[24] __attribute__((aligned(16))) ={ -8192,-5461,-2730,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0 }; -short filt24_5l2[24] = { +short filt24_5l2[24] __attribute__((aligned(16))) ={ 0,-8192,-5461,-2730,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0 }; -short filt24_6l2[24] = { +short filt24_6l2[24] __attribute__((aligned(16))) ={ -13653,-10922,-8192,-5461,-2730,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0 }; -short filt24_7l2[24] = { +short 
filt24_7l2[24] __attribute__((aligned(16))) ={ 0,-13653,-10922,-8192,-5461,-2730,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0 }; -short filt24_0r[24] = { +short filt24_0r[24] __attribute__((aligned(16))) ={ 2730,5461,8192,10922,13653,16384,19114,21845,24576,27306,30037,0,0,0,0,0,0,0,0,0,0,0,0,0 }; -short filt24_1r[24] = { +short filt24_1r[24] __attribute__((aligned(16))) ={ 0,2730,5461,8192,10922,13653,16384,19114,21845,24576,27306,30037,0,0,0,0,0,0,0,0,0,0,0,0 }; -short filt24_2r[24] = { +short filt24_2r[24] __attribute__((aligned(16))) ={ 0,0,2730,5461,8192,10922,13653,16384,19114,21845,24576,27306,30037,0,0,0,0,0,0,0,0,0,0,0 }; -short filt24_3r[24] = { +short filt24_3r[24] __attribute__((aligned(16))) ={ 0,0,0,2730,5461,8192,10922,13653,16384,19114,21845,24576,27306,30037,0,0,0,0,0,0,0,0,0,0 }; -short filt24_4r[24] = { +short filt24_4r[24] __attribute__((aligned(16))) ={ 0,0,0,0,2730,5461,8192,10922,13653,16384,19114,21845,24576,27306,30037,0,0,0,0,0,0,0,0,0 }; -short filt24_5r[24] = { +short filt24_5r[24] __attribute__((aligned(16))) ={ 0,0,0,0,0,2730,5461,8192,10922,13653,16384,19114,21845,24576,27306,30037,0,0,0,0,0,0,0,0 }; -short filt24_6r[24] = { +short filt24_6r[24] __attribute__((aligned(16))) ={ 0,0,0,0,0,0,2730,5461,8192,10922,13653,16384,19114,21845,24576,27306,30037,0,0,0,0,0,0,0 }; -short filt24_7r[24] = { +short filt24_7r[24] __attribute__((aligned(16))) ={ 0,0,0,0,0,0,0,2730,5461,8192,10922,13653,16384,19114,21845,24576,27306,30037,0,0,0,0,0,0 }; -short filt24_0r2[24] = { /****/ +short filt24_0r2[24] __attribute__((aligned(16))) ={ /****/ 2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,-2730,-5461,-8192,-10922,-13653,0,0,0,0,0,0,0 }; -short filt24_1r2[24] = { +short filt24_1r2[24] __attribute__((aligned(16))) ={ 0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,-2730,-5461,-8192,-10922,-13653,0,0,0,0,0,0 }; -short filt24_2r2[24] = { +short filt24_2r2[24] __attribute__((aligned(16))) ={ 0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,-2730,-5461,-8192,-10922,-13653,0,0,0,0,0 }; -short filt24_3r2[24] = { +short filt24_3r2[24] __attribute__((aligned(16))) ={ 0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,-2730,-5461,-8192,-10922,-13653,0,0,0,0 }; -short filt24_4r2[24] = { +short filt24_4r2[24] __attribute__((aligned(16))) ={ 0,0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,-2730,-5461,-8192,-10922,-13653,0,0,0 }; -short filt24_5r2[24] = { +short filt24_5r2[24] __attribute__((aligned(16))) ={ 0,0,0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,-2730,-5461,-8192,-10922,-13653,0,0 }; -short filt24_6r2[24] = { +short filt24_6r2[24] __attribute__((aligned(16))) ={ 0,0,0,0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,-2730,-5461,-8192,-10922,-13653,0 }; -short filt24_7r2[24] = { +short filt24_7r2[24] __attribute__((aligned(16))) ={ 0,0,0,0,0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,-2730,-5461,-8192,-10922,-13653 }; diff --git a/openair1/PHY/LTE_ESTIMATION/freq_equalization.c b/openair1/PHY/LTE_ESTIMATION/freq_equalization.c index 6e71e23ec9..595d1e7a48 100755 --- a/openair1/PHY/LTE_ESTIMATION/freq_equalization.c +++ b/openair1/PHY/LTE_ESTIMATION/freq_equalization.c @@ -299,11 +299,17 @@ void freq_equalization(LTE_DL_FRAME_PARMS *frame_parms, { uint16_t re; int16_t amp; +#if defined(__x86_64__) || defined(__i386__) __m128i *ul_ch_mag128,*ul_ch_magb128,*rxdataF_comp128; - rxdataF_comp128 = (__m128i 
*)&rxdataF_comp[0][symbol*frame_parms->N_RB_DL*12]; ul_ch_mag128 = (__m128i *)&ul_ch_mag[0][symbol*frame_parms->N_RB_DL*12]; ul_ch_magb128 = (__m128i *)&ul_ch_magb[0][symbol*frame_parms->N_RB_DL*12]; +#elif defined(__arm__) + int16x8_t *ul_ch_mag128,*ul_ch_magb128,*rxdataF_comp128; + rxdataF_comp128 = (int16x8_t*)&rxdataF_comp[0][symbol*frame_parms->N_RB_DL*12]; + ul_ch_mag128 = (int16x8_t*)&ul_ch_mag[0][symbol*frame_parms->N_RB_DL*12]; + ul_ch_magb128 = (int16x8_t*)&ul_ch_magb[0][symbol*frame_parms->N_RB_DL*12]; +#endif for (re=0; re<(Msc_RS>>2); re++) { @@ -313,15 +319,25 @@ void freq_equalization(LTE_DL_FRAME_PARMS *frame_parms, amp=255; // printf("freq_eq: symbol %d re %d => %d,%d,%d, (%d) (%d,%d) => ",symbol,re,*((int16_t*)(&ul_ch_mag128[re])),amp,inv_ch[8*amp],*((int16_t*)(&ul_ch_mag128[re]))*inv_ch[8*amp],*(int16_t*)&(rxdataF_comp128[re]),*(1+(int16_t*)&(rxdataF_comp128[re]))); +#if defined(__x86_64__) || defined(__i386__) rxdataF_comp128[re] = _mm_mullo_epi16(rxdataF_comp128[re],*((__m128i *)&inv_ch[8*amp])); if (Qm==4) - ul_ch_mag128[re] = _mm_set1_epi16(324); // this is 512*2/sqrt(10) + ul_ch_mag128[re] = _mm_set1_epi16(324); // this is 512*2/sqrt(10) else { - ul_ch_mag128[re] = _mm_set1_epi16(316); // this is 512*4/sqrt(42) + ul_ch_mag128[re] = _mm_set1_epi16(316); // this is 512*4/sqrt(42) ul_ch_magb128[re] = _mm_set1_epi16(158); // this is 512*2/sqrt(42) } +#elif defined(__arm__) + rxdataF_comp128[re] = vmulq_s16(rxdataF_comp128[re],*((int16x8_t *)&inv_ch[8*amp])); + if (Qm==4) + ul_ch_mag128[re] = vdupq_n_s16(324); // this is 512*2/sqrt(10) + else { + ul_ch_mag128[re] = vdupq_n_s16(316); // this is 512*4/sqrt(42) + ul_ch_magb128[re] = vdupq_n_s16(158); // this is 512*2/sqrt(42) + } +#endif // printf("(%d,%d)\n",*(int16_t*)&(rxdataF_comp128[re]),*(1+(int16_t*)&(rxdataF_comp128[re]))); } diff --git a/openair1/PHY/LTE_ESTIMATION/lte_dl_channel_estimation.c b/openair1/PHY/LTE_ESTIMATION/lte_dl_channel_estimation.c index 27e9018477..6e853e6c58 100644 --- a/openair1/PHY/LTE_ESTIMATION/lte_dl_channel_estimation.c +++ b/openair1/PHY/LTE_ESTIMATION/lte_dl_channel_estimation.c @@ -49,7 +49,7 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, unsigned char nu,aarx; unsigned short k; unsigned int rb,pilot_cnt; - short ch[2],*pil,*rxF,*dl_ch,*dl_ch_prev,*f,*f2,*fl,*f2l2,*fr,*f2r2,*f2_dc,*f_dc; + int16_t ch[2],*pil,*rxF,*dl_ch,*dl_ch_prev,*f,*f2,*fl,*f2l2,*fr,*f2r2,*f2_dc,*f_dc; int ch_offset,symbol_offset; // unsigned int n; // int i; @@ -192,14 +192,13 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, for (aarx=0; aarx<phy_vars_ue->lte_frame_parms.nb_antennas_rx; aarx++) { - pil = (short *)&pilot[p][0]; - rxF = (short *)&rxdataF[aarx][((symbol_offset+k+phy_vars_ue->lte_frame_parms.first_carrier_offset))]; - dl_ch = (short *)&dl_ch_estimates[(p<<1)+aarx][ch_offset]; + pil = (int16_t *)&pilot[p][0]; + rxF = (int16_t *)&rxdataF[aarx][((symbol_offset+k+phy_vars_ue->lte_frame_parms.first_carrier_offset))]; + dl_ch = (int16_t *)&dl_ch_estimates[(p<<1)+aarx][ch_offset]; // if (eNb_id==0) memset(dl_ch,0,4*(phy_vars_ue->lte_frame_parms.ofdm_symbol_size)); - if (phy_vars_ue->high_speed_flag==0) // multiply previous channel estimate by ch_est_alpha multadd_complex_vector_real_scalar(dl_ch-(phy_vars_ue->lte_frame_parms.ofdm_symbol_size<<1), phy_vars_ue->ch_est_alpha,dl_ch-(phy_vars_ue->lte_frame_parms.ofdm_symbol_size<<1), @@ -212,8 +211,8 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, //First half of pilots // Treat first 2 pilots specially (left edge) - ch[0] = 
(short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); // printf("pilot 0 : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]); multadd_real_vector_complex_scalar(fl, ch, @@ -223,8 +222,8 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, rxF+=12; dl_ch+=8; - ch[0] = (short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); // printf("pilot 1 : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]); multadd_real_vector_complex_scalar(f2l2, ch, @@ -236,15 +235,15 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, for (pilot_cnt=2; pilot_cnt<((phy_vars_ue->lte_frame_parms.N_RB_DL)-1); pilot_cnt+=2) { - // printf("%d\n",dl_ch-(short *)&dl_ch_estimates[(p<<1)+aarx][ch_offset]); + // printf("%d\n",dl_ch-(int16_t *)&dl_ch_estimates[(p<<1)+aarx][ch_offset]); // printf("pilot[%d][%d] (%d,%d)\n",p,pilot_cnt,pil[0],pil[1]); // printf("rx[%d] -> (%d,%d)\n", k, rxF[0], rxF[1]); - ch[0] = (short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); //Re - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); //Im - // printf("**rb %d %d\n",rb,dl_ch-(short *)&dl_ch_estimates[(p<<1)+aarx][ch_offset]); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); //Re + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); //Im + // printf("**rb %d %d\n",rb,dl_ch-(int16_t *)&dl_ch_estimates[(p<<1)+aarx][ch_offset]); multadd_real_vector_complex_scalar(f, ch, dl_ch, @@ -259,9 +258,9 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, // printf("rx[%d] -> (%d,%d)\n", k+6, rxF[0], rxF[1]); - ch[0] = (short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); - // printf("**rb %d %d\n",rb,dl_ch-(short *)&dl_ch_estimates[(p<<1)+aarx][ch_offset]); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); + // printf("**rb %d %d\n",rb,dl_ch-(int16_t *)&dl_ch_estimates[(p<<1)+aarx][ch_offset]); multadd_real_vector_complex_scalar(f2, ch, dl_ch, @@ -280,17 +279,17 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, if (k > 6) k -=6; - rxF = (short *)&rxdataF[aarx][((symbol_offset+1+k))]; + rxF = (int16_t *)&rxdataF[aarx][((symbol_offset+1+k))]; for (pilot_cnt=0; pilot_cnt<((phy_vars_ue->lte_frame_parms.N_RB_DL)-3); pilot_cnt+=2) { // printf("pilot[%d][%d] (%d,%d)\n",p,pilot_cnt,pil[0],pil[1]); // printf("rx[%d] -> (%d,%d)\n", k+6, rxF[0], rxF[1]); - ch[0] = (short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); - // printf("**rb %d %d\n",rb,dl_ch-(short *)&dl_ch_estimates[(p<<1)+aarx][ch_offset]); + // printf("**rb %d %d\n",rb,dl_ch-(int16_t *)&dl_ch_estimates[(p<<1)+aarx][ch_offset]); multadd_real_vector_complex_scalar(f, ch, dl_ch, @@ -299,10 +298,10 @@ int 
lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, rxF+=12; dl_ch+=8; - ch[0] = (short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); - // printf("**rb %d %d\n",rb,dl_ch-(short *)&dl_ch_estimates[(p<<1)+aarx][ch_offset]); + // printf("**rb %d %d\n",rb,dl_ch-(int16_t *)&dl_ch_estimates[(p<<1)+aarx][ch_offset]); multadd_real_vector_complex_scalar(f2, ch, dl_ch, @@ -313,8 +312,8 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, } - ch[0] = (short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); // printf("pilot 49: rxF -> (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]); multadd_real_vector_complex_scalar(fr, @@ -325,8 +324,8 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, rxF+=12; dl_ch+=8; - ch[0] = (short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); // printf("pilot 50: rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]); multadd_real_vector_complex_scalar(f2r2, ch, @@ -340,8 +339,8 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, //printf("Channel estimation\n"); // Treat first 2 pilots specially (left edge) - ch[0] = (short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); #ifdef DEBUG_CH printf("pilot 0 : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]); @@ -358,8 +357,8 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, rxF+=12; dl_ch+=8; - ch[0] = (short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); #ifdef DEBUG_CH printf("pilot 1 : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]); @@ -381,8 +380,8 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, // printf("pilot[%d][%d] (%d,%d)\n",p,rb,pil[0],pil[1]); // printf("rx[%d][%d] -> (%d,%d)\n",p,phy_vars_ue->lte_frame_parms.first_carrier_offset + phy_vars_ue->lte_frame_parms.nushift + 6*rb+(3*p),rxF[0],rxF[1]); - ch[0] = (short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); #ifdef DEBUG_CH printf("pilot %d : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",pilot_cnt,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]); @@ -400,8 +399,8 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, rxF+=12; dl_ch+=8; - ch[0] = (short)(((int)pil[0]*rxF[0] -
(int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); #ifdef DEBUG_CH printf("pilot %d : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",pilot_cnt+1,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]); @@ -419,8 +418,8 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, } - ch[0] = (short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); #ifdef DEBUG_CH printf("pilot 24: rxF -> (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]); @@ -438,10 +437,10 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, // printf("Second half\n"); // Second half of RBs - rxF = (short *)&rxdataF[aarx][((symbol_offset+1+k))]; + rxF = (int16_t *)&rxdataF[aarx][((symbol_offset+1+k))]; - ch[0] = (short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); #ifdef DEBUG_CH printf("pilot 25: rxF -> (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]); @@ -462,8 +461,8 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, // printf("* pilot[%d][%d] (%d,%d)\n",p,rb,pil[0],pil[1]); // printf("rx[%d][%d] -> (%d,%d)\n",p,phy_vars_ue->lte_frame_parms.first_carrier_offset + phy_vars_ue->lte_frame_parms.nushift + 6*rb+(3*p),rxF[0],rxF[1]); - ch[0] = (short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); #ifdef DEBUG_CH printf("pilot %d rxF -> (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",26+pilot_cnt,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]); @@ -479,8 +478,8 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, rxF+=12; dl_ch+=8; - ch[0] = (short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); #ifdef DEBUG_CH printf("pilot %d : rxF -> (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",27+pilot_cnt,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]); @@ -498,8 +497,8 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, } - ch[0] = (short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); #ifdef DEBUG_CH printf("pilot 49: rxF -> (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]); @@ -517,8 +516,8 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, rxF+=12; dl_ch+=8; - ch[0] = (short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = 
(int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); #ifdef DEBUG_CH @@ -544,8 +543,8 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, // phy_vars_ue->lte_frame_parms.first_carrier_offset + phy_vars_ue->lte_frame_parms.nushift + 6*rb+(3*p), // rxF[0], // rxF[1]); - ch[0] = (short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); //printf("ch -> (%d,%d)\n",ch[0],ch[1]); multadd_real_vector_complex_scalar(f, ch, @@ -555,8 +554,8 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, rxF+=12; dl_ch+=8; - ch[0] = (short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); //printf("ch -> (%d,%d)\n",ch[0],ch[1]); multadd_real_vector_complex_scalar(f2, ch, @@ -568,8 +567,8 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, } - ch[0] = (short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); // printf("ch -> (%d,%d)\n",ch[0],ch[1]); multadd_real_vector_complex_scalar(f, ch, @@ -580,9 +579,9 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, //printf("Second half\n"); //Second half of RBs - rxF = (short *)&rxdataF[aarx][((symbol_offset+1+nushift + (3*p)))]; - ch[0] = (short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); + rxF = (int16_t *)&rxdataF[aarx][((symbol_offset+1+nushift + (3*p)))]; + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); multadd_real_vector_complex_scalar(f2, ch, @@ -599,8 +598,8 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, // phy_vars_ue->lte_frame_parms.first_carrier_offset + phy_vars_ue->lte_frame_parms.nushift + 6*rb+(3*p), // rxF[0], // rxF[1]); - ch[0] = (short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); multadd_real_vector_complex_scalar(f, ch, @@ -610,8 +609,8 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, rxF+=12; dl_ch+=8; - ch[0] = (short)(((int)pil[0]*rxF[0] - (int)pil[1]*rxF[1])>>15); - ch[1] = (short)(((int)pil[0]*rxF[1] + (int)pil[1]*rxF[0])>>15); + ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); + ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); multadd_real_vector_complex_scalar(f2, ch, @@ -631,7 +630,7 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, // Temporal Interpolation // printf("ch_offset %d\n",ch_offset); - dl_ch = (short *)&dl_ch_estimates[(p<<1)+aarx][ch_offset]; + dl_ch = (int16_t *)&dl_ch_estimates[(p<<1)+aarx][ch_offset]; if (phy_vars_ue->high_speed_flag == 0) { multadd_complex_vector_real_scalar(dl_ch, @@ -640,8 +639,8 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, } else { // 
high_speed_flag == 1 if (symbol == 0) { // printf("Interpolating %d->0\n",4-phy_vars_ue->lte_frame_parms.Ncp); - // dl_ch_prev = (short *)&dl_ch_estimates[(p<<1)+aarx][(4-phy_vars_ue->lte_frame_parms.Ncp)*(phy_vars_ue->lte_frame_parms.ofdm_symbol_size)]; - dl_ch_prev = (short *)&dl_ch_estimates[(p<<1)+aarx][pilot3*(phy_vars_ue->lte_frame_parms.ofdm_symbol_size)]; + // dl_ch_prev = (int16_t *)&dl_ch_estimates[(p<<1)+aarx][(4-phy_vars_ue->lte_frame_parms.Ncp)*(phy_vars_ue->lte_frame_parms.ofdm_symbol_size)]; + dl_ch_prev = (int16_t *)&dl_ch_estimates[(p<<1)+aarx][pilot3*(phy_vars_ue->lte_frame_parms.ofdm_symbol_size)]; multadd_complex_vector_real_scalar(dl_ch_prev,21845,dl_ch_prev+(2*(phy_vars_ue->lte_frame_parms.ofdm_symbol_size)),1,phy_vars_ue->lte_frame_parms.ofdm_symbol_size); multadd_complex_vector_real_scalar(dl_ch,10923,dl_ch_prev+(2*(phy_vars_ue->lte_frame_parms.ofdm_symbol_size)),0,phy_vars_ue->lte_frame_parms.ofdm_symbol_size); @@ -650,7 +649,7 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, multadd_complex_vector_real_scalar(dl_ch,21845,dl_ch_prev+(2*((phy_vars_ue->lte_frame_parms.ofdm_symbol_size)<<1)),0,phy_vars_ue->lte_frame_parms.ofdm_symbol_size); } // this is 1/3,2/3 combination for pilots spaced by 3 symbols else if (symbol == pilot1) { - dl_ch_prev = (short *)&dl_ch_estimates[(p<<1)+aarx][0]; + dl_ch_prev = (int16_t *)&dl_ch_estimates[(p<<1)+aarx][0]; if (phy_vars_ue->lte_frame_parms.Ncp==0) {// pilot spacing 4 symbols (1/4,1/2,3/4 combination) multadd_complex_vector_real_scalar(dl_ch_prev,24576,dl_ch_prev+(2*(phy_vars_ue->lte_frame_parms.ofdm_symbol_size)),1,phy_vars_ue->lte_frame_parms.ofdm_symbol_size); @@ -669,7 +668,7 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, multadd_complex_vector_real_scalar(dl_ch,10923,dl_ch_prev+(2*((phy_vars_ue->lte_frame_parms.ofdm_symbol_size)<<1)),0,phy_vars_ue->lte_frame_parms.ofdm_symbol_size); } // pilot spacing 3 symbols (1/3,2/3 combination) } else if (symbol == pilot2) { - dl_ch_prev = (short *)&dl_ch_estimates[(p<<1)+aarx][pilot1*(phy_vars_ue->lte_frame_parms.ofdm_symbol_size)]; + dl_ch_prev = (int16_t *)&dl_ch_estimates[(p<<1)+aarx][pilot1*(phy_vars_ue->lte_frame_parms.ofdm_symbol_size)]; multadd_complex_vector_real_scalar(dl_ch_prev,21845,dl_ch_prev+(2*(phy_vars_ue->lte_frame_parms.ofdm_symbol_size)),1,phy_vars_ue->lte_frame_parms.ofdm_symbol_size); multadd_complex_vector_real_scalar(dl_ch,10923,dl_ch_prev+(2*(phy_vars_ue->lte_frame_parms.ofdm_symbol_size)),0,phy_vars_ue->lte_frame_parms.ofdm_symbol_size); @@ -678,7 +677,7 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, multadd_complex_vector_real_scalar(dl_ch,21845,dl_ch_prev+(2*((phy_vars_ue->lte_frame_parms.ofdm_symbol_size)<<1)),0,phy_vars_ue->lte_frame_parms.ofdm_symbol_size); } else { // symbol == pilot3 // printf("Interpolating 0->%d\n",4-phy_vars_ue->lte_frame_parms.Ncp); - dl_ch_prev = (short *)&dl_ch_estimates[(p<<1)+aarx][pilot2*(phy_vars_ue->lte_frame_parms.ofdm_symbol_size)]; + dl_ch_prev = (int16_t *)&dl_ch_estimates[(p<<1)+aarx][pilot2*(phy_vars_ue->lte_frame_parms.ofdm_symbol_size)]; if (phy_vars_ue->lte_frame_parms.Ncp==0) {// pilot spacing 4 symbols (1/4,1/2,3/4 combination) multadd_complex_vector_real_scalar(dl_ch_prev,24576,dl_ch_prev+(2*(phy_vars_ue->lte_frame_parms.ofdm_symbol_size)),1,phy_vars_ue->lte_frame_parms.ofdm_symbol_size); diff --git a/openair1/PHY/LTE_ESTIMATION/lte_est_freq_offset.c b/openair1/PHY/LTE_ESTIMATION/lte_est_freq_offset.c index 77e5be8500..b11ba532f5 100644 --- 
a/openair1/PHY/LTE_ESTIMATION/lte_est_freq_offset.c +++ b/openair1/PHY/LTE_ESTIMATION/lte_est_freq_offset.c @@ -35,7 +35,11 @@ #include "PHY/defs.h" //#define DEBUG_PHY +#if defined(__x86_64__) || defined(__i386__) __m128i avg128F; +#elif defined(__arm__) +int32x4_t avg128F; +#endif //compute average channel_level on each (TX,RX) antenna pair int dl_channel_level(int16_t *dl_ch, @@ -43,10 +47,15 @@ int dl_channel_level(int16_t *dl_ch, { int16_t rb; +#if defined(__x86_64__) || defined(__i386__) __m128i *dl_ch128; +#elif defined(__arm__) + int16x4_t *dl_ch128; +#endif int avg; //clear average level +#if defined(__x86_64__) || defined(__i386__) avg128F = _mm_setzero_si128(); dl_ch128=(__m128i *)dl_ch; @@ -59,7 +68,25 @@ int dl_channel_level(int16_t *dl_ch, dl_ch128+=3; } +#elif defined(__arm__) + avg128F = vdupq_n_s32(0); + dl_ch128=(int16x4_t *)dl_ch; + for (rb=0; rb<frame_parms->N_RB_DL; rb++) { + + avg128F = vqaddq_s32(avg128F,vmull_s16(dl_ch128[0],dl_ch128[0])); + avg128F = vqaddq_s32(avg128F,vmull_s16(dl_ch128[1],dl_ch128[1])); + avg128F = vqaddq_s32(avg128F,vmull_s16(dl_ch128[2],dl_ch128[2])); + avg128F = vqaddq_s32(avg128F,vmull_s16(dl_ch128[3],dl_ch128[3])); + avg128F = vqaddq_s32(avg128F,vmull_s16(dl_ch128[4],dl_ch128[4])); + avg128F = vqaddq_s32(avg128F,vmull_s16(dl_ch128[5],dl_ch128[5])); + dl_ch128+=6; + + + } + + +#endif DevAssert( frame_parms->N_RB_DL ); avg = (((int*)&avg128F)[0] + ((int*)&avg128F)[1] + @@ -67,10 +94,10 @@ int dl_channel_level(int16_t *dl_ch, ((int*)&avg128F)[3])/(frame_parms->N_RB_DL*12); - +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); - +#endif return(avg); } diff --git a/openair1/PHY/LTE_ESTIMATION/lte_sync_timefreq.c b/openair1/PHY/LTE_ESTIMATION/lte_sync_timefreq.c index 6a5a6eb3c1..37f5d26315 100644 --- a/openair1/PHY/LTE_ESTIMATION/lte_sync_timefreq.c +++ b/openair1/PHY/LTE_ESTIMATION/lte_sync_timefreq.c @@ -46,14 +46,15 @@ //#include "defs.h" #include "PHY/defs.h" #include "PHY/extern.h" -#include "pss6144.h" - +#if defined(__x86_64__) || defined(__i386__) +#include "pss6144.h" extern void print_shorts(char*,__m128i*); +#endif void lte_sync_timefreq(PHY_VARS_UE *ue,int band,unsigned int DL_freq) { - +#if defined(__x86_64__) || defined(__i386__) UE_SCAN_INFO_t *scan_info = &ue->scan_info[band]; int16_t spectrum[12288] __attribute__((aligned(16))); int16_t spectrum_p5ms[12288] __attribute__((aligned(16))); @@ -358,5 +359,6 @@ void lte_sync_timefreq(PHY_VARS_UE *ue,int band,unsigned int DL_freq) for (band_idx=0; band_idx<10; band_idx++) printf("pss 2: level %d dB, freq %u\n", dB_fixed(scan_info->amp[2][band_idx]),scan_info->freq_offset_Hz[2][band_idx]); +#endif } diff --git a/openair1/PHY/LTE_ESTIMATION/lte_ue_measurements.c b/openair1/PHY/LTE_ESTIMATION/lte_ue_measurements.c index dbd385dbdc..dcef22dc99 100644 --- a/openair1/PHY/LTE_ESTIMATION/lte_ue_measurements.c +++ b/openair1/PHY/LTE_ESTIMATION/lte_ue_measurements.c @@ -42,34 +42,26 @@ //#define DEBUG_MEAS #ifdef USER_MODE -void print_shorts(char *s,__m128i *x) +void print_shorts(char *s,short *x) { - short *tempb = (short *)x; printf("%s : %d,%d,%d,%d,%d,%d,%d,%d\n",s, - tempb[0],tempb[1],tempb[2],tempb[3],tempb[4],tempb[5],tempb[6],tempb[7] + x[0],x[1],x[2],x[3],x[4],x[5],x[6],x[7] ); } -void print_ints(char *s,__m128i *x) +void print_ints(char *s,int *x) { - int *tempb = (int *)x; printf("%s : %d,%d,%d,%d\n",s, - tempb[0],tempb[1],tempb[2],tempb[3] + x[0],x[1],x[2],x[3] ); } #endif -__m128i pmi128_re __attribute__ ((aligned(16))); -__m128i pmi128_im __attribute__ 
((aligned(16))); -__m128i mmtmpPMI0 __attribute__ ((aligned(16))); -__m128i mmtmpPMI1 __attribute__ ((aligned(16))); -__m128i mmtmpPMI2 __attribute__ ((aligned(16))); -__m128i mmtmpPMI3 __attribute__ ((aligned(16))); int16_t get_PL(uint8_t Mod_id,uint8_t CC_id,uint8_t eNB_index) { @@ -421,7 +413,11 @@ void lte_ue_measurements(PHY_VARS_UE *phy_vars_ue, //int rx_power[NUMBER_OF_CONNECTED_eNB_MAX]; int i; unsigned int limit,subband; +#if defined(__x86_64__) || defined(__i386__) __m128i *dl_ch0_128,*dl_ch1_128; +#elif defined(__arm__) + int16x8_t *dl_ch0_128, *dl_ch1_128; +#endif int *dl_ch0,*dl_ch1; LTE_DL_FRAME_PARMS *frame_parms = &phy_vars_ue->lte_frame_parms; int nb_subbands,subband_size,last_subband_size; @@ -605,26 +601,30 @@ void lte_ue_measurements(PHY_VARS_UE *phy_vars_ue, for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) { // skip the first 4 RE due to interpolation filter length of 5 (not possible to skip 5 due to 128i alignment, must be multiple of 128bit) + +#if defined(__x86_64__) || defined(__i386__) + __m128i pmi128_re,pmi128_im,mmtmpPMI0,mmtmpPMI1,mmtmpPMI2,mmtmpPMI3; + dl_ch0_128 = (__m128i *)&phy_vars_ue->lte_ue_common_vars.dl_ch_estimates[eNB_id][aarx][4]; dl_ch1_128 = (__m128i *)&phy_vars_ue->lte_ue_common_vars.dl_ch_estimates[eNB_id][2+aarx][4]; +#elif defined(__arm__) + int32x4_t pmi128_re,pmi128_im,mmtmpPMI0,mmtmpPMI1,mmtmpPMI0b,mmtmpPMI1b; - /* - #ifdef DEBUG_PHY - if(eNB_id==0){ - print_shorts("Ch0",dl_ch0_128); - print_shorts("Ch1",dl_ch1_128); - printf("eNB_ID = %d\n",eNB_id); - } - #endif - */ + dl_ch0_128 = (int16x8_t *)&phy_vars_ue->lte_ue_common_vars.dl_ch_estimates[eNB_id][aarx][4]; + dl_ch1_128 = (int16x8_t *)&phy_vars_ue->lte_ue_common_vars.dl_ch_estimates[eNB_id][2+aarx][4]; + +#endif for (subband=0; subband<nb_subbands; subband++) { // pmi - +#if defined(__x86_64__) || defined(__i386__) pmi128_re = _mm_setzero_si128(); pmi128_im = _mm_setzero_si128(); - +#elif defined(__arm__) + pmi128_re = vdupq_n_s32(0); + pmi128_im = vdupq_n_s32(0); +#endif // limit is the number of groups of 4 REs in a subband (12 = 4 RBs, 3 = 1 RB) // for 5 MHz channelization, there are 7 subbands, 6 of size 4 RBs and 1 of size 1 RB if ((N_RB_DL==6) || (subband<(nb_subbands-1))) @@ -636,52 +636,33 @@ void lte_ue_measurements(PHY_VARS_UE *phy_vars_ue, // For each RE in subband perform ch0 * conj(ch1) // multiply by conjugated channel - // if(eNB_id==0){ - //print_shorts("ch0",dl_ch0_128); - //print_shorts("ch1",dl_ch1_128); - // } - // if(i==0){ - mmtmpPMI0 = _mm_setzero_si128(); - mmtmpPMI1 = _mm_setzero_si128(); - // } - // if(eNB_id==0) - // print_ints("Pre_re",&mmtmpPMI0); - - mmtmpPMI0 = _mm_madd_epi16(dl_ch0_128[0],dl_ch1_128[0]); - // if(eNB_id==0) - // print_ints("re",&mmtmpPMI0); - - // mmtmpPMI0 contains real part of 4 consecutive outputs (32-bit) - // print_shorts("Ch1",dl_ch1_128); - +#if defined(__x86_64__) || defined(__i386__) mmtmpPMI1 = _mm_shufflelo_epi16(dl_ch1_128[0],_MM_SHUFFLE(2,3,0,1));//_MM_SHUFFLE(2,3,0,1) - // print_shorts("mmtmpPMI1:",&mmtmpPMI1); mmtmpPMI1 = _mm_shufflehi_epi16(mmtmpPMI1,_MM_SHUFFLE(2,3,0,1)); - // print_shorts("mmtmpPMI1:",&mmtmpPMI1); - mmtmpPMI1 = _mm_sign_epi16(mmtmpPMI1,*(__m128i*)&conjugate[0]); - // print_shorts("mmtmpPMI1:",&mmtmpPMI1); mmtmpPMI1 = _mm_madd_epi16(mmtmpPMI1,dl_ch0_128[0]); - // if(eNB_id==0) - // print_ints("im",&mmtmpPMI1); // mmtmpPMI1 contains imag part of 4 consecutive outputs (32-bit) pmi128_re = _mm_add_epi32(pmi128_re,mmtmpPMI0); pmi128_im = _mm_add_epi32(pmi128_im,mmtmpPMI1); +#elif defined(__arm__) + 
mmtmpPMI0 = vmull_s16(((int16x4_t*)dl_ch0_128)[0], ((int16x4_t*)dl_ch1_128)[0]); + mmtmpPMI1 = vmull_s16(((int16x4_t*)dl_ch0_128)[1], ((int16x4_t*)dl_ch1_128)[1]); + pmi128_re = vqaddq_s32(pmi128_re,vcombine_s32(vpadd_s32(vget_low_s32(mmtmpPMI0),vget_high_s32(mmtmpPMI0)),vpadd_s32(vget_low_s32(mmtmpPMI1),vget_high_s32(mmtmpPMI1)))); + + mmtmpPMI0b = vmull_s16(vrev32_s16(vmul_s16(((int16x4_t*)dl_ch0_128)[0],*(int16x4_t*)conjugate)), ((int16x4_t*)dl_ch1_128)[0]); + mmtmpPMI1b = vmull_s16(vrev32_s16(vmul_s16(((int16x4_t*)dl_ch0_128)[1],*(int16x4_t*)conjugate)), ((int16x4_t*)dl_ch1_128)[1]); + pmi128_im = vqaddq_s32(pmi128_im,vcombine_s32(vpadd_s32(vget_low_s32(mmtmpPMI0b),vget_high_s32(mmtmpPMI0b)),vpadd_s32(vget_low_s32(mmtmpPMI1b),vget_high_s32(mmtmpPMI1b)))); + +#endif dl_ch0_128++; dl_ch1_128++; } phy_vars_ue->PHY_measurements.subband_pmi_re[eNB_id][subband][aarx] = (((int *)&pmi128_re)[0] + ((int *)&pmi128_re)[1] + ((int *)&pmi128_re)[2] + ((int *)&pmi128_re)[3])>>2; - // if(eNB_id==0) - // printf("in lte_ue_measurements.c: pmi_re %d\n",phy_vars_ue->PHY_measurements.subband_pmi_re[eNB_id][subband][aarx]); phy_vars_ue->PHY_measurements.subband_pmi_im[eNB_id][subband][aarx] = (((int *)&pmi128_im)[0] + ((int *)&pmi128_im)[1] + ((int *)&pmi128_im)[2] + ((int *)&pmi128_im)[3])>>2; - // if(eNB_id==0) - // printf("in lte_ue_measurements.c: pmi_im %d\n",phy_vars_ue->PHY_measurements.subband_pmi_im[eNB_id][subband][aarx]); phy_vars_ue->PHY_measurements.wideband_pmi_re[eNB_id][aarx] += phy_vars_ue->PHY_measurements.subband_pmi_re[eNB_id][subband][aarx]; phy_vars_ue->PHY_measurements.wideband_pmi_im[eNB_id][aarx] += phy_vars_ue->PHY_measurements.subband_pmi_im[eNB_id][subband][aarx]; - // msg("subband_pmi[%d][%d][%d] => (%d,%d)\n",eNB_id,subband,aarx,phy_vars_ue->PHY_measurements.subband_pmi_re[eNB_id][subband][aarx],phy_vars_ue->PHY_measurements.subband_pmi_im[eNB_id][subband][aarx]); - } // subband loop } // rx antenna loop } // if frame_parms->mode1_flag == 0 @@ -742,9 +723,10 @@ void lte_ue_measurements(PHY_VARS_UE *phy_vars_ue, // printf("in lte_ue_measurements: selected rx_antenna[eNB_id==0]:%u\n", phy_vars_ue->PHY_measurements.selected_rx_antennas[eNB_id][i]); } // eNB_id loop +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); - +#endif } diff --git a/openair1/PHY/LTE_ESTIMATION/lte_ul_channel_estimation.c b/openair1/PHY/LTE_ESTIMATION/lte_ul_channel_estimation.c index a569ddfc71..9a47a6b355 100644 --- a/openair1/PHY/LTE_ESTIMATION/lte_ul_channel_estimation.c +++ b/openair1/PHY/LTE_ESTIMATION/lte_ul_channel_estimation.c @@ -106,9 +106,13 @@ int32_t lte_ul_channel_estimation(PHY_VARS_eNB *phy_vars_eNB, *temp_out_fft_1_ptr = (int32_t*)0,*out_fft_ptr_1 = (int32_t*)0, *temp_in_ifft_ptr = (int32_t*)0; +#if defined(__x86_64__) || defined(__i386__) __m128i *rxdataF128,*ul_ref128,*ul_ch128; __m128i mmtmpU0,mmtmpU1,mmtmpU2,mmtmpU3; - +#elif defined(__arm__) + int16x8_t *rxdataF128,*ul_ref128,*ul_ch128; + int32x4_t mmtmp0,mmtmp1,mmtmp_re,mmtmp_im; +#endif Msc_RS = N_rb_alloc*12; cyclic_shift = (frame_parms->pusch_config_common.ul_ReferenceSignalsPUSCH.cyclicShift + @@ -156,11 +160,18 @@ int32_t lte_ul_channel_estimation(PHY_VARS_eNB *phy_vars_eNB, for (aa=0; aa<nb_antennas_rx; aa++) { // msg("Componentwise prod aa %d, symbol_offset %d,ul_ch_estimates %p,ul_ch_estimates[aa] %p,ul_ref_sigs_rx[0][0][Msc_RS_idx] %p\n",aa,symbol_offset,ul_ch_estimates,ul_ch_estimates[aa],ul_ref_sigs_rx[0][0][Msc_RS_idx]); +#if defined(__x86_64__) || defined(__i386__) rxdataF128 = (__m128i 
*)&rxdataF_ext[aa][symbol_offset]; ul_ch128 = (__m128i *)&ul_ch_estimates[aa][symbol_offset]; ul_ref128 = (__m128i *)ul_ref_sigs_rx[u][v][Msc_RS_idx]; +#elif defined(__arm__) + rxdataF128 = (int16x8_t *)&rxdataF_ext[aa][symbol_offset]; + ul_ch128 = (int16x8_t *)&ul_ch_estimates[aa][symbol_offset]; + ul_ref128 = (int16x8_t *)ul_ref_sigs_rx[u][v][Msc_RS_idx]; +#endif for (i=0; i<Msc_RS/12; i++) { +#if defined(__x86_64__) || defined(__i386__) // multiply by conjugated channel mmtmpU0 = _mm_madd_epi16(ul_ref128[0],rxdataF128[0]); // mmtmpU0 contains real part of 4 consecutive outputs (32-bit) @@ -204,7 +215,50 @@ int32_t lte_ul_channel_estimation(PHY_VARS_eNB *phy_vars_eNB, mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1); ul_ch128[2] = _mm_packs_epi32(mmtmpU2,mmtmpU3); +#elif defined(__arm__) + mmtmp0 = vmull_s16(((int16x4_t*)ul_ref128)[0],((int16x4_t*)rxdataF128)[0]); + mmtmp1 = vmull_s16(((int16x4_t*)ul_ref128)[1],((int16x4_t*)rxdataF128)[1]); + mmtmp_re = vcombine_s32(vpadd_s32(vget_low_s32(mmtmp0),vget_high_s32(mmtmp0)), + vpadd_s32(vget_low_s32(mmtmp1),vget_high_s32(mmtmp1))); + mmtmp0 = vmull_s16(vrev32_s16(vmul_s16(((int16x4_t*)ul_ref128)[0],*(int16x4_t*)conjugate)), ((int16x4_t*)rxdataF128)[0]); + mmtmp1 = vmull_s16(vrev32_s16(vmul_s16(((int16x4_t*)ul_ref128)[1],*(int16x4_t*)conjugate)), ((int16x4_t*)rxdataF128)[1]); + mmtmp_im = vcombine_s32(vpadd_s32(vget_low_s32(mmtmp0),vget_high_s32(mmtmp0)), + vpadd_s32(vget_low_s32(mmtmp1),vget_high_s32(mmtmp1))); + + ul_ch128[0] = vcombine_s16(vmovn_s32(mmtmp_re),vmovn_s32(mmtmp_im)); + ul_ch128++; + ul_ref128++; + rxdataF128++; + mmtmp0 = vmull_s16(((int16x4_t*)ul_ref128)[0],((int16x4_t*)rxdataF128)[0]); + mmtmp1 = vmull_s16(((int16x4_t*)ul_ref128)[1],((int16x4_t*)rxdataF128)[1]); + mmtmp_re = vcombine_s32(vpadd_s32(vget_low_s32(mmtmp0),vget_high_s32(mmtmp0)), + vpadd_s32(vget_low_s32(mmtmp1),vget_high_s32(mmtmp1))); + mmtmp0 = vmull_s16(vrev32_s16(vmul_s16(((int16x4_t*)ul_ref128)[0],*(int16x4_t*)conjugate)), ((int16x4_t*)rxdataF128)[0]); + mmtmp1 = vmull_s16(vrev32_s16(vmul_s16(((int16x4_t*)ul_ref128)[1],*(int16x4_t*)conjugate)), ((int16x4_t*)rxdataF128)[1]); + mmtmp_im = vcombine_s32(vpadd_s32(vget_low_s32(mmtmp0),vget_high_s32(mmtmp0)), + vpadd_s32(vget_low_s32(mmtmp1),vget_high_s32(mmtmp1))); + + ul_ch128[0] = vcombine_s16(vmovn_s32(mmtmp_re),vmovn_s32(mmtmp_im)); + ul_ch128++; + ul_ref128++; + rxdataF128++; + + mmtmp0 = vmull_s16(((int16x4_t*)ul_ref128)[0],((int16x4_t*)rxdataF128)[0]); + mmtmp1 = vmull_s16(((int16x4_t*)ul_ref128)[1],((int16x4_t*)rxdataF128)[1]); + mmtmp_re = vcombine_s32(vpadd_s32(vget_low_s32(mmtmp0),vget_high_s32(mmtmp0)), + vpadd_s32(vget_low_s32(mmtmp1),vget_high_s32(mmtmp1))); + mmtmp0 = vmull_s16(vrev32_s16(vmul_s16(((int16x4_t*)ul_ref128)[0],*(int16x4_t*)conjugate)), ((int16x4_t*)rxdataF128)[0]); + mmtmp1 = vmull_s16(vrev32_s16(vmul_s16(((int16x4_t*)ul_ref128)[1],*(int16x4_t*)conjugate)), ((int16x4_t*)rxdataF128)[1]); + mmtmp_im = vcombine_s32(vpadd_s32(vget_low_s32(mmtmp0),vget_high_s32(mmtmp0)), + vpadd_s32(vget_low_s32(mmtmp1),vget_high_s32(mmtmp1))); + + ul_ch128[0] = vcombine_s16(vmovn_s32(mmtmp_re),vmovn_s32(mmtmp_im)); + ul_ch128++; + ul_ref128++; + rxdataF128++; + +#endif +#if defined(__x86_64__) || defined(__i386__) // NEON path above has already advanced these pointers by 3 ul_ch128+=3; ul_ref128+=3; rxdataF128+=3; +#endif @@ -538,17 +592,17 @@ int32_t lte_ul_channel_estimation(PHY_VARS_eNB *phy_vars_eNB, // msg("sym: %d, current_phase1: %d, ru: %d + j%d, current_phase2: %d, ru: %d + j%d\n",k,current_phase1,ru1[2*current_phase1],ru1[2*current_phase1+1],current_phase2,ru2[2*current_phase2],ru2[2*current_phase2+1]); //
rotate channel estimates by estimated phase - rotate_cpx_vector_norep((int16_t*) ul_ch1, - &ru1[2*current_phase1], - (int16_t*) &ul_ch_estimates[aa][frame_parms->N_RB_UL*12*k], - Msc_RS, - 15); - - rotate_cpx_vector_norep((int16_t*) ul_ch2, - &ru2[2*current_phase2], - (int16_t*) &tmp_estimates[0], - Msc_RS, - 15); + rotate_cpx_vector((int16_t*) ul_ch1, + &ru1[2*current_phase1], + (int16_t*) &ul_ch_estimates[aa][frame_parms->N_RB_UL*12*k], + Msc_RS, + 15); + + rotate_cpx_vector((int16_t*) ul_ch2, + &ru2[2*current_phase2], + (int16_t*) &tmp_estimates[0], + Msc_RS, + 15); // Combine the two rotated estimates multadd_complex_vector_real_scalar((int16_t*) &ul_ch_estimates[aa][frame_parms->N_RB_UL*12*k],SCALE,(int16_t*) &ul_ch_estimates[aa][frame_parms->N_RB_UL*12*k],1,Msc_RS); @@ -664,11 +718,11 @@ int32_t lte_srs_channel_estimation(LTE_DL_FRAME_PARMS *frame_parms, //write_output("eNb_rxF.m","rxF",&eNb_common_vars->rxdataF[0][aa][2*frame_parms->ofdm_symbol_size*symbol],2*(frame_parms->ofdm_symbol_size),2,1); //write_output("eNb_srs.m","srs_eNb",eNb_common_vars->srs,(frame_parms->ofdm_symbol_size),1,1); - mult_cpx_vector_norep((int16_t*) &eNb_common_vars->rxdataF[eNb_id][aa][2*frame_parms->ofdm_symbol_size*symbol], - (int16_t*) eNb_srs_vars->srs, - (int16_t*) eNb_srs_vars->srs_ch_estimates[eNb_id][aa], - frame_parms->ofdm_symbol_size, - 15); + mult_cpx_conj_vector((int16_t*) &eNb_common_vars->rxdataF[eNb_id][aa][2*frame_parms->ofdm_symbol_size*symbol], + (int16_t*) eNb_srs_vars->srs, + (int16_t*) eNb_srs_vars->srs_ch_estimates[eNb_id][aa], + frame_parms->ofdm_symbol_size, + 15); //msg("SRS channel estimation cmult out\n"); #ifdef USER_MODE @@ -695,6 +749,7 @@ int16_t lte_ul_freq_offset_estimation(LTE_DL_FRAME_PARMS *frame_parms, uint16_t nb_rb) { +#if defined(__x86_64__) || defined(__i386__) int k, rb; int a_idx = 64; uint8_t conj_flag = 0; @@ -830,4 +885,7 @@ int16_t lte_ul_freq_offset_estimation(LTE_DL_FRAME_PARMS *frame_parms, phase_idx = -phase_idx; return(phase_idx); +#elif defined(__arm__) + return(0); +#endif } diff --git a/openair1/PHY/LTE_REFSIG/lte_dl_cell_spec.c b/openair1/PHY/LTE_REFSIG/lte_dl_cell_spec.c index 4704e76b23..1a09c83372 100644 --- a/openair1/PHY/LTE_REFSIG/lte_dl_cell_spec.c +++ b/openair1/PHY/LTE_REFSIG/lte_dl_cell_spec.c @@ -94,9 +94,9 @@ int lte_dl_cell_spec_SS(PHY_VARS_eNB *phy_vars_eNB, output[k] = qpsk[(phy_vars_eNB->lte_gold_table[Ns][l][mprime_dword]>>(2*mprime_qpsk_symb))&3]; //output[k] = (lte_gold_table[eNB_offset][Ns][l][mprime_dword]>>(2*mprime_qpsk_symb))&3; #ifdef DEBUG_DL_CELL_SPEC - debug_msg("Ns %d, l %d, m %d,mprime_dword %d, mprime_qpsk_symbol %d\n", + msg("Ns %d, l %d, m %d,mprime_dword %d, mprime_qpsk_symbol %d\n", Ns,l,m,mprime_dword,mprime_qpsk_symb); - debug_msg("index = %d (k %d)\n",(phy_vars_eNB->lte_gold_table[Ns][l][mprime_dword]>>(2*mprime_qpsk_symb))&3,k); + msg("index = %d (k %d)\n",(phy_vars_eNB->lte_gold_table[Ns][l][mprime_dword]>>(2*mprime_qpsk_symb))&3,k); #endif mprime++; diff --git a/openair1/PHY/LTE_TRANSPORT/dci.c b/openair1/PHY/LTE_TRANSPORT/dci.c index f46e43a58a..2e485e66fa 100644 --- a/openair1/PHY/LTE_TRANSPORT/dci.c +++ b/openair1/PHY/LTE_TRANSPORT/dci.c @@ -560,10 +560,10 @@ int32_t pdcch_qpsk_qpsk_llr(LTE_DL_FRAME_PARMS *frame_parms, uint8_t symbol) { - __m128i *rxF=(__m128i*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; - __m128i *rxF_i=(__m128i*)&rxdataF_comp_i[0][(symbol*frame_parms->N_RB_DL*12)]; - __m128i *rho=(__m128i*)&rho_i[0][(symbol*frame_parms->N_RB_DL*12)]; - __m128i *llr128; + int16_t 
*rxF=(int16_t*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; + int16_t *rxF_i=(int16_t*)&rxdataF_comp_i[0][(symbol*frame_parms->N_RB_DL*12)]; + int16_t *rho=(int16_t*)&rho_i[0][(symbol*frame_parms->N_RB_DL*12)]; + int16_t *llr128; int32_t i; char *pdcch_llr8; int16_t *pdcch_llr; @@ -572,17 +572,17 @@ int32_t pdcch_qpsk_qpsk_llr(LTE_DL_FRAME_PARMS *frame_parms, // printf("dlsch_qpsk_qpsk: symbol %d\n",symbol); - llr128 = (__m128i*)pdcch_llr; + llr128 = (int16_t*)pdcch_llr; if (!llr128) { msg("dlsch_qpsk_qpsk_llr: llr is null, symbol %d\n",symbol); return -1; } - qpsk_qpsk((int16_t *)rxF, - (int16_t *)rxF_i, - (int16_t *)llr128, - (int16_t *)rho, + qpsk_qpsk(rxF, + rxF_i, + llr128, + rho, frame_parms->N_RB_DL*12); //prepare for Viterbi which accepts 8 bit, but prefers 4 bit, soft input. @@ -639,7 +639,7 @@ int32_t pdcch_llr(LTE_DL_FRAME_PARMS *frame_parms, } -__m128i avg128P; +//__m128i avg128P; //compute average channel_level on each (TX,RX) antenna pair void pdcch_channel_level(int32_t **dl_ch_estimates_ext, @@ -650,21 +650,31 @@ void pdcch_channel_level(int32_t **dl_ch_estimates_ext, int16_t rb; uint8_t aatx,aarx; +#if defined(__x86_64__) || defined(__i386__) __m128i *dl_ch128; - - + __m128i avg128P; +#elif defined(__arm__) + int16x8_t *dl_ch128; + int32x4_t avg128P; +#endif for (aatx=0; aatx<frame_parms->nb_antennas_tx_eNB; aatx++) for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) { //clear average level +#if defined(__x86_64__) || defined(__i386__) avg128P = _mm_setzero_si128(); dl_ch128=(__m128i *)&dl_ch_estimates_ext[(aatx<<1)+aarx][frame_parms->N_RB_DL*12]; +#elif defined(__arm__) +#endif for (rb=0; rb<nb_rb; rb++) { +#if defined(__x86_64__) || defined(__i386__) avg128P = _mm_add_epi32(avg128P,_mm_madd_epi16(dl_ch128[0],dl_ch128[0])); avg128P = _mm_add_epi32(avg128P,_mm_madd_epi16(dl_ch128[1],dl_ch128[1])); avg128P = _mm_add_epi32(avg128P,_mm_madd_epi16(dl_ch128[2],dl_ch128[2])); +#elif defined(__arm__) +#endif dl_ch128+=3; /* if (rb==0) { @@ -684,13 +694,18 @@ void pdcch_channel_level(int32_t **dl_ch_estimates_ext, // msg("Channel level : %d\n",avg[(aatx<<1)+aarx]); } +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); +#endif } +#if defined(__x86_64__) || defined(__i386__) __m128i mmtmpPD0,mmtmpPD1,mmtmpPD2,mmtmpPD3; +#elif defined(__arm__) +#endif void pdcch_dual_stream_correlation(LTE_DL_FRAME_PARMS *frame_parms, uint8_t symbol, int32_t **dl_ch_estimates_ext, @@ -700,7 +715,11 @@ void pdcch_dual_stream_correlation(LTE_DL_FRAME_PARMS *frame_parms, { uint16_t rb; +#if defined(__x86_64__) || defined(__i386__) __m128i *dl_ch128,*dl_ch128i,*dl_ch_rho128; +#elif defined(__arm__) + +#endif uint8_t aarx; // printf("dlsch_dual_stream_correlation: symbol %d\n",symbol); @@ -708,13 +727,18 @@ void pdcch_dual_stream_correlation(LTE_DL_FRAME_PARMS *frame_parms, for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) { +#if defined(__x86_64__) || defined(__i386__) dl_ch128 = (__m128i *)&dl_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*12]; dl_ch128i = (__m128i *)&dl_ch_estimates_ext_i[aarx][symbol*frame_parms->N_RB_DL*12]; dl_ch_rho128 = (__m128i *)&dl_ch_rho_ext[aarx][symbol*frame_parms->N_RB_DL*12]; +#elif defined(__arm__) + +#endif for (rb=0; rb<frame_parms->N_RB_DL; rb++) { // multiply by conjugated channel +#if defined(__x86_64__) || defined(__i386__) mmtmpPD0 = _mm_madd_epi16(dl_ch128[0],dl_ch128i[0]); // print_ints("re",&mmtmpPD0); @@ -779,13 +803,16 @@ void pdcch_dual_stream_correlation(LTE_DL_FRAME_PARMS *frame_parms, dl_ch128i+=3; dl_ch_rho128+=3; - }
- } +#elif defined(__arm__) +#endif + } + } +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); - +#endif } @@ -800,44 +827,78 @@ void pdcch_detection_mrc_i(LTE_DL_FRAME_PARMS *frame_parms, uint8_t aatx; +#if defined(__x86_64__) || defined(__i386__) __m128i *rxdataF_comp128_0,*rxdataF_comp128_1,*rxdataF_comp128_i0,*rxdataF_comp128_i1,*rho128_0,*rho128_1,*rho128_i0,*rho128_i1; +#elif defined(__arm__) + int16x8_t *rxdataF_comp128_0,*rxdataF_comp128_1,*rxdataF_comp128_i0,*rxdataF_comp128_i1,*rho128_0,*rho128_1,*rho128_i0,*rho128_i1; +#endif int32_t i; if (frame_parms->nb_antennas_rx>1) { for (aatx=0; aatx<frame_parms->nb_antennas_tx_eNB; aatx++) { //if (frame_parms->mode1_flag && (aatx>0)) break; +#if defined(__x86_64__) || defined(__i386__) rxdataF_comp128_0 = (__m128i *)&rxdataF_comp[(aatx<<1)][symbol*frame_parms->N_RB_DL*12]; rxdataF_comp128_1 = (__m128i *)&rxdataF_comp[(aatx<<1)+1][symbol*frame_parms->N_RB_DL*12]; - +#elif defined(__arm__) + rxdataF_comp128_0 = (int16x8_t *)&rxdataF_comp[(aatx<<1)][symbol*frame_parms->N_RB_DL*12]; + rxdataF_comp128_1 = (int16x8_t *)&rxdataF_comp[(aatx<<1)+1][symbol*frame_parms->N_RB_DL*12]; +#endif // MRC on each re of rb on MF output for (i=0; i<frame_parms->N_RB_DL*3; i++) { +#if defined(__x86_64__) || defined(__i386__) rxdataF_comp128_0[i] = _mm_adds_epi16(_mm_srai_epi16(rxdataF_comp128_0[i],1),_mm_srai_epi16(rxdataF_comp128_1[i],1)); +#elif defined(__arm__) + rxdataF_comp128_0[i] = vhaddq_s16(rxdataF_comp128_0[i],rxdataF_comp128_1[i]); +#endif } } +#if defined(__x86_64__) || defined(__i386__) rho128_0 = (__m128i *) &rho[0][symbol*frame_parms->N_RB_DL*12]; rho128_1 = (__m128i *) &rho[1][symbol*frame_parms->N_RB_DL*12]; - +#elif defined(__arm__) + rho128_0 = (int16x8_t *) &rho[0][symbol*frame_parms->N_RB_DL*12]; + rho128_1 = (int16x8_t *) &rho[1][symbol*frame_parms->N_RB_DL*12]; +#endif for (i=0; i<frame_parms->N_RB_DL*3; i++) { +#if defined(__x86_64__) || defined(__i386__) rho128_0[i] = _mm_adds_epi16(_mm_srai_epi16(rho128_0[i],1),_mm_srai_epi16(rho128_1[i],1)); +#elif defined(__arm__) + rho128_0[i] = vhaddq_s16(rho128_0[i],rho128_1[i]); +#endif } +#if defined(__x86_64__) || defined(__i386__) rho128_i0 = (__m128i *) &rho_i[0][symbol*frame_parms->N_RB_DL*12]; rho128_i1 = (__m128i *) &rho_i[1][symbol*frame_parms->N_RB_DL*12]; rxdataF_comp128_i0 = (__m128i *)&rxdataF_comp_i[0][symbol*frame_parms->N_RB_DL*12]; rxdataF_comp128_i1 = (__m128i *)&rxdataF_comp_i[1][symbol*frame_parms->N_RB_DL*12]; +#elif defined(__arm__) + rho128_i0 = (int16x8_t*) &rho_i[0][symbol*frame_parms->N_RB_DL*12]; + rho128_i1 = (int16x8_t*) &rho_i[1][symbol*frame_parms->N_RB_DL*12]; + rxdataF_comp128_i0 = (int16x8_t *)&rxdataF_comp_i[0][symbol*frame_parms->N_RB_DL*12]; + rxdataF_comp128_i1 = (int16x8_t *)&rxdataF_comp_i[1][symbol*frame_parms->N_RB_DL*12]; +#endif // MRC on each re of rb on MF and rho for (i=0; i<frame_parms->N_RB_DL*3; i++) { +#if defined(__x86_64__) || defined(__i386__) rxdataF_comp128_i0[i] = _mm_adds_epi16(_mm_srai_epi16(rxdataF_comp128_i0[i],1),_mm_srai_epi16(rxdataF_comp128_i1[i],1)); rho128_i0[i] = _mm_adds_epi16(_mm_srai_epi16(rho128_i0[i],1),_mm_srai_epi16(rho128_i1[i],1)); +#elif defined(__arm__) + rxdataF_comp128_i0[i] = vhaddq_s16(rxdataF_comp128_i0[i],rxdataF_comp128_i1[i]); + rho128_i0[i] = vhaddq_s16(rho128_i0[i],rho128_i1[i]); + +#endif } } +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); - +#endif } @@ -1056,10 +1117,6 @@ void pdcch_extract_rbs_single(int32_t **rxdataF, } } } - - _mm_empty(); - _m_empty(); - } 
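
(Sketch, not part of the patch: the MRC hunks above collapse the SSE halve-then-saturating-add pair into NEON's single halving add. A scalar view of what both branches compute per 16-bit lane, assuming 16-bit I/Q samples; the two forms can differ by at most one LSB.)

#include <stdint.h>

/* NEON vhaddq_s16 semantics: (a+b)>>1 per lane, computed on a widened sum. */
static inline int16_t mrc_combine_neon(int16_t a, int16_t b)
{
  return (int16_t)(((int32_t)a + (int32_t)b) >> 1);
}

/* SSE path: halve each input first, then add with saturation. */
static inline int16_t mrc_combine_sse(int16_t a, int16_t b)
{
  return (int16_t)((a >> 1) + (b >> 1));
}
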
void pdcch_extract_rbs_dual(int32_t **rxdataF,
@@ -1310,11 +1367,6 @@ void pdcch_extract_rbs_dual(int32_t **rxdataF,
       }
     }
   }
-
-  _mm_empty();
-  _m_empty();
-
-
 }
@@ -1328,8 +1380,12 @@ void pdcch_channel_compensation(int32_t **rxdataF_ext,
 {
 
   uint16_t rb;
+#if defined(__x86_64__) || defined(__i386__)
   __m128i *dl_ch128,*rxdataF128,*rxdataF_comp128;
   __m128i *dl_ch128_2, *rho128;
+#elif defined(__arm__)
+
+#endif
   uint8_t aatx,aarx,pilots=0;
@@ -1347,13 +1403,18 @@ void pdcch_channel_compensation(int32_t **rxdataF_ext,
 
     for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) {
 
+#if defined(__x86_64__) || defined(__i386__)
       dl_ch128          = (__m128i *)&dl_ch_estimates_ext[(aatx<<1)+aarx][symbol*frame_parms->N_RB_DL*12];
       rxdataF128        = (__m128i *)&rxdataF_ext[aarx][symbol*frame_parms->N_RB_DL*12];
       rxdataF_comp128   = (__m128i *)&rxdataF_comp[(aatx<<1)+aarx][symbol*frame_parms->N_RB_DL*12];
 
+#elif defined(__arm__)
+
+#endif
       for (rb=0; rb<frame_parms->N_RB_DL; rb++) {
 
+#if defined(__x86_64__) || defined(__i386__)
         // multiply by conjugated channel
         mmtmpPD0 = _mm_madd_epi16(dl_ch128[0],rxdataF128[0]);
         //  print_ints("re",&mmtmpPD0);
@@ -1426,6 +1487,9 @@ void pdcch_channel_compensation(int32_t **rxdataF_ext,
         rxdataF128+=2;
         rxdataF_comp128+=2;
       }
+#elif defined(__arm__)
+
+#endif
     }
   }
 }
@@ -1434,11 +1498,18 @@ void pdcch_channel_compensation(int32_t **rxdataF_ext,
   if (rho) {
 
     for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) {
+
+#if defined(__x86_64__) || defined(__i386__)
       rho128 = (__m128i *)&rho[aarx][symbol*frame_parms->N_RB_DL*12];
       dl_ch128 = (__m128i *)&dl_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*12];
       dl_ch128_2 = (__m128i *)&dl_ch_estimates_ext[2+aarx][symbol*frame_parms->N_RB_DL*12];
+#elif defined(__arm__)
+
+#endif
       for (rb=0; rb<frame_parms->N_RB_DL; rb++) {
+#if defined(__x86_64__) || defined(__i386__)
+
         // multiply by conjugated channel
         mmtmpPD0 = _mm_madd_epi16(dl_ch128[0],dl_ch128_2[0]);
         // print_ints("re",&mmtmpD0);
@@ -1504,14 +1575,19 @@ void pdcch_channel_compensation(int32_t **rxdataF_ext,
         dl_ch128_2+=3;
         rho128+=3;
+#elif defined(__arm__)
+
+
+#endif
       }
     }
   }
 
+#if defined(__x86_64__) || defined(__i386__)
   _mm_empty();
   _m_empty();
-
+#endif
 }
 
 
void pdcch_detection_mrc(LTE_DL_FRAME_PARMS *frame_parms,
@@ -1521,23 +1597,37 @@
 
   uint8_t aatx;
 
+#if defined(__x86_64__) || defined(__i386__)
   __m128i *rxdataF_comp128_0,*rxdataF_comp128_1;
+#elif defined(__arm__)
+  int16x8_t *rxdataF_comp128_0,*rxdataF_comp128_1;
+#endif
   int32_t i;
 
   if (frame_parms->nb_antennas_rx>1) {
     for (aatx=0; aatx<frame_parms->nb_antennas_tx_eNB; aatx++) {
+#if defined(__x86_64__) || defined(__i386__)
      rxdataF_comp128_0   = (__m128i *)&rxdataF_comp[(aatx<<1)][symbol*frame_parms->N_RB_DL*12];
      rxdataF_comp128_1   = (__m128i *)&rxdataF_comp[(aatx<<1)+1][symbol*frame_parms->N_RB_DL*12];
-
+#elif defined(__arm__)
+      rxdataF_comp128_0   = (int16x8_t *)&rxdataF_comp[(aatx<<1)][symbol*frame_parms->N_RB_DL*12];
+      rxdataF_comp128_1   = (int16x8_t *)&rxdataF_comp[(aatx<<1)+1][symbol*frame_parms->N_RB_DL*12];
+#endif
      // MRC on each re of rb
      for (i=0; i<frame_parms->N_RB_DL*3; i++) {
+#if defined(__x86_64__) || defined(__i386__)
        rxdataF_comp128_0[i] = _mm_adds_epi16(_mm_srai_epi16(rxdataF_comp128_0[i],1),_mm_srai_epi16(rxdataF_comp128_1[i],1));
+#elif defined(__arm__)
+       rxdataF_comp128_0[i] = vhaddq_s16(rxdataF_comp128_0[i],rxdataF_comp128_1[i]);
+#endif
      }
    }
  }
 
+#if defined(__x86_64__) || defined(__i386__)
  _mm_empty();
  _m_empty();
+#endif
 
}
 
@@ -1593,8 +1683,6 @@ void
pdcch_alamouti(LTE_DL_FRAME_PARMS *frame_parms, } } - _mm_empty(); - _m_empty(); } @@ -2008,7 +2096,7 @@ uint8_t generate_dci_top(uint8_t num_ue_spec_dci, //memset(e, 2, DCI_BITS_MAX); // here we interpret NIL as a random QPSK sequence. That makes power estimation easier. for (i=0; i<DCI_BITS_MAX; i++) - e[i]=taus()&1; + e[i]=0;//taus()&1; e_ptr = e; diff --git a/openair1/PHY/LTE_TRANSPORT/defs.h b/openair1/PHY/LTE_TRANSPORT/defs.h index 847beb5b09..444a2b1262 100644 --- a/openair1/PHY/LTE_TRANSPORT/defs.h +++ b/openair1/PHY/LTE_TRANSPORT/defs.h @@ -139,7 +139,7 @@ typedef struct { /// Concatenated "e"-sequences (for definition see 36-212 V8.6 2009-03, p.17-18) uint8_t e[MAX_NUM_CHANNEL_BITS]; /// Turbo-code outputs (36-212 V8.6 2009-03, p.12 - uint8_t d[MAX_NUM_DLSCH_SEGMENTS][(96+3+(3*6144))]; + uint8_t *d[MAX_NUM_DLSCH_SEGMENTS];//[(96+3+(3*6144))]; /// Sub-block interleaver outputs (36-212 V8.6 2009-03, p.16-17) uint8_t w[MAX_NUM_DLSCH_SEGMENTS][3*6144]; /// Number of code segments (for definition see 36-212 V8.6 2009-03, p.9) diff --git a/openair1/PHY/LTE_TRANSPORT/dlsch_coding.c b/openair1/PHY/LTE_TRANSPORT/dlsch_coding.c index 5dd2f9b5f9..1a93020447 100644 --- a/openair1/PHY/LTE_TRANSPORT/dlsch_coding.c +++ b/openair1/PHY/LTE_TRANSPORT/dlsch_coding.c @@ -100,6 +100,10 @@ void free_eNB_dlsch(LTE_eNB_DLSCH_t *dlsch) free16(dlsch->harq_processes[i]->c[r],((r==0)?8:0) + 3+768); dlsch->harq_processes[i]->c[r] = NULL; } + if (dlsch->harq_processes[i]->d[r]) { + free16(dlsch->harq_processes[i]->d[r],(96+3+(3*6144))); + dlsch->harq_processes[i]->d[r] = NULL; + } } free16(dlsch->harq_processes[i],sizeof(LTE_DL_eNB_HARQ_t)); @@ -168,14 +172,20 @@ LTE_eNB_DLSCH_t *new_eNB_dlsch(unsigned char Kmimo,unsigned char Mdlharq,unsigne if (abstraction_flag==0) { for (r=0; r<MAX_NUM_DLSCH_SEGMENTS/bw_scaling; r++) { // account for filler in first segment and CRCs for multiple segment case - dlsch->harq_processes[i]->c[r] = (unsigned char*)malloc16(((r==0)?8:0) + 3+ 768); - + dlsch->harq_processes[i]->c[r] = (uint8_t*)malloc16(((r==0)?8:0) + 3+ 768); + dlsch->harq_processes[i]->d[r] = (uint8_t*)malloc16((96+3+(3*6144))); if (dlsch->harq_processes[i]->c[r]) { bzero(dlsch->harq_processes[i]->c[r],((r==0)?8:0) + 3+ 768); } else { msg("Can't get c\n"); exit_flag=2; } + if (dlsch->harq_processes[i]->d[r]) { + bzero(dlsch->harq_processes[i]->d[r],(96+3+(3*6144))); + } else { + msg("Can't get d\n"); + exit_flag=2; + } } } } else { @@ -190,8 +200,10 @@ LTE_eNB_DLSCH_t *new_eNB_dlsch(unsigned char Kmimo,unsigned char Mdlharq,unsigne if (abstraction_flag==0) { for (j=0; j<96; j++) - for (r=0; r<MAX_NUM_DLSCH_SEGMENTS; r++) + for (r=0; r<MAX_NUM_DLSCH_SEGMENTS/bw_scaling; r++) { + // printf("dlsch->harq_processes[%d]->d[%d] %p\n",i,r,dlsch->harq_processes[i]->d[r]); dlsch->harq_processes[i]->d[r][j] = LTE_NULL; + } } } diff --git a/openair1/PHY/LTE_TRANSPORT/dlsch_demodulation.c b/openair1/PHY/LTE_TRANSPORT/dlsch_demodulation.c index 61d91cc217..cce09acd41 100644 --- a/openair1/PHY/LTE_TRANSPORT/dlsch_demodulation.c +++ b/openair1/PHY/LTE_TRANSPORT/dlsch_demodulation.c @@ -64,18 +64,18 @@ unsigned char offset_mumimo_llr_drange[29][3]={{8,8,8},{7,7,7},{7,7,7},{7,7,7},{ {5,5,4},{5,5,5},{5,5,5},{3,3,3},{2,2,2},{2,2,2},{2,2,2}, // 16-QAM {2,2,1},{3,3,3},{3,3,3},{3,3,1},{2,2,2},{2,2,2},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}}; //64-QAM */ -/* -//first optimization try -unsigned char offset_mumimo_llr_drange[29][3]={{7, 8, 7},{6, 6, 7},{6, 6, 7},{6, 6, 6},{5, 6, 6},{5, 5, 6},{5, 5, 6},{4, 5, 4},{4, 3, 4},{3, 2, 
2},{6, 5, 5},{5, 4, 4},{5, 5, 4},{3, 3, 2},{2, 2, 1},{2, 1, 1},{2, 2, 2},{3, 3, 3},{3, 3, 2},{3, 3, 2},{3, 2, 1},{2, 2, 2},{2, 2, 2},{0, 0, 0},{0, 0, 0},{0, 0, 0},{0, 0, 0},{0, 0, 0}}; -*/ -//second optimization try -/* - unsigned char offset_mumimo_llr_drange[29][3]={{5, 8, 7},{4, 6, 8},{3, 6, 7},{7, 7, 6},{4, 7, 8},{4, 7, 4},{6, 6, 6},{3, 6, 6},{3, 6, 6},{1, 3, 4},{1, 1, 0},{3, 3, 2},{3, 4, 1},{4, 0, 1},{4, 2, 2},{3, 1, 2},{2, 1, 0},{2, 1, 1},{1, 0, 1},{1, 0, 1},{0, 0, 0},{1, 0, 0},{0, 0, 0},{0, 1, 0},{1, 0, 0},{0, 0, 0},{0, 0, 0},{0, 0, 0},{0, 0, 0}}; w -*/ + /* + //first optimization try + unsigned char offset_mumimo_llr_drange[29][3]={{7, 8, 7},{6, 6, 7},{6, 6, 7},{6, 6, 6},{5, 6, 6},{5, 5, 6},{5, 5, 6},{4, 5, 4},{4, 3, 4},{3, 2, 2},{6, 5, 5},{5, 4, 4},{5, 5, 4},{3, 3, 2},{2, 2, 1},{2, 1, 1},{2, 2, 2},{3, 3, 3},{3, 3, 2},{3, 3, 2},{3, 2, 1},{2, 2, 2},{2, 2, 2},{0, 0, 0},{0, 0, 0},{0, 0, 0},{0, 0, 0},{0, 0, 0}}; + */ + //second optimization try + /* + unsigned char offset_mumimo_llr_drange[29][3]={{5, 8, 7},{4, 6, 8},{3, 6, 7},{7, 7, 6},{4, 7, 8},{4, 7, 4},{6, 6, 6},{3, 6, 6},{3, 6, 6},{1, 3, 4},{1, 1, 0},{3, 3, 2},{3, 4, 1},{4, 0, 1},{4, 2, 2},{3, 1, 2},{2, 1, 0},{2, 1, 1},{1, 0, 1},{1, 0, 1},{0, 0, 0},{1, 0, 0},{0, 0, 0},{0, 1, 0},{1, 0, 0},{0, 0, 0},{0, 0, 0},{0, 0, 0},{0, 0, 0}}; w + */ unsigned char offset_mumimo_llr_drange[29][3]= {{0, 6, 5},{0, 4, 5},{0, 4, 5},{0, 5, 4},{0, 5, 6},{0, 5, 3},{0, 4, 4},{0, 4, 4},{0, 3, 3},{0, 1, 2},{1, 1, 0},{1, 3, 2},{3, 4, 1},{2, 0, 0},{2, 2, 2},{1, 1, 1},{2, 1, 0},{2, 1, 1},{1, 0, 1},{1, 0, 1},{0, 0, 0},{1, 0, 0},{0, 0, 0},{0, 1, 0},{1, 0, 0},{0, 0, 0},{0, 0, 0},{0, 0, 0},{0, 0, 0}}; -extern void print_shorts(char *s,__m128i *x); +extern void print_shorts(char *s,int16_t *x); int rx_pdsch(PHY_VARS_UE *phy_vars_ue, PDSCH_t type, @@ -645,11 +645,11 @@ int rx_pdsch(PHY_VARS_UE *phy_vars_ue, if (get_Qm(dlsch1_harq->mcs) == 2) { /* dlsch_qpsk_llr(frame_parms, - lte_ue_pdsch_vars[eNB_id]->rxdataF_comp0, - lte_ue_pdsch_vars[eNB_id]->llr[0], - symbol,first_symbol_flag,nb_rb, - adjust_G2(frame_parms,dlsch0_harq->rb_alloc,2,subframe,symbol), - lte_ue_pdsch_vars[eNB_id]->llr128); + lte_ue_pdsch_vars[eNB_id]->rxdataF_comp0, + lte_ue_pdsch_vars[eNB_id]->llr[0], + symbol,first_symbol_flag,nb_rb, + adjust_G2(frame_parms,dlsch0_harq->rb_alloc,2,subframe,symbol), + lte_ue_pdsch_vars[eNB_id]->llr128); */ dlsch_qpsk_qpsk_llr(frame_parms, lte_ue_pdsch_vars[eNB_id]->rxdataF_comp0, @@ -868,7 +868,9 @@ void dlsch_channel_compensation(int **rxdataF_ext, dl_ch_mag128b[0] = dl_ch_mag128[0]; dl_ch_mag128[0] = _mm_mulhi_epi16(dl_ch_mag128[0],QAM_amp128); dl_ch_mag128[0] = _mm_slli_epi16(dl_ch_mag128[0],1); - + //print_ints("Re(ch):",(int16_t*)&mmtmpD0); + //print_shorts("QAM_amp:",(int16_t*)&QAM_amp128); + //print_shorts("mag:",(int16_t*)&dl_ch_mag128[0]); dl_ch_mag128[1] = _mm_unpackhi_epi16(mmtmpD0,mmtmpD0); dl_ch_mag128b[1] = dl_ch_mag128[1]; dl_ch_mag128[1] = _mm_mulhi_epi16(dl_ch_mag128[1],QAM_amp128); @@ -1068,12 +1070,14 @@ void dlsch_channel_compensation(int **rxdataF_ext, unsigned short rb; unsigned char aatx,aarx,symbol_mod,pilots=0; - int16x4_t *dl_ch128,*dl_ch128_2,*rxdataF128,*rho128; - int32x4_t mmtmpD0,mmtmpD1; - int16x8_t *dl_ch_mag128,*dl_ch_mag128b,mmtmpD2,mmtmpD3,*rxdataF_comp128; - int16x4_t QAM_amp128,QAM_amp128b; + int16x4_t *dl_ch128,*dl_ch128_2,*rxdataF128; + int32x4_t mmtmpD0,mmtmpD1,mmtmpD0b,mmtmpD1b; + int16x8_t *dl_ch_mag128,*dl_ch_mag128b,mmtmpD2,mmtmpD3,mmtmpD4; + int16x8_t QAM_amp128,QAM_amp128b; + int16x4x2_t *rxdataF_comp128,*rho128; 
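+  // Note: rxdataF_comp128 and rho128 are int16x4x2_t* because this path
+  // stores vzip_s16() results: the zip re-interleaves the narrowed real and
+  // imaginary parts, so the in-memory layout matches what the SSE path
+  // produces with _mm_packs_epi32().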
int16_t conj[4]__attribute__((aligned(16))) = {1,-1,1,-1}; + int32x4_t output_shift128 = vmovq_n_s32(-(int32_t)output_shift); symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? symbol-(7-frame_parms->Ncp) : symbol; @@ -1081,7 +1085,6 @@ void dlsch_channel_compensation(int **rxdataF_ext, if (frame_parms->mode1_flag==1) { // 10 out of 12 so don't reduce size nb_rb=1+(5*nb_rb/6); } - else { pilots=1; } @@ -1089,177 +1092,177 @@ void dlsch_channel_compensation(int **rxdataF_ext, for (aatx=0; aatx<frame_parms->nb_antennas_tx_eNB; aatx++) { if (mod_order == 4) { - QAM_amp128 = vmov_n_s16(QAM16_n1); // 2/sqrt(10) - QAM_amp128b = vmov_n_s16(0); - + QAM_amp128 = vmovq_n_s16(QAM16_n1); // 2/sqrt(10) + QAM_amp128b = vmovq_n_s16(0); } else if (mod_order == 6) { - QAM_amp128 = vmov_n_s16(QAM64_n1); // - QAM_amp128b = vmov_n_s16(QAM64_n2); + QAM_amp128 = vmovq_n_s16(QAM64_n1); // + QAM_amp128b = vmovq_n_s16(QAM64_n2); } - // printf("comp: rxdataF_comp %p, symbol %d\n",rxdataF_comp[0],symbol); for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) { - - - dl_ch128 = (int16x4_t*)&dl_ch_estimates_ext[(aatx<<1)+aarx][symbol*frame_parms->N_RB_DL*12]; dl_ch_mag128 = (int16x8_t*)&dl_ch_mag[(aatx<<1)+aarx][symbol*frame_parms->N_RB_DL*12]; dl_ch_mag128b = (int16x8_t*)&dl_ch_magb[(aatx<<1)+aarx][symbol*frame_parms->N_RB_DL*12]; rxdataF128 = (int16x4_t*)&rxdataF_ext[aarx][symbol*frame_parms->N_RB_DL*12]; - rxdataF_comp128 = (int16x8_t*)&rxdataF_comp[(aatx<<1)+aarx][symbol*frame_parms->N_RB_DL*12]; - + rxdataF_comp128 = (int16x4x2_t*)&rxdataF_comp[(aatx<<1)+aarx][symbol*frame_parms->N_RB_DL*12]; + for (rb=0; rb<nb_rb; rb++) { - if (mod_order>2) { - // get channel amplitude if not QPSK - mmtmpD0 = vmull_s16(dl_ch128[0], dl_ch128[0]); - // mmtmpD0 = [ch0*ch0,ch1*ch1,ch2*ch2,ch3*ch3]; - mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),-output_shift); - // mmtmpD0 = [ch0*ch0 + ch1*ch1,ch0*ch0 + ch1*ch1,ch2*ch2 + ch3*ch3,ch2*ch2 + ch3*ch3]>>output_shift on 32-bits - mmtmpD1 = vmull_s16(dl_ch128[1], dl_ch128[1]); - mmtmpD1 = vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),-output_shift); - mmtmpD2 = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - // mmtmpD2 = [ch0*ch0 + ch1*ch1,ch0*ch0 + ch1*ch1,ch2*ch2 + ch3*ch3,ch2*ch2 + ch3*ch3,ch4*ch4 + ch5*ch5,ch4*ch4 + ch5*ch5,ch6*ch6 + ch7*ch7,ch6*ch6 + ch7*ch7]>>output_shift on 16-bits - mmtmpD0 = vmull_s16(dl_ch128[2], dl_ch128[2]); - mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),-output_shift); - mmtmpD1 = vmull_s16(dl_ch128[3], dl_ch128[3]); - mmtmpD1 = vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),-output_shift); - mmtmpD3 = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - - if (pilots==0) { - mmtmpD0 = vmull_s16(dl_ch128[4], dl_ch128[4]); - mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),-output_shift); - mmtmpD1 = vmull_s16(dl_ch128[5], dl_ch128[5]); - mmtmpD1 = vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),-output_shift); - mmtmpD4 = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - - - } - - dl_ch_mag128b[0] = vqdmulhq_s16(mmtmpD2,QAM_amp128b); - dl_ch_mag128b[1] = vqdmulhq_s16(mmtmpD3,QAM_amp128b); - dl_ch_mag128[0] = vqdmulhq_s16(mmtmpD2,QAM_amp128); - dl_ch_mag128[1] = vqdmulhq_s16(mmtmpD3,QAM_amp128); - - - if (pilots==0) { - dl_ch_mag128b[2] = vqdmulhq_s16(mmtmpD4,QAM_amp128b); - dl_ch_mag128[2] = vqdmulhq_s16(mmtmpD4,QAM_amp128); - } - } - - mmtmpD0 = vmull_s16(dl_ch128[0], rx_dataF128[0]); - //mmtmpD0 = [Re(ch[0])Re(rx[0]) Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1]) Im(ch[1])Im(ch[1])] - 
mmtmpD1 = vmull_s16(dl_ch128[1], rx_dataF128[1]); - //mmtmpD1 = [Re(ch[2])Re(rx[2]) Im(ch[2])Im(ch[2]) Re(ch[3])Re(rx[3]) Im(ch[3])Im(ch[3])] - mmtmpD0 = vpadd_s32(mmtmpD0,mmtmpD1); - //mmtmpD0 = [Re(ch[0])Re(rx[0])+Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1])+Im(ch[1])Im(ch[1]) Re(ch[2])Re(rx[2])+Im(ch[2])Im(ch[2]) Re(ch[3])Re(rx[3])+Im(ch[3])Im(ch[3])] - - mmtmpD0 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch128[0],*(int16x4_t*)conj)), rx_dataF128[0]); - //mmtmpD0 = [-Im(ch[0])Re(rx[0]) Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1]) Re(ch[1])Im(rx[1])] - mmtmpD1 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch128[1],*(int16x4_t*)conj)), rx_dataF128[1]); - //mmtmpD0 = [-Im(ch[2])Re(rx[2]) Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3]) Re(ch[3])Im(rx[3])] - mmtmpD1 = vpadd_s32(mmtmpD0,mmtmpD1); - //mmtmpD1 = [-Im(ch[0])Re(rx[0])+Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1])+Re(ch[1])Im(rx[1]) -Im(ch[2])Re(rx[2])+Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3])+Re(ch[3])Im(rx[3])] - - mmtmpD0 = vqshlq_s32(mmtmpD0,-output_shift); - mmtmpD1 = vqshlq_s32(mmtmpD1,-output_shift); - rxdataF_comp128[0] = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - - mmtmpD0 = vmull_s16(dl_ch128[2], rx_dataF128[2]); - mmtmpD1 = vmull_s16(dl_ch128[3], rx_dataF128[3]); - mmtmpD0 = vpadd_s32(mmtmpD0,mmtmpD1); - mmtmpD0 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch128[2],*(int16x4_t*)conj)), rx_dataF128[2]); - mmtmpD1 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch128[3],*(int16x4_t*)conj)), rx_dataF128[3]); - mmtmpD1 = vpadd_s32(mmtmpD0,mmtmpD1); - - mmtmpD0 = vqshlq_s32(mmtmpD0,-output_shift); - mmtmpD1 = vqshlq_s32(mmtmpD1,-output_shift); - rxdataF_comp128[1] = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - - if (pilots==0) { - mmtmpD0 = vmull_s16(dl_ch128[4], rx_dataF128[4]); - mmtmpD1 = vmull_s16(dl_ch128[5], rx_dataF128[5]); - mmtmpD0 = vpadd_s32(mmtmpD0,mmtmpD1); - mmtmpD0 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch128[4],*(int16x4_t*)conj)), rx_dataF128[4]); - mmtmpD1 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch128[5],*(int16x4_t*)conj)), rx_dataF128[5]); - mmtmpD1 = vpadd_s32(mmtmpD0,mmtmpD1); - - mmtmpD0 = vqshlq_s32(mmtmpD0,-output_shift); - mmtmpD1 = vqshlq_s32(mmtmpD1,-output_shift); - rxdataF_comp128[2] = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - - - dl_ch128+=6; - dl_ch_mag128+=3; - dl_ch_mag128b+=3; - rxdataF128+=6; - rxdataF_comp128+=3; - - } else { // we have a smaller PDSCH in symbols with pilots so skip last group of 4 REs and increment less - dl_ch128+=4; - dl_ch_mag128+=2; - dl_ch_mag128b+=2; - rxdataF128+=4; - rxdataF_comp128+=2; - } + if (mod_order>2) { + // get channel amplitude if not QPSK + mmtmpD0 = vmull_s16(dl_ch128[0], dl_ch128[0]); + // mmtmpD0 = [ch0*ch0,ch1*ch1,ch2*ch2,ch3*ch3]; + mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),output_shift128); + // mmtmpD0 = [ch0*ch0 + ch1*ch1,ch0*ch0 + ch1*ch1,ch2*ch2 + ch3*ch3,ch2*ch2 + ch3*ch3]>>output_shift128 on 32-bits + mmtmpD1 = vmull_s16(dl_ch128[1], dl_ch128[1]); + mmtmpD1 = vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),output_shift128); + mmtmpD2 = vcombine_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1)); + // mmtmpD2 = [ch0*ch0 + ch1*ch1,ch0*ch0 + ch1*ch1,ch2*ch2 + ch3*ch3,ch2*ch2 + ch3*ch3,ch4*ch4 + ch5*ch5,ch4*ch4 + ch5*ch5,ch6*ch6 + ch7*ch7,ch6*ch6 + ch7*ch7]>>output_shift128 on 16-bits + mmtmpD0 = vmull_s16(dl_ch128[2], dl_ch128[2]); + mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),output_shift128); + mmtmpD1 = vmull_s16(dl_ch128[3], dl_ch128[3]); + mmtmpD1 = vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),output_shift128); + mmtmpD3 = 
vcombine_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1)); + if (pilots==0) { + mmtmpD0 = vmull_s16(dl_ch128[4], dl_ch128[4]); + mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),output_shift128); + mmtmpD1 = vmull_s16(dl_ch128[5], dl_ch128[5]); + mmtmpD1 = vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),output_shift128); + mmtmpD4 = vcombine_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1)); + } + + dl_ch_mag128b[0] = vqdmulhq_s16(mmtmpD2,QAM_amp128b); + dl_ch_mag128b[1] = vqdmulhq_s16(mmtmpD3,QAM_amp128b); + dl_ch_mag128[0] = vqdmulhq_s16(mmtmpD2,QAM_amp128); + dl_ch_mag128[1] = vqdmulhq_s16(mmtmpD3,QAM_amp128); + + if (pilots==0) { + dl_ch_mag128b[2] = vqdmulhq_s16(mmtmpD4,QAM_amp128b); + dl_ch_mag128[2] = vqdmulhq_s16(mmtmpD4,QAM_amp128); + } + } + + mmtmpD0 = vmull_s16(dl_ch128[0], rxdataF128[0]); + //mmtmpD0 = [Re(ch[0])Re(rx[0]) Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1]) Im(ch[1])Im(ch[1])] + mmtmpD1 = vmull_s16(dl_ch128[1], rxdataF128[1]); + //mmtmpD1 = [Re(ch[2])Re(rx[2]) Im(ch[2])Im(ch[2]) Re(ch[3])Re(rx[3]) Im(ch[3])Im(ch[3])] + mmtmpD0 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0),vget_high_s32(mmtmpD0)), + vpadd_s32(vget_low_s32(mmtmpD1),vget_high_s32(mmtmpD1))); + //mmtmpD0 = [Re(ch[0])Re(rx[0])+Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1])+Im(ch[1])Im(ch[1]) Re(ch[2])Re(rx[2])+Im(ch[2])Im(ch[2]) Re(ch[3])Re(rx[3])+Im(ch[3])Im(ch[3])] + + mmtmpD0b = vmull_s16(vrev32_s16(vmul_s16(dl_ch128[0],*(int16x4_t*)conj)), rxdataF128[0]); + //mmtmpD0 = [-Im(ch[0])Re(rx[0]) Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1]) Re(ch[1])Im(rx[1])] + mmtmpD1b = vmull_s16(vrev32_s16(vmul_s16(dl_ch128[1],*(int16x4_t*)conj)), rxdataF128[1]); + //mmtmpD0 = [-Im(ch[2])Re(rx[2]) Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3]) Re(ch[3])Im(rx[3])] + mmtmpD1 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0b),vget_high_s32(mmtmpD0b)), + vpadd_s32(vget_low_s32(mmtmpD1b),vget_high_s32(mmtmpD1b))); + //mmtmpD1 = [-Im(ch[0])Re(rx[0])+Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1])+Re(ch[1])Im(rx[1]) -Im(ch[2])Re(rx[2])+Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3])+Re(ch[3])Im(rx[3])] + + mmtmpD0 = vqshlq_s32(mmtmpD0,output_shift128); + mmtmpD1 = vqshlq_s32(mmtmpD1,output_shift128); + rxdataF_comp128[0] = vzip_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1)); + mmtmpD0 = vmull_s16(dl_ch128[2], rxdataF128[2]); + mmtmpD1 = vmull_s16(dl_ch128[3], rxdataF128[3]); + mmtmpD0 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0),vget_high_s32(mmtmpD0)), + vpadd_s32(vget_low_s32(mmtmpD1),vget_high_s32(mmtmpD1))); + mmtmpD0b = vmull_s16(vrev32_s16(vmul_s16(dl_ch128[2],*(int16x4_t*)conj)), rxdataF128[2]); + mmtmpD1b = vmull_s16(vrev32_s16(vmul_s16(dl_ch128[3],*(int16x4_t*)conj)), rxdataF128[3]); + mmtmpD1 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0b),vget_high_s32(mmtmpD0b)), + vpadd_s32(vget_low_s32(mmtmpD1b),vget_high_s32(mmtmpD1b))); + mmtmpD0 = vqshlq_s32(mmtmpD0,output_shift128); + mmtmpD1 = vqshlq_s32(mmtmpD1,output_shift128); + rxdataF_comp128[1] = vzip_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1)); + + if (pilots==0) { + mmtmpD0 = vmull_s16(dl_ch128[4], rxdataF128[4]); + mmtmpD1 = vmull_s16(dl_ch128[5], rxdataF128[5]); + mmtmpD0 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0),vget_high_s32(mmtmpD0)), + vpadd_s32(vget_low_s32(mmtmpD1),vget_high_s32(mmtmpD1))); + + mmtmpD0b = vmull_s16(vrev32_s16(vmul_s16(dl_ch128[4],*(int16x4_t*)conj)), rxdataF128[4]); + mmtmpD1b = vmull_s16(vrev32_s16(vmul_s16(dl_ch128[5],*(int16x4_t*)conj)), rxdataF128[5]); + mmtmpD1 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0b),vget_high_s32(mmtmpD0b)), + 
vpadd_s32(vget_low_s32(mmtmpD1b),vget_high_s32(mmtmpD1b))); + + + mmtmpD0 = vqshlq_s32(mmtmpD0,output_shift128); + mmtmpD1 = vqshlq_s32(mmtmpD1,output_shift128); + rxdataF_comp128[2] = vzip_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1)); + + + dl_ch128+=6; + dl_ch_mag128+=3; + dl_ch_mag128b+=3; + rxdataF128+=6; + rxdataF_comp128+=3; + + } else { // we have a smaller PDSCH in symbols with pilots so skip last group of 4 REs and increment less + dl_ch128+=4; + dl_ch_mag128+=2; + dl_ch_mag128b+=2; + rxdataF128+=4; + rxdataF_comp128+=2; + } } } } - + if (rho) { for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) { - rho128 = (int16x8_t*)&rho[aarx][symbol*frame_parms->N_RB_DL*12]; + rho128 = (int16x4x2_t*)&rho[aarx][symbol*frame_parms->N_RB_DL*12]; dl_ch128 = (int16x4_t*)&dl_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*12]; dl_ch128_2 = (int16x4_t*)&dl_ch_estimates_ext[2+aarx][symbol*frame_parms->N_RB_DL*12]; - for (rb=0; rb<nb_rb; rb++) { - - mmtmpD0 = vmull_s16(dl_ch128[0], dl_ch128_2[0]); - mmtmpD1 = vmull_s16(dl_ch128[1], dl_ch128_2[1]); - mmtmpD0 = vpadd_s32(mmtmpD0,mmtmpD1); - mmtmpD0 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch128[0],*(int16x4_t*)conj)), dl_ch128_2[0]); - mmtmpD1 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch128[1],*(int16x4_t*)conj)), dl_ch128_2[1]); - mmtmpD1 = vpadd_s32(mmtmpD0,mmtmpD1); - - mmtmpD0 = vqshlq_s32(mmtmpD0,-output_shift); - mmtmpD1 = vqshlq_s32(mmtmpD1,-output_shift); - rho128[0] = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - - mmtmpD0 = vmull_s16(dl_ch128[2], dl_ch128_2[2]); - mmtmpD1 = vmull_s16(dl_ch128[3], dl_ch128_2[3]); - mmtmpD0 = vpadd_s32(mmtmpD0,mmtmpD1); - mmtmpD0 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch128[2],*(int16x4_t*)conj)), dl_ch128_2[2]); - mmtmpD1 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch128[3],*(int16x4_t*)conj)), dl_ch128_2[3]); - mmtmpD1 = vpadd_s32(mmtmpD0,mmtmpD1); - - mmtmpD0 = vqshlq_s32(mmtmpD0,-output_shift); - mmtmpD1 = vqshlq_s32(mmtmpD1,-output_shift); - rho128[1] = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - - mmtmpD0 = vmull_s16(dl_ch128[0], dl_ch128_2[0]); - mmtmpD1 = vmull_s16(dl_ch128[1], dl_ch128_2[1]); - mmtmpD0 = vpadd_s32(mmtmpD0,mmtmpD1); - mmtmpD0 = vrev32q_s16(vmul_s16(dl_ch128[4],*(int16x4_t*)conj), dl_ch128_2[4]); - mmtmpD1 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch128[5],*(int16x4_t*)conj)), dl_ch128_2[5]); - mmtmpD1 = vpadd_s32(mmtmpD0,mmtmpD1); - - mmtmpD0 = vqshlq_s32(mmtmpD0,-output_shift); - mmtmpD1 = vqshlq_s32(mmtmpD1,-output_shift); - rho128[2] = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - - - dl_ch128+=6; - dl_ch128_2+=6; - rho128+=3; + mmtmpD0 = vmull_s16(dl_ch128[0], dl_ch128_2[0]); + mmtmpD1 = vmull_s16(dl_ch128[1], dl_ch128_2[1]); + mmtmpD0 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0),vget_high_s32(mmtmpD0)), + vpadd_s32(vget_low_s32(mmtmpD1),vget_high_s32(mmtmpD1))); + mmtmpD0b = vmull_s16(vrev32_s16(vmul_s16(dl_ch128[0],*(int16x4_t*)conj)), dl_ch128_2[0]); + mmtmpD1b = vmull_s16(vrev32_s16(vmul_s16(dl_ch128[1],*(int16x4_t*)conj)), dl_ch128_2[1]); + mmtmpD1 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0b),vget_high_s32(mmtmpD0b)), + vpadd_s32(vget_low_s32(mmtmpD1b),vget_high_s32(mmtmpD1b))); + + mmtmpD0 = vqshlq_s32(mmtmpD0,output_shift128); + mmtmpD1 = vqshlq_s32(mmtmpD1,output_shift128); + rho128[0] = vzip_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1)); + + mmtmpD0 = vmull_s16(dl_ch128[2], dl_ch128_2[2]); + mmtmpD1 = vmull_s16(dl_ch128[3], dl_ch128_2[3]); + mmtmpD0 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0),vget_high_s32(mmtmpD0)), + 
vpadd_s32(vget_low_s32(mmtmpD1),vget_high_s32(mmtmpD1)));
+          mmtmpD0b = vmull_s16(vrev32_s16(vmul_s16(dl_ch128[2],*(int16x4_t*)conj)), dl_ch128_2[2]);
+          mmtmpD1b = vmull_s16(vrev32_s16(vmul_s16(dl_ch128[3],*(int16x4_t*)conj)), dl_ch128_2[3]);
+          mmtmpD1 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0b),vget_high_s32(mmtmpD0b)),
+                                 vpadd_s32(vget_low_s32(mmtmpD1b),vget_high_s32(mmtmpD1b)));
+
+          mmtmpD0 = vqshlq_s32(mmtmpD0,output_shift128);
+          mmtmpD1 = vqshlq_s32(mmtmpD1,output_shift128);
+          rho128[1] = vzip_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1));
+
+          mmtmpD0 = vmull_s16(dl_ch128[4], dl_ch128_2[4]);
+          mmtmpD1 = vmull_s16(dl_ch128[5], dl_ch128_2[5]);
+          mmtmpD0 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0),vget_high_s32(mmtmpD0)),
+                                 vpadd_s32(vget_low_s32(mmtmpD1),vget_high_s32(mmtmpD1)));
+          mmtmpD0b = vmull_s16(vrev32_s16(vmul_s16(dl_ch128[4],*(int16x4_t*)conj)), dl_ch128_2[4]);
+          mmtmpD1b = vmull_s16(vrev32_s16(vmul_s16(dl_ch128[5],*(int16x4_t*)conj)), dl_ch128_2[5]);
+          mmtmpD1 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0b),vget_high_s32(mmtmpD0b)),
+                                 vpadd_s32(vget_low_s32(mmtmpD1b),vget_high_s32(mmtmpD1b)));
+
+          mmtmpD0 = vqshlq_s32(mmtmpD0,output_shift128);
+          mmtmpD1 = vqshlq_s32(mmtmpD1,output_shift128);
+          rho128[2] = vzip_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1));
+
+
+          dl_ch128+=6;
+          dl_ch128_2+=6;
+          rho128+=3;
       }
-
+
       if (first_symbol_flag==1) {
-        phy_measurements->rx_correlation[0][aarx] = signal_energy(&rho[aarx][symbol*frame_parms->N_RB_DL*12],rb*12);
+        phy_measurements->rx_correlation[0][aarx] = signal_energy(&rho[aarx][symbol*frame_parms->N_RB_DL*12],rb*12);
       }
     }
   }
-
 #endif
 }
@@ -1312,37 +1315,39 @@ void prec2A_TM56_128(unsigned char pmi,__m128i *ch0,__m128i *ch1)
 }
 
 #elif defined(__arm__)
-void prec2A_TM56_128(unsigned char pmi,int16x8_t* ch0,int16x8_t* ch1)
-{
+void prec2A_TM56_128(unsigned char pmi,int16x8_t* ch0,int16x8_t* ch1) {
+  int16x8_t amp;
+  int16_t conj[8]__attribute__((aligned(16))) = {1,-1,1,-1,1,-1,1,-1};
+  amp = vmovq_n_s16(ONE_OVER_SQRT2_Q15);
 
   switch (pmi) {
 
   case 0 :   // +1 +1
//    print_shorts("phase 0 :ch0",ch0);
//    print_shorts("phase 0 :ch1",ch1);
-    ch0[0] = vqadd_s16(ch0[0],ch1[0]);
+    ch0[0] = vqaddq_s16(ch0[0],ch1[0]);
     break;
-
+
   case 1 :   // +1 -1
//    print_shorts("phase 1 :ch0",ch0);
//    print_shorts("phase 1 :ch1",ch1);
-    ch0[0] = vqsub_s16(ch0[0],ch1[0]);
+    ch0[0] = vqsubq_s16(ch0[0],ch1[0]);
//    print_shorts("phase 1 :ch0-ch1",ch0);
     break;
-
+
   case 2 :   // +1 +j
-    ch1[0] = vrev32q_s16(vmul_s16(ch1[0],*(int16x4_t*)conj));
-    ch0[0] = vqsub_s16(ch0[0],ch1[0]);
+    ch1[0] = vrev32q_s16(vmulq_s16(ch1[0],*(int16x8_t*)conj));
+    ch0[0] = vqsubq_s16(ch0[0],ch1[0]);
     break;   // +1 -j
-
+
   case 3 :
-    ch1[0] = vrev32q_s16(vmul_s16(ch1[0],*(int16x4_t*)conj));
-    ch0[0] = vqadd_s16(ch0[0],ch1[0]);
+    ch1[0] = vrev32q_s16(vmulq_s16(ch1[0],*(int16x8_t*)conj));
+    ch0[0] = vqaddq_s16(ch0[0],ch1[0]);
     break;
   }
-
-  ch0[0] = vmulhq_s16(ch0[0],amp);
+
+  ch0[0] = vqdmulhq_s16(ch0[0],amp);
 }
 
 #endif
@@ -1435,25 +1440,26 @@ void prec2A_TM4_128(int pmi,__m128i *ch0,__m128i *ch1)
 
 #elif defined(__arm__)
 
-void prec2A_TM4_128(int pmi,__m128i *ch0,__m128i *ch1)
+void prec2A_TM4_128(int pmi,int16x8_t *ch0,int16x8_t *ch1)
 {
-  int16x6_t amp;
+  int16x8_t amp;
   int16x8_t tmp0,tmp1;
+  int16_t conj[8]__attribute__((aligned(16))) = {1,-1,1,-1,1,-1,1,-1};
 
-  amp = = vmovq_n_s16(ONE_OVER_SQRT2_Q15);
+  amp = vmovq_n_s16(ONE_OVER_SQRT2_Q15);
 
   if (pmi == 0) {
-    ch0[0] = vqadd_s16(ch0[0],ch1[0]);
-    ch1[0] = vqsub_s16(ch0[0],ch1[0]);
+    ch0[0] = vqaddq_s16(ch0[0],ch1[0]);
+    ch1[0] = vqsubq_s16(ch0[0],ch1[0]);
   } else {
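+    // pmi!=0: form tmp1 = j*ch1 below. The {1,-1,...} mask negates the
+    // imaginary parts and vrev32q_s16 then swaps each re/im pair within its
+    // 32-bit lane, which multiplies each complex sample by j; the following
+    // sum and difference apply the +j/-j precoding weights.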
tmp0 = ch0[0]; - tmp1 = vrev32q_s16(vmul_s16(ch1[0],*(int16x4_t*)conj)); - ch0[0] = vqadd_s16(tmp0,tmp1); - ch1[0] = vqsub_s16(tmp0,tmp1); + tmp1 = vrev32q_s16(vmulq_s16(ch1[0],*(int16x8_t*)conj)); + ch0[0] = vqaddq_s16(tmp0,tmp1); + ch1[0] = vqsubq_s16(tmp0,tmp1); } - ch0[0] = vmulhq_s16(ch0[0],amp); - ch1[0] = vmulhq_s16(ch1[0],amp); + ch0[0] = vqdmulhq_s16(ch0[0],amp); + ch1[0] = vqdmulhq_s16(ch1[0],amp); } #endif @@ -1478,7 +1484,7 @@ void dlsch_channel_compensation_TM56(int **rxdataF_ext, unsigned short rb,Nre; __m128i *dl_ch0_128,*dl_ch1_128,*dl_ch_mag128,*dl_ch_mag128b,*rxdataF128,*rxdataF_comp128; unsigned char aarx=0,symbol_mod,pilots=0; - int precoded_signal_strength=0,rx_power_correction; + int precoded_signal_strength=0; __m128i mmtmpD0,mmtmpD1,mmtmpD2,mmtmpD3,QAM_amp128,QAM_amp128b; symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? symbol-(7-frame_parms->Ncp) : symbol; @@ -1486,7 +1492,6 @@ void dlsch_channel_compensation_TM56(int **rxdataF_ext, if ((symbol_mod == 0) || (symbol_mod == (4-frame_parms->Ncp))) pilots=1; - rx_power_correction = 1; //printf("comp prec: symbol %d, pilots %d\n",symbol, pilots); @@ -1652,7 +1657,7 @@ void dlsch_channel_compensation_TM56(int **rxdataF_ext, Nre = (pilots==0) ? 12 : 8; precoded_signal_strength += ((signal_energy_nodc(&dl_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*Nre], - (nb_rb*Nre))*rx_power_correction) - (phy_measurements->n0_power[aarx])); + (nb_rb*Nre))) - (phy_measurements->n0_power[aarx])); } // rx_antennas phy_measurements->precoded_cqi_dB[eNB_id][0] = dB_fixed2(precoded_signal_strength,phy_measurements->n0_power_tot); @@ -1665,168 +1670,176 @@ void dlsch_channel_compensation_TM56(int **rxdataF_ext, #elif defined(__arm__) - unsigned short rb; - unsigned char aatx,aarx,symbol_mod,pilots=0; - - int16x4_t *dl_ch128,*dl_ch128_2,*rxdataF128,*rho128; - int32x4_t mmtmpD0,mmtmpD1; - int16x8_t *dl_ch_mag128,*dl_ch_mag128b,mmtmpD2,mmtmpD3,*rxdataF_comp128; - int16x4_t QAM_amp128,QAM_amp128b; - + uint32_t rb,Nre; + uint32_t aarx,symbol_mod,pilots=0; + + int16x4_t *dl_ch0_128,*dl_ch1_128,*rxdataF128; + int16x8_t *dl_ch0_128b,*dl_ch1_128b; + int32x4_t mmtmpD0,mmtmpD1,mmtmpD0b,mmtmpD1b; + int16x8_t *dl_ch_mag128,*dl_ch_mag128b,mmtmpD2,mmtmpD3,mmtmpD4,*rxdataF_comp128; + int16x8_t QAM_amp128,QAM_amp128b; + int16_t conj[4]__attribute__((aligned(16))) = {1,-1,1,-1}; - - symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? symbol-(7-frame_parms->Ncp) : symbol; - + int32x4_t output_shift128 = vmovq_n_s32(-(int32_t)output_shift); + int32_t precoded_signal_strength=0; + + symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? 
symbol-(7-frame_parms->Ncp) : symbol; if ((symbol_mod == 0) || (symbol_mod == (4-frame_parms->Ncp))) { - if (frame_parms->mode1_flag==1) { // 10 out of 12 so don't reduce size - nb_rb=1+(5*nb_rb/6); - } - - else { - pilots=1; - } + if (frame_parms->mode1_flag==1) // 10 out of 12 so don't reduce size + { nb_rb=1+(5*nb_rb/6); } + + else + { pilots=1; } } - - + + if (mod_order == 4) { - QAM_amp128 = vmov_n_s16(QAM16_n1); // 2/sqrt(10) - QAM_amp128b = vmov_n_s16(0); - + QAM_amp128 = vmovq_n_s16(QAM16_n1); // 2/sqrt(10) + QAM_amp128b = vmovq_n_s16(0); + } else if (mod_order == 6) { - QAM_amp128 = vmov_n_s16(QAM64_n1); // - QAM_amp128b = vmov_n_s16(QAM64_n2); + QAM_amp128 = vmovq_n_s16(QAM64_n1); // + QAM_amp128b = vmovq_n_s16(QAM64_n2); } - + // printf("comp: rxdataF_comp %p, symbol %d\n",rxdataF_comp[0],symbol); - + for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) { - - - - dl_ch1_128 = (int16x4_t*)&dl_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*12]; - dl_ch2_128 = (int16x4_t*)&dl_ch_estimates_ext[2+aarx][symbol*frame_parms->N_RB_DL*12]; - dl_ch_mag128 = (int16x8_t*)&dl_ch_mag[aarx][symbol*frame_parms->N_RB_DL*12]; - dl_ch_mag128b = (int16x8_t*)&dl_ch_magb[aarx][symbol*frame_parms->N_RB_DL*12]; - rxdataF128 = (int16x4_t*)&rxdataF_ext[aarx][symbol*frame_parms->N_RB_DL*12]; - rxdataF_comp128 = (int16x8_t*)&rxdataF_comp[aarx][symbol*frame_parms->N_RB_DL*12]; - + + + + dl_ch0_128 = (int16x4_t*)&dl_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*12]; + dl_ch1_128 = (int16x4_t*)&dl_ch_estimates_ext[2+aarx][symbol*frame_parms->N_RB_DL*12]; + dl_ch0_128b = (int16x8_t*)&dl_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*12]; + dl_ch1_128b = (int16x8_t*)&dl_ch_estimates_ext[2+aarx][symbol*frame_parms->N_RB_DL*12]; + dl_ch_mag128 = (int16x8_t*)&dl_ch_mag[aarx][symbol*frame_parms->N_RB_DL*12]; + dl_ch_mag128b = (int16x8_t*)&dl_ch_magb[aarx][symbol*frame_parms->N_RB_DL*12]; + rxdataF128 = (int16x4_t*)&rxdataF_ext[aarx][symbol*frame_parms->N_RB_DL*12]; + rxdataF_comp128 = (int16x8_t*)&rxdataF_comp[aarx][symbol*frame_parms->N_RB_DL*12]; + for (rb=0; rb<nb_rb; rb++) { #ifdef DEBUG_DLSCH_DEMOD printf("mode 6 prec: rb %d, pmi->%d\n",rb,pmi_ext[rb]); #endif - prec2A_TM56_128(pmi_ext[rb],&dl_ch0_128[0],&dl_ch1_128[0]); - prec2A_TM56_128(pmi_ext[rb],&dl_ch0_128[1],&dl_ch1_128[1]); - + prec2A_TM56_128(pmi_ext[rb],&dl_ch0_128b[0],&dl_ch1_128b[0]); + prec2A_TM56_128(pmi_ext[rb],&dl_ch0_128b[1],&dl_ch1_128b[1]); + if (pilots==0) { - prec2A_TM56_128(pmi_ext[rb],&dl_ch0_128[2],&dl_ch1_128[2]); + prec2A_TM56_128(pmi_ext[rb],&dl_ch0_128b[2],&dl_ch1_128b[2]); } - + if (mod_order>2) { - // get channel amplitude if not QPSK - mmtmpD0 = vmull_s16(dl_ch128[0], dl_ch128[0]); - // mmtmpD0 = [ch0*ch0,ch1*ch1,ch2*ch2,ch3*ch3]; - mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),-output_shift); - // mmtmpD0 = [ch0*ch0 + ch1*ch1,ch0*ch0 + ch1*ch1,ch2*ch2 + ch3*ch3,ch2*ch2 + ch3*ch3]>>output_shift on 32-bits - mmtmpD1 = vmull_s16(dl_ch128[1], dl_ch128[1]); - mmtmpD1 = vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),-output_shift); - mmtmpD2 = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - // mmtmpD2 = [ch0*ch0 + ch1*ch1,ch0*ch0 + ch1*ch1,ch2*ch2 + ch3*ch3,ch2*ch2 + ch3*ch3,ch4*ch4 + ch5*ch5,ch4*ch4 + ch5*ch5,ch6*ch6 + ch7*ch7,ch6*ch6 + ch7*ch7]>>output_shift on 16-bits - mmtmpD0 = vmull_s16(dl_ch128[2], dl_ch128[2]); - mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),-output_shift); - mmtmpD1 = vmull_s16(dl_ch128[3], dl_ch128[3]); - mmtmpD1 = 
vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),-output_shift); - mmtmpD3 = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - - if (pilots==0) { - mmtmpD0 = vmull_s16(dl_ch128[4], dl_ch128[4]); - mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),-output_shift); - mmtmpD1 = vmull_s16(dl_ch128[5], dl_ch128[5]); - mmtmpD1 = vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),-output_shift); - mmtmpD4 = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - - - } - - dl_ch_mag128b[0] = vqdmulhq_s16(mmtmpD2,QAM_amp128b); - dl_ch_mag128b[1] = vqdmulhq_s16(mmtmpD3,QAM_amp128b); - dl_ch_mag128[0] = vqdmulhq_s16(mmtmpD2,QAM_amp128); - dl_ch_mag128[1] = vqdmulhq_s16(mmtmpD3,QAM_amp128); - - - if (pilots==0) { - dl_ch_mag128b[2] = vqdmulhq_s16(mmtmpD4,QAM_amp128b); - dl_ch_mag128[2] = vqdmulhq_s16(mmtmpD4,QAM_amp128); - } + // get channel amplitude if not QPSK + mmtmpD0 = vmull_s16(dl_ch0_128[0], dl_ch0_128[0]); + // mmtmpD0 = [ch0*ch0,ch1*ch1,ch2*ch2,ch3*ch3]; + mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),output_shift128); + // mmtmpD0 = [ch0*ch0 + ch1*ch1,ch0*ch0 + ch1*ch1,ch2*ch2 + ch3*ch3,ch2*ch2 + ch3*ch3]>>output_shift128 on 32-bits + mmtmpD1 = vmull_s16(dl_ch0_128[1], dl_ch0_128[1]); + mmtmpD1 = vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),output_shift128); + mmtmpD2 = vcombine_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1)); + // mmtmpD2 = [ch0*ch0 + ch1*ch1,ch0*ch0 + ch1*ch1,ch2*ch2 + ch3*ch3,ch2*ch2 + ch3*ch3,ch4*ch4 + ch5*ch5,ch4*ch4 + ch5*ch5,ch6*ch6 + ch7*ch7,ch6*ch6 + ch7*ch7]>>output_shift128 on 16-bits + mmtmpD0 = vmull_s16(dl_ch0_128[2], dl_ch0_128[2]); + mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),output_shift128); + mmtmpD1 = vmull_s16(dl_ch0_128[3], dl_ch0_128[3]); + mmtmpD1 = vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),output_shift128); + mmtmpD3 = vcombine_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1)); + if (pilots==0) { + mmtmpD0 = vmull_s16(dl_ch0_128[4], dl_ch0_128[4]); + mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),output_shift128); + mmtmpD1 = vmull_s16(dl_ch0_128[5], dl_ch0_128[5]); + mmtmpD1 = vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),output_shift128); + mmtmpD4 = vcombine_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1)); + + + } + + dl_ch_mag128b[0] = vqdmulhq_s16(mmtmpD2,QAM_amp128b); + dl_ch_mag128b[1] = vqdmulhq_s16(mmtmpD3,QAM_amp128b); + dl_ch_mag128[0] = vqdmulhq_s16(mmtmpD2,QAM_amp128); + dl_ch_mag128[1] = vqdmulhq_s16(mmtmpD3,QAM_amp128); + + + if (pilots==0) { + dl_ch_mag128b[2] = vqdmulhq_s16(mmtmpD4,QAM_amp128b); + dl_ch_mag128[2] = vqdmulhq_s16(mmtmpD4,QAM_amp128); + } } - - mmtmpD0 = vmull_s16(dl_ch128[0], rx_dataF128[0]); - //mmtmpD0 = [Re(ch[0])Re(rx[0]) Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1]) Im(ch[1])Im(ch[1])] - mmtmpD1 = vmull_s16(dl_ch128[1], rx_dataF128[1]); - //mmtmpD1 = [Re(ch[2])Re(rx[2]) Im(ch[2])Im(ch[2]) Re(ch[3])Re(rx[3]) Im(ch[3])Im(ch[3])] - mmtmpD0 = vpadd_s32(mmtmpD0,mmtmpD1); - //mmtmpD0 = [Re(ch[0])Re(rx[0])+Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1])+Im(ch[1])Im(ch[1]) Re(ch[2])Re(rx[2])+Im(ch[2])Im(ch[2]) Re(ch[3])Re(rx[3])+Im(ch[3])Im(ch[3])] - - mmtmpD0 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch128[0],*(int16x4_t*)conj)), rx_dataF128[0]); + mmtmpD0 = vmull_s16(dl_ch0_128[0], rxdataF128[0]); + //mmtmpD0 = [Re(ch[0])Re(rx[0]) Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1]) Im(ch[1])Im(ch[1])] + mmtmpD1 = vmull_s16(dl_ch0_128[1], rxdataF128[1]); + //mmtmpD1 = [Re(ch[2])Re(rx[2]) Im(ch[2])Im(ch[2]) Re(ch[3])Re(rx[3]) Im(ch[3])Im(ch[3])] + mmtmpD0 = 
vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0),vget_high_s32(mmtmpD0)), + vpadd_s32(vget_low_s32(mmtmpD1),vget_high_s32(mmtmpD1))); + //mmtmpD0 = [Re(ch[0])Re(rx[0])+Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1])+Im(ch[1])Im(ch[1]) Re(ch[2])Re(rx[2])+Im(ch[2])Im(ch[2]) Re(ch[3])Re(rx[3])+Im(ch[3])Im(ch[3])] + + mmtmpD0b = vmull_s16(vrev32_s16(vmul_s16(dl_ch0_128[0],*(int16x4_t*)conj)), rxdataF128[0]); //mmtmpD0 = [-Im(ch[0])Re(rx[0]) Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1]) Re(ch[1])Im(rx[1])] - mmtmpD1 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch128[1],*(int16x4_t*)conj)), rx_dataF128[1]); + mmtmpD1b = vmull_s16(vrev32_s16(vmul_s16(dl_ch0_128[1],*(int16x4_t*)conj)), rxdataF128[1]); //mmtmpD0 = [-Im(ch[2])Re(rx[2]) Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3]) Re(ch[3])Im(rx[3])] - mmtmpD1 = vpadd_s32(mmtmpD0,mmtmpD1); + mmtmpD1 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0b),vget_high_s32(mmtmpD0b)), + vpadd_s32(vget_low_s32(mmtmpD1b),vget_high_s32(mmtmpD1b))); //mmtmpD1 = [-Im(ch[0])Re(rx[0])+Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1])+Re(ch[1])Im(rx[1]) -Im(ch[2])Re(rx[2])+Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3])+Re(ch[3])Im(rx[3])] - - mmtmpD0 = vqshlq_s32(mmtmpD0,-output_shift); - mmtmpD1 = vqshlq_s32(mmtmpD1,-output_shift); - rxdataF_comp128[0] = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - - mmtmpD0 = vmull_s16(dl_ch128[2], rx_dataF128[2]); - mmtmpD1 = vmull_s16(dl_ch128[3], rx_dataF128[3]); - mmtmpD0 = vpadd_s32(mmtmpD0,mmtmpD1); - mmtmpD0 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch128[2],*(int16x4_t*)conj)), rx_dataF128[2]); - mmtmpD1 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch128[3],*(int16x4_t*)conj)), rx_dataF128[3]); - mmtmpD1 = vpadd_s32(mmtmpD0,mmtmpD1); - - mmtmpD0 = vqshlq_s32(mmtmpD0,-output_shift); - mmtmpD1 = vqshlq_s32(mmtmpD1,-output_shift); - rxdataF_comp128[1] = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - + + mmtmpD0 = vqshlq_s32(mmtmpD0,output_shift128); + mmtmpD1 = vqshlq_s32(mmtmpD1,output_shift128); + rxdataF_comp128[0] = vcombine_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1)); + + mmtmpD0 = vmull_s16(dl_ch0_128[2], rxdataF128[2]); + mmtmpD1 = vmull_s16(dl_ch0_128[3], rxdataF128[3]); + mmtmpD0 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0),vget_high_s32(mmtmpD0)), + vpadd_s32(vget_low_s32(mmtmpD1),vget_high_s32(mmtmpD1))); + + mmtmpD0b = vmull_s16(vrev32_s16(vmul_s16(dl_ch0_128[2],*(int16x4_t*)conj)), rxdataF128[2]); + mmtmpD1b = vmull_s16(vrev32_s16(vmul_s16(dl_ch0_128[3],*(int16x4_t*)conj)), rxdataF128[3]); + mmtmpD1 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0b),vget_high_s32(mmtmpD0b)), + vpadd_s32(vget_low_s32(mmtmpD1b),vget_high_s32(mmtmpD1b))); + + mmtmpD0 = vqshlq_s32(mmtmpD0,output_shift128); + mmtmpD1 = vqshlq_s32(mmtmpD1,output_shift128); + rxdataF_comp128[1] = vcombine_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1)); + if (pilots==0) { - mmtmpD0 = vmull_s16(dl_ch128[4], rx_dataF128[4]); - mmtmpD1 = vmull_s16(dl_ch128[5], rx_dataF128[5]); - mmtmpD0 = vpadd_s32(mmtmpD0,mmtmpD1); - mmtmpD0 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch128[4],*(int16x4_t*)conj)), rx_dataF128[4]); - mmtmpD1 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch128[5],*(int16x4_t*)conj)), rx_dataF128[5]); - mmtmpD1 = vpadd_s32(mmtmpD0,mmtmpD1); - - mmtmpD0 = vqshlq_s32(mmtmpD0,-output_shift); - mmtmpD1 = vqshlq_s32(mmtmpD1,-output_shift); - rxdataF_comp128[2] = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - - - dl_ch128+=6; - dl_ch_mag128+=3; - dl_ch_mag128b+=3; - rxdataF128+=6; - rxdataF_comp128+=3; - + mmtmpD0 = vmull_s16(dl_ch0_128[4], rxdataF128[4]); + mmtmpD1 = vmull_s16(dl_ch0_128[5], rxdataF128[5]); 
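+            // vmull_s16 widens each 16x16-bit product to 32 bits; the
+            // vpadd_s32 folding below then forms Re{conj(ch)*rx} per complex
+            // sample at full precision, and vqshlq_s32 with the negative
+            // output_shift128 scales the result back to 16-bit range.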
+ mmtmpD0 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0),vget_high_s32(mmtmpD0)), + vpadd_s32(vget_low_s32(mmtmpD1),vget_high_s32(mmtmpD1))); + + mmtmpD0b = vmull_s16(vrev32_s16(vmul_s16(dl_ch0_128[4],*(int16x4_t*)conj)), rxdataF128[4]); + mmtmpD1b = vmull_s16(vrev32_s16(vmul_s16(dl_ch0_128[5],*(int16x4_t*)conj)), rxdataF128[5]); + mmtmpD1 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0b),vget_high_s32(mmtmpD0b)), + vpadd_s32(vget_low_s32(mmtmpD1b),vget_high_s32(mmtmpD1b))); + + + mmtmpD0 = vqshlq_s32(mmtmpD0,output_shift128); + mmtmpD1 = vqshlq_s32(mmtmpD1,output_shift128); + rxdataF_comp128[2] = vcombine_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1)); + + + dl_ch0_128+=6; + dl_ch1_128+=6; + dl_ch_mag128+=3; + dl_ch_mag128b+=3; + rxdataF128+=6; + rxdataF_comp128+=3; + } else { // we have a smaller PDSCH in symbols with pilots so skip last group of 4 REs and increment less - dl_ch128+=4; - dl_ch_mag128+=2; - dl_ch_mag128b+=2; - rxdataF128+=4; - rxdataF_comp128+=2; + dl_ch0_128+=4; + dl_ch1_128+=4; + dl_ch_mag128+=2; + dl_ch_mag128b+=2; + rxdataF128+=4; + rxdataF_comp128+=2; } } - - - + Nre = (pilots==0) ? 12 : 8; - + precoded_signal_strength += ((signal_energy_nodc(&dl_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*Nre], - (nb_rb*Nre))*rx_power_correction) - (phy_measurements->n0_power[aarx])); + (nb_rb*Nre))) - (phy_measurements->n0_power[aarx])); // rx_antennas } - phy_measurements->precoded_cqi_dB[eNB_id][0] = dB_fixed2(precoded_signal_strength,phy_measurements->n0_power_tot); - + #endif } @@ -1847,7 +1860,7 @@ void dlsch_channel_compensation_TM3(LTE_DL_FRAME_PARMS *frame_parms, unsigned short rb,Nre; __m128i *dl_ch0_128,*dl_ch1_128,*dl_ch_mag0_128,*dl_ch_mag1_128,*dl_ch_mag0_128b,*dl_ch_mag1_128b,*rxdataF128,*rxdataF_comp0_128,*rxdataF_comp1_128; unsigned char aarx=0,symbol_mod,pilots=0; - int precoded_signal_strength0=0,precoded_signal_strength1=0,rx_power_correction; + int precoded_signal_strength0=0,precoded_signal_strength1=0; int **rxdataF_ext = lte_ue_pdsch_vars->rxdataF_ext; int **dl_ch_estimates_ext = lte_ue_pdsch_vars->dl_ch_estimates_ext; @@ -1865,7 +1878,6 @@ void dlsch_channel_compensation_TM3(LTE_DL_FRAME_PARMS *frame_parms, if ((symbol_mod == 0) || (symbol_mod == (4-frame_parms->Ncp))) pilots=1; - rx_power_correction = 1; //printf("comp prec: symbol %d, pilots %d\n",symbol, pilots); @@ -2051,7 +2063,7 @@ void dlsch_channel_compensation_TM3(LTE_DL_FRAME_PARMS *frame_parms, rxdataF_comp0_128[1] = _mm_packs_epi32(mmtmpD2,mmtmpD3); // print_shorts("rx:",rxdataF128+1); - // print_shorts("ch:",dl_ch128+1); + // print_shorts("ch:",dl_ch0_128+1); // print_shorts("pack:",rxdataF_comp128+1); if (pilots==0) { @@ -2070,7 +2082,7 @@ void dlsch_channel_compensation_TM3(LTE_DL_FRAME_PARMS *frame_parms, rxdataF_comp0_128[2] = _mm_packs_epi32(mmtmpD2,mmtmpD3); // print_shorts("rx:",rxdataF128+2); - // print_shorts("ch:",dl_ch128+2); + // print_shorts("ch:",dl_ch0_128+2); // print_shorts("pack:",rxdataF_comp128+2); } else { @@ -2162,15 +2174,15 @@ void dlsch_channel_compensation_TM3(LTE_DL_FRAME_PARMS *frame_parms, } } // rb loop + } + + Nre = (pilots==0) ? 12 : 8; - Nre = (pilots==0) ? 
12 : 8; - - precoded_signal_strength0 += ((signal_energy_nodc(&dl_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*Nre], - (nb_rb*Nre))*rx_power_correction) - (phy_measurements->n0_power[aarx])); + precoded_signal_strength0 += ((signal_energy_nodc(&dl_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*Nre], + (nb_rb*Nre))) - (phy_measurements->n0_power[aarx])); - precoded_signal_strength1 += ((signal_energy_nodc(&dl_ch_estimates_ext[aarx+2][symbol*frame_parms->N_RB_DL*Nre], - (nb_rb*Nre))*rx_power_correction) - (phy_measurements->n0_power[aarx])); - } // rx_antennas + precoded_signal_strength1 += ((signal_energy_nodc(&dl_ch_estimates_ext[aarx+2][symbol*frame_parms->N_RB_DL*Nre], + (nb_rb*Nre))) - (phy_measurements->n0_power[aarx])); phy_measurements->precoded_cqi_dB[eNB_id][0] = dB_fixed2(precoded_signal_strength0,phy_measurements->n0_power_tot); phy_measurements->precoded_cqi_dB[eNB_id][1] = dB_fixed2(precoded_signal_strength1,phy_measurements->n0_power_tot); @@ -2183,14 +2195,18 @@ void dlsch_channel_compensation_TM3(LTE_DL_FRAME_PARMS *frame_parms, #elif defined(__arm__) - unsigned short rb; - unsigned char aatx,aarx,symbol_mod,pilots=0; - - int16x4_t *dl_ch128,*dl_ch128_2,*rxdataF128; - int32x4_t mmtmpD0,mmtmpD1; - int16x8_t *dl_ch_mag0_128,*dl_ch_mag1_128b,mmtmpD2,mmtmpD3,*rxdataF_comp0_128,*rxdataF_comp1_128; - int16x4_t QAM_amp0_128,QAM_amp1_128b; - + unsigned short rb,Nre; + unsigned char aarx,symbol_mod,pilots=0; + int precoded_signal_strength0=0,precoded_signal_strength1=0; + int16x4_t *dl_ch0_128,*rxdataF128; + int16x4_t *dl_ch1_128; + int16x8_t *dl_ch0_128b,*dl_ch1_128b; + + int32x4_t mmtmpD0,mmtmpD1,mmtmpD0b,mmtmpD1b; + int16x8_t *dl_ch_mag0_128,*dl_ch_mag0_128b,*dl_ch_mag1_128,*dl_ch_mag1_128b,mmtmpD2,mmtmpD3,mmtmpD4,*rxdataF_comp0_128,*rxdataF_comp1_128; + int16x8_t QAM_amp0_128,QAM_amp0_128b,QAM_amp1_128,QAM_amp1_128b; + int32x4_t output_shift128 = vmovq_n_s32(-(int32_t)output_shift); + int **rxdataF_ext = lte_ue_pdsch_vars->rxdataF_ext; int **dl_ch_estimates_ext = lte_ue_pdsch_vars->dl_ch_estimates_ext; int **dl_ch_mag0 = lte_ue_pdsch_vars->dl_ch_mag0; @@ -2199,255 +2215,258 @@ void dlsch_channel_compensation_TM3(LTE_DL_FRAME_PARMS *frame_parms, int **dl_ch_magb1 = lte_ue_pdsch_vars->dl_ch_magb1; int **rxdataF_comp0 = lte_ue_pdsch_vars->rxdataF_comp0; int **rxdataF_comp1 = lte_ue_pdsch_vars->rxdataF_comp1[round]; //? - + int16_t conj[4]__attribute__((aligned(16))) = {1,-1,1,-1}; - + symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? 
symbol-(7-frame_parms->Ncp) : symbol; - + if ((symbol_mod == 0) || (symbol_mod == (4-frame_parms->Ncp))) { - if (frame_parms->mode1_flag==1) { // 10 out of 12 so don't reduce size - nb_rb=1+(5*nb_rb/6); - } - - else { - pilots=1; - } + if (frame_parms->mode1_flag==1) // 10 out of 12 so don't reduce size + { nb_rb=1+(5*nb_rb/6); } + + else + { pilots=1; } } - - + + if (mod_order0 == 4) { - QAM_amp0_128 = vmov_n_s16(QAM16_n1); // 2/sqrt(10) - QAM_amp0_128b = vmov_n_s16(0); - + QAM_amp0_128 = vmovq_n_s16(QAM16_n1); // 2/sqrt(10) + QAM_amp0_128b = vmovq_n_s16(0); + } else if (mod_order0 == 6) { - QAM_amp0_128 = vmov_n_s16(QAM64_n1); // - QAM_amp0_128b = vmov_n_s16(QAM64_n2); + QAM_amp0_128 = vmovq_n_s16(QAM64_n1); // + QAM_amp0_128b = vmovq_n_s16(QAM64_n2); } - + if (mod_order1 == 4) { - QAM_amp1_128 = vmov_n_s16(QAM16_n1); // 2/sqrt(10) - QAM_amp1_128b = vmov_n_s16(0); - + QAM_amp1_128 = vmovq_n_s16(QAM16_n1); // 2/sqrt(10) + QAM_amp1_128b = vmovq_n_s16(0); + } else if (mod_order1 == 6) { - QAM_amp1_128 = vmov_n_s16(QAM64_n1); // - QAM_amp1_128b = vmov_n_s16(QAM64_n2); + QAM_amp1_128 = vmovq_n_s16(QAM64_n1); // + QAM_amp1_128b = vmovq_n_s16(QAM64_n2); } - + // printf("comp: rxdataF_comp %p, symbol %d\n",rxdataF_comp[0],symbol); - + for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) { - - - - dl_ch1_128 = (int16x4_t*)&dl_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*12]; - dl_ch2_128 = (int16x4_t*)&dl_ch_estimates_ext[2+aarx][symbol*frame_parms->N_RB_DL*12]; + + + + dl_ch0_128 = (int16x4_t*)&dl_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*12]; + dl_ch1_128 = (int16x4_t*)&dl_ch_estimates_ext[2+aarx][symbol*frame_parms->N_RB_DL*12]; + dl_ch0_128b = (int16x8_t*)&dl_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*12]; + dl_ch1_128b = (int16x8_t*)&dl_ch_estimates_ext[2+aarx][symbol*frame_parms->N_RB_DL*12]; dl_ch_mag0_128 = (int16x8_t*)&dl_ch_mag0[aarx][symbol*frame_parms->N_RB_DL*12]; - dl_ch_mag0_128b = (int16x8_t*)&dl_ch_mag0b[aarx][symbol*frame_parms->N_RB_DL*12]; + dl_ch_mag0_128b = (int16x8_t*)&dl_ch_magb0[aarx][symbol*frame_parms->N_RB_DL*12]; dl_ch_mag1_128 = (int16x8_t*)&dl_ch_mag1[aarx][symbol*frame_parms->N_RB_DL*12]; - dl_ch_mag1_128b = (int16x8_t*)&dl_ch_mag1b[aarx][symbol*frame_parms->N_RB_DL*12]; + dl_ch_mag1_128b = (int16x8_t*)&dl_ch_magb1[aarx][symbol*frame_parms->N_RB_DL*12]; rxdataF128 = (int16x4_t*)&rxdataF_ext[aarx][symbol*frame_parms->N_RB_DL*12]; rxdataF_comp0_128 = (int16x8_t*)&rxdataF_comp0[aarx][symbol*frame_parms->N_RB_DL*12]; rxdataF_comp1_128 = (int16x8_t*)&rxdataF_comp1[aarx][symbol*frame_parms->N_RB_DL*12]; - + for (rb=0; rb<nb_rb; rb++) { #ifdef DEBUG_DLSCH_DEMOD printf("mode 6 prec: rb %d, pmi->%d\n",rb,pmi_ext[rb]); #endif - prec2A_TM3_128(&dl_ch0_128[0],&dl_ch1_128[0]); - prec2A_TM3_128(&dl_ch0_128[1],&dl_ch1_128[1]); - + prec2A_TM3_128(&dl_ch0_128b[0],&dl_ch1_128b[0]); + prec2A_TM3_128(&dl_ch0_128b[1],&dl_ch1_128b[1]); + if (pilots==0) { - prec2A_TM3_128(&dl_ch0_128[2],&dl_ch1_128[2]); + prec2A_TM3_128(&dl_ch0_128b[2],&dl_ch1_128b[2]); } - + if (mod_order0>2) { - // get channel amplitude if not QPSK - mmtmpD0 = vmull_s16(dl_ch0_128[0], dl_ch0_128[0]); - // mmtmpD0 = [ch0*ch0,ch1*ch1,ch2*ch2,ch3*ch3]; - mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),-output_shift); - // mmtmpD0 = [ch0*ch0 + ch1*ch1,ch0*ch0 + ch1*ch1,ch2*ch2 + ch3*ch3,ch2*ch2 + ch3*ch3]>>output_shift on 32-bits - mmtmpD1 = vmull_s16(dl_ch0_128[1], dl_ch0_128[1]); - mmtmpD1 = vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),-output_shift); - mmtmpD2 = 
vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - // mmtmpD2 = [ch0*ch0 + ch1*ch1,ch0*ch0 + ch1*ch1,ch2*ch2 + ch3*ch3,ch2*ch2 + ch3*ch3,ch4*ch4 + ch5*ch5,ch4*ch4 + ch5*ch5,ch6*ch6 + ch7*ch7,ch6*ch6 + ch7*ch7]>>output_shift on 16-bits - mmtmpD0 = vmull_s16(dl_ch0_128[2], dl_ch0_128[2]); - mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),-output_shift); - mmtmpD1 = vmull_s16(dl_ch0_128[3], dl_ch0_128[3]); - mmtmpD1 = vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),-output_shift); - mmtmpD3 = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - - if (pilots==0) { - mmtmpD0 = vmull_s16(dl_ch0_128[4], dl_ch0_128[4]); - mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),-output_shift); - mmtmpD1 = vmull_s16(dl_ch0_128[5], dl_ch0_128[5]); - mmtmpD1 = vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),-output_shift); - mmtmpD4 = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - - - } - - dl_ch_mag0_128b[0] = vqdmulhq_s16(mmtmpD2,QAM_amp0_128b); - dl_ch_mag0_128b[1] = vqdmulhq_s16(mmtmpD3,QAM_amp0_128b); - dl_ch_mag0_128[0] = vqdmulhq_s16(mmtmpD2,QAM_amp0_128); - dl_ch_mag0_128[1] = vqdmulhq_s16(mmtmpD3,QAM_amp0_128); - - - if (pilots==0) { - dl_ch_mag0_128b[2] = vqdmulhq_s16(mmtmpD4,QAM_amp0_128b); - dl_ch_mag0_128[2] = vqdmulhq_s16(mmtmpD4,QAM_amp0_128); - } + // get channel amplitude if not QPSK + mmtmpD0 = vmull_s16(dl_ch0_128[0], dl_ch0_128[0]); + // mmtmpD0 = [ch0*ch0,ch1*ch1,ch2*ch2,ch3*ch3]; + mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),output_shift128); + // mmtmpD0 = [ch0*ch0 + ch1*ch1,ch0*ch0 + ch1*ch1,ch2*ch2 + ch3*ch3,ch2*ch2 + ch3*ch3]>>output_shift128 on 32-bits + mmtmpD1 = vmull_s16(dl_ch0_128[1], dl_ch0_128[1]); + mmtmpD1 = vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),output_shift128); + mmtmpD2 = vcombine_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1)); + // mmtmpD2 = [ch0*ch0 + ch1*ch1,ch0*ch0 + ch1*ch1,ch2*ch2 + ch3*ch3,ch2*ch2 + ch3*ch3,ch4*ch4 + ch5*ch5,ch4*ch4 + ch5*ch5,ch6*ch6 + ch7*ch7,ch6*ch6 + ch7*ch7]>>output_shift128 on 16-bits + mmtmpD0 = vmull_s16(dl_ch0_128[2], dl_ch0_128[2]); + mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),output_shift128); + mmtmpD1 = vmull_s16(dl_ch0_128[3], dl_ch0_128[3]); + mmtmpD1 = vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),output_shift128); + mmtmpD3 = vcombine_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1)); + + if (pilots==0) { + mmtmpD0 = vmull_s16(dl_ch0_128[4], dl_ch0_128[4]); + mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),output_shift128); + mmtmpD1 = vmull_s16(dl_ch0_128[5], dl_ch0_128[5]); + mmtmpD1 = vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),output_shift128); + mmtmpD4 = vcombine_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1)); + + + } + + dl_ch_mag0_128b[0] = vqdmulhq_s16(mmtmpD2,QAM_amp0_128b); + dl_ch_mag0_128b[1] = vqdmulhq_s16(mmtmpD3,QAM_amp0_128b); + dl_ch_mag0_128[0] = vqdmulhq_s16(mmtmpD2,QAM_amp0_128); + dl_ch_mag0_128[1] = vqdmulhq_s16(mmtmpD3,QAM_amp0_128); + + + if (pilots==0) { + dl_ch_mag0_128b[2] = vqdmulhq_s16(mmtmpD4,QAM_amp0_128b); + dl_ch_mag0_128[2] = vqdmulhq_s16(mmtmpD4,QAM_amp0_128); + } } if (mod_order1>2) { - // get channel amplitude if not QPSK - mmtmpD0 = vmull_s16(dl_ch1_128[0], dl_ch1_128[0]); - // mmtmpD0 = [ch0*ch0,ch1*ch1,ch2*ch2,ch3*ch3]; - mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),-output_shift); - // mmtmpD0 = [ch0*ch0 + ch1*ch1,ch0*ch0 + ch1*ch1,ch2*ch2 + ch3*ch3,ch2*ch2 + ch3*ch3]>>output_shift on 32-bits - mmtmpD1 = vmull_s16(dl_ch1_128[1], dl_ch1_128[1]); - mmtmpD1 = 
vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),-output_shift); - mmtmpD2 = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - // mmtmpD2 = [ch0*ch0 + ch1*ch1,ch0*ch0 + ch1*ch1,ch2*ch2 + ch3*ch3,ch2*ch2 + ch3*ch3,ch4*ch4 + ch5*ch5,ch4*ch4 + ch5*ch5,ch6*ch6 + ch7*ch7,ch6*ch6 + ch7*ch7]>>output_shift on 16-bits - mmtmpD0 = vmull_s16(dl_ch1_128[2], dl_ch1_128[2]); - mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),-output_shift); - mmtmpD1 = vmull_s16(dl_ch1_128[3], dl_ch1_128[3]); - mmtmpD1 = vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),-output_shift); - mmtmpD3 = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - - if (pilots==0) { - mmtmpD0 = vmull_s16(dl_ch1_128[4], dl_ch1_128[4]); - mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),-output_shift); - mmtmpD1 = vmull_s16(dl_ch1_128[5], dl_ch1_128[5]); - mmtmpD1 = vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),-output_shift); - mmtmpD4 = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - - - } - - dl_ch_mag1_128b[0] = vqdmulhq_s16(mmtmpD2,QAM_amp1_128b); - dl_ch_mag1_128b[1] = vqdmulhq_s16(mmtmpD3,QAM_amp1_128b); - dl_ch_mag1_128[0] = vqdmulhq_s16(mmtmpD2,QAM_amp1_128); - dl_ch_mag1_128[1] = vqdmulhq_s16(mmtmpD3,QAM_amp1_128); - - - if (pilots==0) { - dl_ch_mag1_128b[2] = vqdmulhq_s16(mmtmpD4,QAM_amp1_128b); - dl_ch_mag1_128[2] = vqdmulhq_s16(mmtmpD4,QAM_amp1_128); - } + // get channel amplitude if not QPSK + mmtmpD0 = vmull_s16(dl_ch1_128[0], dl_ch1_128[0]); + // mmtmpD0 = [ch0*ch0,ch1*ch1,ch2*ch2,ch3*ch3]; + mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),output_shift128); + // mmtmpD0 = [ch0*ch0 + ch1*ch1,ch0*ch0 + ch1*ch1,ch2*ch2 + ch3*ch3,ch2*ch2 + ch3*ch3]>>output_shift128 on 32-bits + mmtmpD1 = vmull_s16(dl_ch1_128[1], dl_ch1_128[1]); + mmtmpD1 = vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),output_shift128); + mmtmpD2 = vcombine_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1)); + // mmtmpD2 = [ch0*ch0 + ch1*ch1,ch0*ch0 + ch1*ch1,ch2*ch2 + ch3*ch3,ch2*ch2 + ch3*ch3,ch4*ch4 + ch5*ch5,ch4*ch4 + ch5*ch5,ch6*ch6 + ch7*ch7,ch6*ch6 + ch7*ch7]>>output_shift128 on 16-bits + mmtmpD0 = vmull_s16(dl_ch1_128[2], dl_ch1_128[2]); + mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),output_shift128); + mmtmpD1 = vmull_s16(dl_ch1_128[3], dl_ch1_128[3]); + mmtmpD1 = vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),output_shift128); + mmtmpD3 = vcombine_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1)); + + if (pilots==0) { + mmtmpD0 = vmull_s16(dl_ch1_128[4], dl_ch1_128[4]); + mmtmpD0 = vqshlq_s32(vqaddq_s32(mmtmpD0,vrev64q_s32(mmtmpD0)),output_shift128); + mmtmpD1 = vmull_s16(dl_ch1_128[5], dl_ch1_128[5]); + mmtmpD1 = vqshlq_s32(vqaddq_s32(mmtmpD1,vrev64q_s32(mmtmpD1)),output_shift128); + mmtmpD4 = vcombine_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1)); + + + } + + dl_ch_mag1_128b[0] = vqdmulhq_s16(mmtmpD2,QAM_amp1_128b); + dl_ch_mag1_128b[1] = vqdmulhq_s16(mmtmpD3,QAM_amp1_128b); + dl_ch_mag1_128[0] = vqdmulhq_s16(mmtmpD2,QAM_amp1_128); + dl_ch_mag1_128[1] = vqdmulhq_s16(mmtmpD3,QAM_amp1_128); + + + if (pilots==0) { + dl_ch_mag1_128b[2] = vqdmulhq_s16(mmtmpD4,QAM_amp1_128b); + dl_ch_mag1_128[2] = vqdmulhq_s16(mmtmpD4,QAM_amp1_128); + } } - - mmtmpD0 = vmull_s16(dl_ch0_128[0], rx_dataF128[0]); - //mmtmpD0 = [Re(ch[0])Re(rx[0]) Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1]) Im(ch[1])Im(ch[1])] - mmtmpD1 = vmull_s16(dl_ch0_128[1], rx_dataF128[1]); - //mmtmpD1 = [Re(ch[2])Re(rx[2]) Im(ch[2])Im(ch[2]) Re(ch[3])Re(rx[3]) Im(ch[3])Im(ch[3])] - mmtmpD0 = vpadd_s32(mmtmpD0,mmtmpD1); - 
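/* ---------------------------------------------------------------------------
 * Illustrative sketch (not part of the patch): the NEON idiom the surrounding
 * hunks apply per group of resource elements, shown as two self-contained
 * helpers. All names here are illustrative; neg_shift plays the role of
 * output_shift128, i.e. a vector of negative shift counts so that vqshlq_s32
 * performs a saturating right shift. Each int16x4_t holds two complex int16
 * samples laid out (re,im,re,im).
 */
#include <arm_neon.h>

/* Channel magnitude |h|^2 for four complex samples: vmull_s16 squares re/im
 * into 32 bits, vrev64q_s32 swaps the two words of each 64-bit pair so
 * vqaddq_s32 leaves re^2+im^2 in both lanes, and vmovn_s32 narrows back to
 * 16 bits (the mmtmpD2/mmtmpD3/mmtmpD4 computation in the hunks above). */
static inline int16x8_t ch_mag_sketch(int16x4_t ch01, int16x4_t ch23,
                                      int32x4_t neg_shift)
{
  int32x4_t sq0 = vmull_s16(ch01, ch01);
  sq0 = vqshlq_s32(vqaddq_s32(sq0, vrev64q_s32(sq0)), neg_shift);
  int32x4_t sq1 = vmull_s16(ch23, ch23);
  sq1 = vqshlq_s32(vqaddq_s32(sq1, vrev64q_s32(sq1)), neg_shift);
  return vcombine_s16(vmovn_s32(sq0), vmovn_s32(sq1));
}

/* Matched filter conj(h)*y for four complex samples. Real parts: plain
 * multiply then pairwise add. Imaginary parts: negate Im(h) with the
 * {1,-1,1,-1} 'conj' constant, swap re/im with vrev32_s16, multiply and
 * pairwise-add. The result packs the four real parts in the low half and
 * the four imaginary parts in the high half, matching the layout the hunks
 * below store into rxdataF_comp. */
static inline int16x8_t mf_conj_sketch(int16x4_t h01, int16x4_t h23,
                                       int16x4_t y01, int16x4_t y23,
                                       int16x4_t conj_c, int32x4_t neg_shift)
{
  int32x4_t re0 = vmull_s16(h01, y01);
  int32x4_t re1 = vmull_s16(h23, y23);
  int32x4_t re  = vcombine_s32(vpadd_s32(vget_low_s32(re0), vget_high_s32(re0)),
                               vpadd_s32(vget_low_s32(re1), vget_high_s32(re1)));
  int32x4_t im0 = vmull_s16(vrev32_s16(vmul_s16(h01, conj_c)), y01);
  int32x4_t im1 = vmull_s16(vrev32_s16(vmul_s16(h23, conj_c)), y23);
  int32x4_t im  = vcombine_s32(vpadd_s32(vget_low_s32(im0), vget_high_s32(im0)),
                               vpadd_s32(vget_low_s32(im1), vget_high_s32(im1)));
  return vcombine_s16(vmovn_s32(vqshlq_s32(re, neg_shift)),
                      vmovn_s32(vqshlq_s32(im, neg_shift)));
}
/* ------------------------------------------------------------------------ */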
//mmtmpD0 = [Re(ch[0])Re(rx[0])+Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1])+Im(ch[1])Im(ch[1]) Re(ch[2])Re(rx[2])+Im(ch[2])Im(ch[2]) Re(ch[3])Re(rx[3])+Im(ch[3])Im(ch[3])] - - mmtmpD0 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch0_128[0],*(int16x4_t*)conj)), rx_dataF128[0]); + + mmtmpD0 = vmull_s16(dl_ch0_128[0], rxdataF128[0]); + //mmtmpD0 = [Re(ch[0])Re(rx[0]) Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1]) Im(ch[1])Im(ch[1])] + mmtmpD1 = vmull_s16(dl_ch0_128[1], rxdataF128[1]); + //mmtmpD1 = [Re(ch[2])Re(rx[2]) Im(ch[2])Im(ch[2]) Re(ch[3])Re(rx[3]) Im(ch[3])Im(ch[3])] + mmtmpD0 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0),vget_high_s32(mmtmpD0)), + vpadd_s32(vget_low_s32(mmtmpD1),vget_high_s32(mmtmpD1))); + //mmtmpD0 = [Re(ch[0])Re(rx[0])+Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1])+Im(ch[1])Im(ch[1]) Re(ch[2])Re(rx[2])+Im(ch[2])Im(ch[2]) Re(ch[3])Re(rx[3])+Im(ch[3])Im(ch[3])] + + mmtmpD0b = vmull_s16(vrev32_s16(vmul_s16(dl_ch0_128[0],*(int16x4_t*)conj)), rxdataF128[0]); //mmtmpD0 = [-Im(ch[0])Re(rx[0]) Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1]) Re(ch[1])Im(rx[1])] - mmtmpD1 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch0_128[1],*(int16x4_t*)conj)), rx_dataF128[1]); + mmtmpD1b = vmull_s16(vrev32_s16(vmul_s16(dl_ch0_128[1],*(int16x4_t*)conj)), rxdataF128[1]); //mmtmpD0 = [-Im(ch[2])Re(rx[2]) Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3]) Re(ch[3])Im(rx[3])] - mmtmpD1 = vpadd_s32(mmtmpD0,mmtmpD1); + mmtmpD1 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0b),vget_high_s32(mmtmpD0b)), + vpadd_s32(vget_low_s32(mmtmpD1b),vget_high_s32(mmtmpD1b))); //mmtmpD1 = [-Im(ch[0])Re(rx[0])+Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1])+Re(ch[1])Im(rx[1]) -Im(ch[2])Re(rx[2])+Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3])+Re(ch[3])Im(rx[3])] - - mmtmpD0 = vqshlq_s32(mmtmpD0,-output_shift); - mmtmpD1 = vqshlq_s32(mmtmpD1,-output_shift); - rxdataF_comp0_128[0] = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - - mmtmpD0 = vmull_s16(dl_ch0_128[2], rx_dataF128[2]); - mmtmpD1 = vmull_s16(dl_ch0_128[3], rx_dataF128[3]); - mmtmpD0 = vpadd_s32(mmtmpD0,mmtmpD1); - mmtmpD0 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch0_128[2],*(int16x4_t*)conj)), rx_dataF128[2]); - mmtmpD1 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch0_128[3],*(int16x4_t*)conj)), rx_dataF128[3]); - mmtmpD1 = vpadd_s32(mmtmpD0,mmtmpD1); - - mmtmpD0 = vqshlq_s32(mmtmpD0,-output_shift); - mmtmpD1 = vqshlq_s32(mmtmpD1,-output_shift); - rxdataF_comp0_128[1] = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - - - mmtmpD0 = vmull_s16(dl_ch1_128[0], rx_dataF128[0]); - mmtmpD1 = vmull_s16(dl_ch1_128[1], rx_dataF128[1]); - mmtmpD0 = vpadd_s32(mmtmpD0,mmtmpD1); - mmtmpD0 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch0_128[0],*(int16x4_t*)conj)), rx_dataF128[0]); - mmtmpD1 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch0_128[1],*(int16x4_t*)conj)), rx_dataF128[1]); - mmtmpD1 = vpadd_s32(mmtmpD0,mmtmpD1); - mmtmpD0 = vqshlq_s32(mmtmpD0,-output_shift); - mmtmpD1 = vqshlq_s32(mmtmpD1,-output_shift); - rxdataF_comp1_128[0] = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - - mmtmpD0 = vmull_s16(dl_ch1_128[2], rx_dataF128[2]); - mmtmpD1 = vmull_s16(dl_ch1_128[3], rx_dataF128[3]); - mmtmpD0 = vpadd_s32(mmtmpD0,mmtmpD1); - mmtmpD0 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch0_128[2],*(int16x4_t*)conj)), rx_dataF128[2]); - mmtmpD1 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch0_128[3],*(int16x4_t*)conj)), rx_dataF128[3]); - mmtmpD1 = vpadd_s32(mmtmpD0,mmtmpD1); - - mmtmpD0 = vqshlq_s32(mmtmpD0,-output_shift); - mmtmpD1 = vqshlq_s32(mmtmpD1,-output_shift); - rxdataF_comp1_128[1] = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - + + 
mmtmpD0 = vqshlq_s32(mmtmpD0,output_shift128);
+    mmtmpD1 = vqshlq_s32(mmtmpD1,output_shift128);
+    rxdataF_comp0_128[0] = vcombine_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1));
+
+    mmtmpD0 = vmull_s16(dl_ch0_128[2], rxdataF128[2]);
+    mmtmpD1 = vmull_s16(dl_ch0_128[3], rxdataF128[3]);
+    mmtmpD0 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0),vget_high_s32(mmtmpD0)),
+                           vpadd_s32(vget_low_s32(mmtmpD1),vget_high_s32(mmtmpD1)));
+
+    mmtmpD0b = vmull_s16(vrev32_s16(vmul_s16(dl_ch0_128[2],*(int16x4_t*)conj)), rxdataF128[2]);
+    mmtmpD1b = vmull_s16(vrev32_s16(vmul_s16(dl_ch0_128[3],*(int16x4_t*)conj)), rxdataF128[3]);
+    mmtmpD1 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0b),vget_high_s32(mmtmpD0b)),
+                           vpadd_s32(vget_low_s32(mmtmpD1b),vget_high_s32(mmtmpD1b)));
+
+    mmtmpD0 = vqshlq_s32(mmtmpD0,output_shift128);
+    mmtmpD1 = vqshlq_s32(mmtmpD1,output_shift128);
+    rxdataF_comp0_128[1] = vcombine_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1));
+
+    // second stream
+    mmtmpD0 = vmull_s16(dl_ch1_128[0], rxdataF128[0]);
+    mmtmpD1 = vmull_s16(dl_ch1_128[1], rxdataF128[1]);
+    mmtmpD0 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0),vget_high_s32(mmtmpD0)),
+                           vpadd_s32(vget_low_s32(mmtmpD1),vget_high_s32(mmtmpD1)));
+    mmtmpD0b = vmull_s16(vrev32_s16(vmul_s16(dl_ch1_128[0],*(int16x4_t*)conj)), rxdataF128[0]);
+
+    mmtmpD1b = vmull_s16(vrev32_s16(vmul_s16(dl_ch1_128[1],*(int16x4_t*)conj)), rxdataF128[1]);
+    //mmtmpD0 = [-Im(ch[2])Re(rx[2]) Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3]) Re(ch[3])Im(rx[3])]
+    mmtmpD1 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0b),vget_high_s32(mmtmpD0b)),
+                           vpadd_s32(vget_low_s32(mmtmpD1b),vget_high_s32(mmtmpD1b)));
+    //mmtmpD1 = [-Im(ch[0])Re(rx[0])+Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1])+Re(ch[1])Im(rx[1]) -Im(ch[2])Re(rx[2])+Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3])+Re(ch[3])Im(rx[3])]
+
+    mmtmpD0 = vqshlq_s32(mmtmpD0,output_shift128);
+    mmtmpD1 = vqshlq_s32(mmtmpD1,output_shift128);
+    rxdataF_comp1_128[0] = vcombine_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1));
+
+    mmtmpD0 = vmull_s16(dl_ch1_128[2], rxdataF128[2]);
+    mmtmpD1 = vmull_s16(dl_ch1_128[3], rxdataF128[3]);
+    mmtmpD0 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0),vget_high_s32(mmtmpD0)),
+                           vpadd_s32(vget_low_s32(mmtmpD1),vget_high_s32(mmtmpD1)));
+
+    mmtmpD0b = vmull_s16(vrev32_s16(vmul_s16(dl_ch1_128[2],*(int16x4_t*)conj)), rxdataF128[2]);
+    mmtmpD1b = vmull_s16(vrev32_s16(vmul_s16(dl_ch1_128[3],*(int16x4_t*)conj)), rxdataF128[3]);
+    mmtmpD1 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0b),vget_high_s32(mmtmpD0b)),
+                           vpadd_s32(vget_low_s32(mmtmpD1b),vget_high_s32(mmtmpD1b)));
+
+    mmtmpD0 = vqshlq_s32(mmtmpD0,output_shift128);
+    mmtmpD1 = vqshlq_s32(mmtmpD1,output_shift128);
+    rxdataF_comp1_128[1] = vcombine_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1));
+
     if (pilots==0) {
-      mmtmpD0 = vmull_s16(dl_ch0_128[4], rx_dataF128[4]);
-      mmtmpD1 = vmull_s16(dl_ch0_128[5], rx_dataF128[5]);
-      mmtmpD0 = vpadd_s32(mmtmpD0,mmtmpD1);
-      mmtmpD0 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch0_128[4],*(int16x4_t*)conj)), rx_dataF128[4]);
-      mmtmpD1 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch0_128[5],*(int16x4_t*)conj)), rx_dataF128[5]);
-      mmtmpD1 = vpadd_s32(mmtmpD0,mmtmpD1);
-
-      mmtmpD0 = vqshlq_s32(mmtmpD0,-output_shift);
-      mmtmpD1 = vqshlq_s32(mmtmpD1,-output_shift);
-      rxdataF_comp0_128[2] = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1));
-
-      mmtmpD0 = vmull_s16(dl_ch1_128[4], rx_dataF128[4]);
-      mmtmpD1 = vmull_s16(dl_ch1_128[5], rx_dataF128[5]);
-      mmtmpD0 = vpadd_s32(mmtmpD0,mmtmpD1);
-      mmtmpD0 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch1_128[4],*(int16x4_t*)conj)),
rx_dataF128[4]); - mmtmpD1 = vmull_s16(vrev32q_s16(vmulq_s16(dl_ch1_128[5],*(int16x4_t*)conj)), rx_dataF128[5]); - mmtmpD1 = vpadd_s32(mmtmpD0,mmtmpD1); - - mmtmpD0 = vqshlq_s32(mmtmpD0,-output_shift); - mmtmpD1 = vqshlq_s32(mmtmpD1,-output_shift); - rxdataF_comp1_128[2] = vcombine_s16(vqmovn_s32(mmtmpD0),vwmovn_s32(mmtmpD1)); - - - dl_ch0_128+=6; - dl_ch1_128+=6; - dl_ch_mag0_128+=3; - dl_ch_mag0_128b+=3; - dl_ch_mag1_128+=3; - dl_ch_mag1_128b+=3; - rxdataF128+=6; - rxdataF_comp0_128+=3; - rxdataF_comp1_128+=3; - - } else { // we have a smaller PDSCH in symbols with pilots so skip last group of 4 REs and increment less - dl_ch0_128+=4; - dl_ch1_128+=4; - dl_ch_mag0_128+=2; - dl_ch_mag0_128b+=2; - dl_ch_mag1_128+=2; - dl_ch_mag1_128b+=2; - rxdataF128+=4; - rxdataF_comp0_128+=2; - rxdataF_comp1_128+=2; + mmtmpD0 = vmull_s16(dl_ch0_128[4], rxdataF128[4]); + mmtmpD1 = vmull_s16(dl_ch0_128[5], rxdataF128[5]); + mmtmpD0 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0),vget_high_s32(mmtmpD0)), + vpadd_s32(vget_low_s32(mmtmpD1),vget_high_s32(mmtmpD1))); + + mmtmpD0b = vmull_s16(vrev32_s16(vmul_s16(dl_ch0_128[4],*(int16x4_t*)conj)), rxdataF128[4]); + mmtmpD1b = vmull_s16(vrev32_s16(vmul_s16(dl_ch0_128[5],*(int16x4_t*)conj)), rxdataF128[5]); + mmtmpD1 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0b),vget_high_s32(mmtmpD0b)), + vpadd_s32(vget_low_s32(mmtmpD1b),vget_high_s32(mmtmpD1b))); + + + mmtmpD0 = vqshlq_s32(mmtmpD0,output_shift128); + mmtmpD1 = vqshlq_s32(mmtmpD1,output_shift128); + rxdataF_comp0_128[2] = vcombine_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1)); + mmtmpD0 = vmull_s16(dl_ch1_128[4], rxdataF128[4]); + mmtmpD1 = vmull_s16(dl_ch1_128[5], rxdataF128[5]); + mmtmpD0 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0),vget_high_s32(mmtmpD0)), + vpadd_s32(vget_low_s32(mmtmpD1),vget_high_s32(mmtmpD1))); + + mmtmpD0b = vmull_s16(vrev32_s16(vmul_s16(dl_ch1_128[4],*(int16x4_t*)conj)), rxdataF128[4]); + mmtmpD1b = vmull_s16(vrev32_s16(vmul_s16(dl_ch1_128[5],*(int16x4_t*)conj)), rxdataF128[5]); + mmtmpD1 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpD0b),vget_high_s32(mmtmpD0b)), + vpadd_s32(vget_low_s32(mmtmpD1b),vget_high_s32(mmtmpD1b))); + + + mmtmpD0 = vqshlq_s32(mmtmpD0,output_shift128); + mmtmpD1 = vqshlq_s32(mmtmpD1,output_shift128); + rxdataF_comp1_128[2] = vcombine_s16(vmovn_s32(mmtmpD0),vmovn_s32(mmtmpD1)); } } + + + + Nre = (pilots==0) ? 12 : 8; + // rx_antennas + } - Nre = (pilots==0) ? 12 : 8; + Nre = (pilots==0) ? 
12 : 8; - precoded_signal_strength0 += ((signal_energy_nodc(&dl_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*Nre], - (nb_rb*Nre))*rx_power_correction) - (phy_measurements->n0_power[aarx])); + precoded_signal_strength0 += ((signal_energy_nodc(&dl_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*Nre], + (nb_rb*Nre))) - (phy_measurements->n0_power[aarx])); - precoded_signal_strength1 += ((signal_energy_nodc(&dl_ch_estimates_ext[aarx+2][symbol*frame_parms->N_RB_DL*Nre], - (nb_rb*Nre))*rx_power_correction) - (phy_measurements->n0_power[aarx])); + precoded_signal_strength1 += ((signal_energy_nodc(&dl_ch_estimates_ext[aarx+2][symbol*frame_parms->N_RB_DL*Nre], + (nb_rb*Nre))) - (phy_measurements->n0_power[aarx])); - // rx_antennas - } phy_measurements->precoded_cqi_dB[eNB_id][0] = dB_fixed2(precoded_signal_strength0,phy_measurements->n0_power_tot); phy_measurements->precoded_cqi_dB[eNB_id][1] = dB_fixed2(precoded_signal_strength1,phy_measurements->n0_power_tot); @@ -2580,7 +2599,7 @@ void dlsch_detection_mrc(LTE_DL_FRAME_PARMS *frame_parms, unsigned char aatx; int i; __m128i *rxdataF_comp128_0,*rxdataF_comp128_1,*rxdataF_comp128_i0,*rxdataF_comp128_i1,*dl_ch_mag128_0,*dl_ch_mag128_1,*dl_ch_mag128_0b,*dl_ch_mag128_1b,*rho128_0,*rho128_1,*rho128_i0,*rho128_i1, - *dl_ch_mag128_i0,*dl_ch_mag128_i1,*dl_ch_mag128_i0b,*dl_ch_mag128_i1b; + *dl_ch_mag128_i0,*dl_ch_mag128_i1,*dl_ch_mag128_i0b,*dl_ch_mag128_i1b; if (frame_parms->nb_antennas_rx>1) { @@ -2638,6 +2657,62 @@ void dlsch_detection_mrc(LTE_DL_FRAME_PARMS *frame_parms, #elif defined(__arm__) + unsigned char aatx; + int i; + int16x8_t *rxdataF_comp128_0,*rxdataF_comp128_1,*rxdataF_comp128_i0,*rxdataF_comp128_i1,*dl_ch_mag128_0,*dl_ch_mag128_1,*dl_ch_mag128_0b,*dl_ch_mag128_1b,*rho128_0,*rho128_1,*rho128_i0,*rho128_i1,*dl_ch_mag128_i0,*dl_ch_mag128_i1,*dl_ch_mag128_i0b,*dl_ch_mag128_i1b; + + if (frame_parms->nb_antennas_rx>1) { + + for (aatx=0; aatx<frame_parms->nb_antennas_tx_eNB; aatx++) { + + rxdataF_comp128_0 = (int16x8_t *)&rxdataF_comp[(aatx<<1)][symbol*frame_parms->N_RB_DL*12]; + rxdataF_comp128_1 = (int16x8_t *)&rxdataF_comp[(aatx<<1)+1][symbol*frame_parms->N_RB_DL*12]; + dl_ch_mag128_0 = (int16x8_t *)&dl_ch_mag[(aatx<<1)][symbol*frame_parms->N_RB_DL*12]; + dl_ch_mag128_1 = (int16x8_t *)&dl_ch_mag[(aatx<<1)+1][symbol*frame_parms->N_RB_DL*12]; + dl_ch_mag128_0b = (int16x8_t *)&dl_ch_magb[(aatx<<1)][symbol*frame_parms->N_RB_DL*12]; + dl_ch_mag128_1b = (int16x8_t *)&dl_ch_magb[(aatx<<1)+1][symbol*frame_parms->N_RB_DL*12]; + + // MRC on each re of rb, both on MF output and magnitude (for 16QAM/64QAM llr computation) + for (i=0; i<nb_rb*3; i++) { + rxdataF_comp128_0[i] = vhaddq_s16(rxdataF_comp128_0[i],rxdataF_comp128_1[i]); + dl_ch_mag128_0[i] = vhaddq_s16(dl_ch_mag128_0[i],dl_ch_mag128_1[i]); + dl_ch_mag128_0b[i] = vhaddq_s16(dl_ch_mag128_0b[i],dl_ch_mag128_1b[i]); + } + } + + if (rho) { + rho128_0 = (int16x8_t *) &rho[0][symbol*frame_parms->N_RB_DL*12]; + rho128_1 = (int16x8_t *) &rho[1][symbol*frame_parms->N_RB_DL*12]; + + for (i=0; i<nb_rb*3; i++) { + // print_shorts("mrc rho0:",&rho128_0[i]); + // print_shorts("mrc rho1:",&rho128_1[i]); + rho128_0[i] = vhaddq_s16(rho128_0[i],rho128_1[i]); + } + } + + + if (dual_stream_UE == 1) { + rho128_i0 = (int16x8_t *) &rho_i[0][symbol*frame_parms->N_RB_DL*12]; + rho128_i1 = (int16x8_t *) &rho_i[1][symbol*frame_parms->N_RB_DL*12]; + rxdataF_comp128_i0 = (int16x8_t *)&rxdataF_comp_i[0][symbol*frame_parms->N_RB_DL*12]; + rxdataF_comp128_i1 = (int16x8_t 
*)&rxdataF_comp_i[1][symbol*frame_parms->N_RB_DL*12]; + + dl_ch_mag128_i0 = (int16x8_t *)&dl_ch_mag_i[0][symbol*frame_parms->N_RB_DL*12]; + dl_ch_mag128_i1 = (int16x8_t *)&dl_ch_mag_i[1][symbol*frame_parms->N_RB_DL*12]; + dl_ch_mag128_i0b = (int16x8_t *)&dl_ch_magb_i[0][symbol*frame_parms->N_RB_DL*12]; + dl_ch_mag128_i1b = (int16x8_t *)&dl_ch_magb_i[1][symbol*frame_parms->N_RB_DL*12]; + + for (i=0; i<nb_rb*3; i++) { + rxdataF_comp128_i0[i] = vhaddq_s16(rxdataF_comp128_i0[i],rxdataF_comp128_i1[i]); + rho128_i0[i] = vhaddq_s16(rho128_i0[i],rho128_i1[i]); + + dl_ch_mag128_i0[i] = vhaddq_s16(dl_ch_mag128_i0[i],dl_ch_mag128_i1[i]); + dl_ch_mag128_i0b[i] = vhaddq_s16(dl_ch_mag128_i0b[i],dl_ch_mag128_i1b[i]); + } + } + } + #endif } @@ -2764,6 +2839,62 @@ void dlsch_channel_level(int **dl_ch_estimates_ext, #elif defined(__arm__) + short rb; + unsigned char aatx,aarx,nre=12,symbol_mod; + int32x4_t avg128D; + int16x4_t *dl_ch128; + + symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? symbol-(7-frame_parms->Ncp) : symbol; + + for (aatx=0; aatx<frame_parms->nb_antennas_tx_eNB; aatx++) + for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) { + //clear average level + avg128D = vdupq_n_s32(0); + // 5 is always a symbol with no pilots for both normal and extended prefix + + dl_ch128=(int16x4_t *)&dl_ch_estimates_ext[(aatx<<1)+aarx][symbol*frame_parms->N_RB_DL*12]; + + for (rb=0; rb<nb_rb; rb++) { + // printf("rb %d : ",rb); + // print_shorts("ch",&dl_ch128[0]); + avg128D = vqaddq_s32(avg128D,vmull_s16(dl_ch128[0],dl_ch128[0])); + avg128D = vqaddq_s32(avg128D,vmull_s16(dl_ch128[1],dl_ch128[1])); + avg128D = vqaddq_s32(avg128D,vmull_s16(dl_ch128[2],dl_ch128[2])); + avg128D = vqaddq_s32(avg128D,vmull_s16(dl_ch128[3],dl_ch128[3])); + + if (((symbol_mod == 0) || (symbol_mod == (frame_parms->Ncp-1)))&&(frame_parms->mode1_flag==0)) { + dl_ch128+=4; + } else { + avg128D = vqaddq_s32(avg128D,vmull_s16(dl_ch128[4],dl_ch128[4])); + avg128D = vqaddq_s32(avg128D,vmull_s16(dl_ch128[5],dl_ch128[5])); + dl_ch128+=6; + } + + /* + if (rb==0) { + print_shorts("dl_ch128",&dl_ch128[0]); + print_shorts("dl_ch128",&dl_ch128[1]); + print_shorts("dl_ch128",&dl_ch128[2]); + } + */ + } + + if (((symbol_mod == 0) || (symbol_mod == (frame_parms->Ncp-1)))&&(frame_parms->mode1_flag==0)) + nre=8; + else if (((symbol_mod == 0) || (symbol_mod == (frame_parms->Ncp-1)))&&(frame_parms->mode1_flag==1)) + nre=10; + else + nre=12; + + avg[(aatx<<1)+aarx] = (((int32_t*)&avg128D)[0] + + ((int32_t*)&avg128D)[1] + + ((int32_t*)&avg128D)[2] + + ((int32_t*)&avg128D)[3])/(nb_rb*nre); + + // printf("Channel level : %d\n",avg[(aatx<<1)+aarx]); + } + + #endif } @@ -2832,9 +2963,9 @@ void dlsch_channel_level_TM3(int **dl_ch_estimates_ext, } avg[aarx] = (((int*)&avg128D)[0])/(nb_rb*nre) + - (((int*)&avg128D)[1])/(nb_rb*nre) + - (((int*)&avg128D)[2])/(nb_rb*nre) + - (((int*)&avg128D)[3])/(nb_rb*nre); + (((int*)&avg128D)[1])/(nb_rb*nre) + + (((int*)&avg128D)[2])/(nb_rb*nre) + + (((int*)&avg128D)[3])/(nb_rb*nre); } // choose maximum of the 2 effective channels @@ -2915,9 +3046,9 @@ void dlsch_channel_level_TM56(int **dl_ch_estimates_ext, } avg[aarx] = (((int*)&avg128D)[0])/(nb_rb*nre) + - (((int*)&avg128D)[1])/(nb_rb*nre) + - (((int*)&avg128D)[2])/(nb_rb*nre) + - (((int*)&avg128D)[3])/(nb_rb*nre); + (((int*)&avg128D)[1])/(nb_rb*nre) + + (((int*)&avg128D)[2])/(nb_rb*nre) + + (((int*)&avg128D)[3])/(nb_rb*nre); } // choose maximum of the 2 effective channels @@ -3109,17 +3240,17 @@ unsigned short dlsch_extract_rbs_single(int **rxdataF, 
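/* ---------------------------------------------------------------------------
 * Illustrative sketch (not part of the patch): the two NEON reduction idioms
 * introduced by the dlsch_detection_mrc and dlsch_channel_level hunks above.
 * All names are illustrative.
 */
#include <arm_neon.h>

/* MRC across two RX antennas: vhaddq_s16 is a per-lane halving add,
 * (a+b)>>1, so matched-filter outputs and channel magnitudes are averaged
 * without risking int16 overflow. */
static void mrc_combine_sketch(int16x8_t *ant0, const int16x8_t *ant1, int n_vec)
{
  for (int i = 0; i < n_vec; i++)
    ant0[i] = vhaddq_s16(ant0[i], ant1[i]);
}

/* Channel level: accumulate |h|^2 over the extracted channel estimates with
 * saturating 32-bit adds (the avg128D accumulator above), then reduce the
 * four lanes and normalise by the number of REs averaged. */
static int32_t channel_level_sketch(const int16x4_t *ch, int n_quads, int n_re)
{
  int32x4_t acc = vdupq_n_s32(0);
  for (int i = 0; i < n_quads; i++)
    acc = vqaddq_s32(acc, vmull_s16(ch[i], ch[i]));
  int32_t lanes[4];
  vst1q_s32(lanes, acc);
  return (lanes[0] + lanes[1] + lanes[2] + lanes[3]) / n_re;
}
/* ------------------------------------------------------------------------ */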
memcpy(dl_ch0_ext,dl_ch0,12*sizeof(int)); /* - printf("rb %d\n",rb); - for (i=0;i<12;i++) - printf("(%d %d)",((short *)dl_ch0)[i<<1],((short*)dl_ch0)[1+(i<<1)]); - printf("\n"); + printf("rb %d\n",rb); + for (i=0;i<12;i++) + printf("(%d %d)",((short *)dl_ch0)[i<<1],((short*)dl_ch0)[1+(i<<1)]); + printf("\n"); */ if (pilots==0) { for (i=0; i<12; i++) { rxF_ext[i]=rxF[i]; /* - printf("%d : (%d,%d)\n",(rxF+i-&rxdataF[aarx][( (symbol*(frame_parms->ofdm_symbol_size)))]), - ((short*)&rxF[i])[0],((short*)&rxF[i])[1]);*/ + printf("%d : (%d,%d)\n",(rxF+i-&rxdataF[aarx][( (symbol*(frame_parms->ofdm_symbol_size)))]), + ((short*)&rxF[i])[0],((short*)&rxF[i])[1]);*/ } dl_ch0_ext+=12; @@ -3461,7 +3592,7 @@ unsigned short dlsch_extract_rbs_single(int **rxdataF, if (rb_alloc_ind==1) { // printf("rb %d/symbol %d (skip_half %d)\n",rb,l,skip_half); /* - printf("rb %d\n",rb); + printf("rb %d\n",rb); for (i=0;i<12;i++) printf("(%d %d)",((short *)dl_ch0)[i<<1],((short*)dl_ch0)[1+(i<<1)]); printf("\n"); @@ -3545,8 +3676,6 @@ unsigned short dlsch_extract_rbs_single(int **rxdataF, } } - _mm_empty(); - _m_empty(); return(nb_rb/frame_parms->nb_antennas_rx); } @@ -4172,9 +4301,6 @@ unsigned short dlsch_extract_rbs_dual(int **rxdataF, } - _mm_empty(); - _m_empty(); - return(nb_rb/frame_parms->nb_antennas_rx); } @@ -4266,7 +4392,7 @@ void print_bytes(char *s,__m128i *x) printf("%s : %d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d\n",s, tempb[0],tempb[1],tempb[2],tempb[3],tempb[4],tempb[5],tempb[6],tempb[7], tempb[8],tempb[9],tempb[10],tempb[11],tempb[12],tempb[13],tempb[14],tempb[15] - ); + ); } diff --git a/openair1/PHY/LTE_TRANSPORT/dlsch_llr_computation.c b/openair1/PHY/LTE_TRANSPORT/dlsch_llr_computation.c index 4990e94cf5..f0a32e09b8 100644 --- a/openair1/PHY/LTE_TRANSPORT/dlsch_llr_computation.c +++ b/openair1/PHY/LTE_TRANSPORT/dlsch_llr_computation.c @@ -25,18 +25,18 @@ Address : Eurecom, Campus SophiaTech, 450 Route des Chappes, CS 50193 - 06904 Biot Sophia Antipolis cedex, FRANCE - *******************************************************************************/ +*******************************************************************************/ /*! \file PHY/LTE_TRANSPORT/dlsch_llr_computation.c -* \brief Top-level routines for LLR computation of the PDSCH physical channel from 36-211, V8.6 2009-03 -* \author R. Knopp, F. Kaltenberger,A. Bhamri, S. Aubert, S. Wagner -* \date 2011 -* \version 0.1 -* \company Eurecom -* \email: knopp@eurecom.fr,florian.kaltenberger@eurecom.fr,ankit.bhamri@eurecom.fr,sebastien.aubert@eurecom.fr, sebastian.wagner@eurecom.fr -* \note -* \warning -*/ + * \brief Top-level routines for LLR computation of the PDSCH physical channel from 36-211, V8.6 2009-03 + * \author R. Knopp, F. Kaltenberger,A. Bhamri, S. Aubert, S. 
Wagner + * \date 2011 + * \version 0.1 + * \company Eurecom + * \email: knopp@eurecom.fr,florian.kaltenberger@eurecom.fr,ankit.bhamri@eurecom.fr,sebastien.aubert@eurecom.fr, sebastian.wagner@eurecom.fr + * \note + * \warning + */ #include "PHY/defs.h" #include "PHY/extern.h" @@ -44,567 +44,563 @@ #include "extern.h" #include "PHY/sse_intrin.h" -#ifndef USER_MODE -#define NOCYGWIN_STATIC static -#else -#define NOCYGWIN_STATIC -#endif +int16_t zero[8] __attribute__ ((aligned(16))) = {0,0,0,0,0,0,0,0}; +int16_t ones[8] __attribute__ ((aligned(16))) = {0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff}; +#if defined(__x86_64__) || defined(__i386__) +__m128i rho_rpi __attribute__ ((aligned(16))); +__m128i rho_rmi __attribute__ ((aligned(16))); +__m128i rho_rpi_1_1 __attribute__ ((aligned(16))); +__m128i rho_rpi_1_3 __attribute__ ((aligned(16))); +__m128i rho_rpi_1_5 __attribute__ ((aligned(16))); +__m128i rho_rpi_1_7 __attribute__ ((aligned(16))); +__m128i rho_rpi_3_1 __attribute__ ((aligned(16))); +__m128i rho_rpi_3_3 __attribute__ ((aligned(16))); +__m128i rho_rpi_3_5 __attribute__ ((aligned(16))); +__m128i rho_rpi_3_7 __attribute__ ((aligned(16))); +__m128i rho_rpi_5_1 __attribute__ ((aligned(16))); +__m128i rho_rpi_5_3 __attribute__ ((aligned(16))); +__m128i rho_rpi_5_5 __attribute__ ((aligned(16))); +__m128i rho_rpi_5_7 __attribute__ ((aligned(16))); +__m128i rho_rpi_7_1 __attribute__ ((aligned(16))); +__m128i rho_rpi_7_3 __attribute__ ((aligned(16))); +__m128i rho_rpi_7_5 __attribute__ ((aligned(16))); +__m128i rho_rpi_7_7 __attribute__ ((aligned(16))); +__m128i rho_rmi_1_1 __attribute__ ((aligned(16))); +__m128i rho_rmi_1_3 __attribute__ ((aligned(16))); +__m128i rho_rmi_1_5 __attribute__ ((aligned(16))); +__m128i rho_rmi_1_7 __attribute__ ((aligned(16))); +__m128i rho_rmi_3_1 __attribute__ ((aligned(16))); +__m128i rho_rmi_3_3 __attribute__ ((aligned(16))); +__m128i rho_rmi_3_5 __attribute__ ((aligned(16))); +__m128i rho_rmi_3_7 __attribute__ ((aligned(16))); +__m128i rho_rmi_5_1 __attribute__ ((aligned(16))); +__m128i rho_rmi_5_3 __attribute__ ((aligned(16))); +__m128i rho_rmi_5_5 __attribute__ ((aligned(16))); +__m128i rho_rmi_5_7 __attribute__ ((aligned(16))); +__m128i rho_rmi_7_1 __attribute__ ((aligned(16))); +__m128i rho_rmi_7_3 __attribute__ ((aligned(16))); +__m128i rho_rmi_7_5 __attribute__ ((aligned(16))); +__m128i rho_rmi_7_7 __attribute__ ((aligned(16))); + +__m128i psi_r_m7_m7 __attribute__ ((aligned(16))); +__m128i psi_r_m7_m5 __attribute__ ((aligned(16))); +__m128i psi_r_m7_m3 __attribute__ ((aligned(16))); +__m128i psi_r_m7_m1 __attribute__ ((aligned(16))); +__m128i psi_r_m7_p1 __attribute__ ((aligned(16))); +__m128i psi_r_m7_p3 __attribute__ ((aligned(16))); +__m128i psi_r_m7_p5 __attribute__ ((aligned(16))); +__m128i psi_r_m7_p7 __attribute__ ((aligned(16))); +__m128i psi_r_m5_m7 __attribute__ ((aligned(16))); +__m128i psi_r_m5_m5 __attribute__ ((aligned(16))); +__m128i psi_r_m5_m3 __attribute__ ((aligned(16))); +__m128i psi_r_m5_m1 __attribute__ ((aligned(16))); +__m128i psi_r_m5_p1 __attribute__ ((aligned(16))); +__m128i psi_r_m5_p3 __attribute__ ((aligned(16))); +__m128i psi_r_m5_p5 __attribute__ ((aligned(16))); +__m128i psi_r_m5_p7 __attribute__ ((aligned(16))); +__m128i psi_r_m3_m7 __attribute__ ((aligned(16))); +__m128i psi_r_m3_m5 __attribute__ ((aligned(16))); +__m128i psi_r_m3_m3 __attribute__ ((aligned(16))); +__m128i psi_r_m3_m1 __attribute__ ((aligned(16))); +__m128i psi_r_m3_p1 __attribute__ ((aligned(16))); +__m128i psi_r_m3_p3 __attribute__ 
((aligned(16))); +__m128i psi_r_m3_p5 __attribute__ ((aligned(16))); +__m128i psi_r_m3_p7 __attribute__ ((aligned(16))); +__m128i psi_r_m1_m7 __attribute__ ((aligned(16))); +__m128i psi_r_m1_m5 __attribute__ ((aligned(16))); +__m128i psi_r_m1_m3 __attribute__ ((aligned(16))); +__m128i psi_r_m1_m1 __attribute__ ((aligned(16))); +__m128i psi_r_m1_p1 __attribute__ ((aligned(16))); +__m128i psi_r_m1_p3 __attribute__ ((aligned(16))); +__m128i psi_r_m1_p5 __attribute__ ((aligned(16))); +__m128i psi_r_m1_p7 __attribute__ ((aligned(16))); +__m128i psi_r_p1_m7 __attribute__ ((aligned(16))); +__m128i psi_r_p1_m5 __attribute__ ((aligned(16))); +__m128i psi_r_p1_m3 __attribute__ ((aligned(16))); +__m128i psi_r_p1_m1 __attribute__ ((aligned(16))); +__m128i psi_r_p1_p1 __attribute__ ((aligned(16))); +__m128i psi_r_p1_p3 __attribute__ ((aligned(16))); +__m128i psi_r_p1_p5 __attribute__ ((aligned(16))); +__m128i psi_r_p1_p7 __attribute__ ((aligned(16))); +__m128i psi_r_p3_m7 __attribute__ ((aligned(16))); +__m128i psi_r_p3_m5 __attribute__ ((aligned(16))); +__m128i psi_r_p3_m3 __attribute__ ((aligned(16))); +__m128i psi_r_p3_m1 __attribute__ ((aligned(16))); +__m128i psi_r_p3_p1 __attribute__ ((aligned(16))); +__m128i psi_r_p3_p3 __attribute__ ((aligned(16))); +__m128i psi_r_p3_p5 __attribute__ ((aligned(16))); +__m128i psi_r_p3_p7 __attribute__ ((aligned(16))); +__m128i psi_r_p5_m7 __attribute__ ((aligned(16))); +__m128i psi_r_p5_m5 __attribute__ ((aligned(16))); +__m128i psi_r_p5_m3 __attribute__ ((aligned(16))); +__m128i psi_r_p5_m1 __attribute__ ((aligned(16))); +__m128i psi_r_p5_p1 __attribute__ ((aligned(16))); +__m128i psi_r_p5_p3 __attribute__ ((aligned(16))); +__m128i psi_r_p5_p5 __attribute__ ((aligned(16))); +__m128i psi_r_p5_p7 __attribute__ ((aligned(16))); +__m128i psi_r_p7_m7 __attribute__ ((aligned(16))); +__m128i psi_r_p7_m5 __attribute__ ((aligned(16))); +__m128i psi_r_p7_m3 __attribute__ ((aligned(16))); +__m128i psi_r_p7_m1 __attribute__ ((aligned(16))); +__m128i psi_r_p7_p1 __attribute__ ((aligned(16))); +__m128i psi_r_p7_p3 __attribute__ ((aligned(16))); +__m128i psi_r_p7_p5 __attribute__ ((aligned(16))); +__m128i psi_r_p7_p7 __attribute__ ((aligned(16))); + +__m128i psi_i_m7_m7 __attribute__ ((aligned(16))); +__m128i psi_i_m7_m5 __attribute__ ((aligned(16))); +__m128i psi_i_m7_m3 __attribute__ ((aligned(16))); +__m128i psi_i_m7_m1 __attribute__ ((aligned(16))); +__m128i psi_i_m7_p1 __attribute__ ((aligned(16))); +__m128i psi_i_m7_p3 __attribute__ ((aligned(16))); +__m128i psi_i_m7_p5 __attribute__ ((aligned(16))); +__m128i psi_i_m7_p7 __attribute__ ((aligned(16))); +__m128i psi_i_m5_m7 __attribute__ ((aligned(16))); +__m128i psi_i_m5_m5 __attribute__ ((aligned(16))); +__m128i psi_i_m5_m3 __attribute__ ((aligned(16))); +__m128i psi_i_m5_m1 __attribute__ ((aligned(16))); +__m128i psi_i_m5_p1 __attribute__ ((aligned(16))); +__m128i psi_i_m5_p3 __attribute__ ((aligned(16))); +__m128i psi_i_m5_p5 __attribute__ ((aligned(16))); +__m128i psi_i_m5_p7 __attribute__ ((aligned(16))); +__m128i psi_i_m3_m7 __attribute__ ((aligned(16))); +__m128i psi_i_m3_m5 __attribute__ ((aligned(16))); +__m128i psi_i_m3_m3 __attribute__ ((aligned(16))); +__m128i psi_i_m3_m1 __attribute__ ((aligned(16))); +__m128i psi_i_m3_p1 __attribute__ ((aligned(16))); +__m128i psi_i_m3_p3 __attribute__ ((aligned(16))); +__m128i psi_i_m3_p5 __attribute__ ((aligned(16))); +__m128i psi_i_m3_p7 __attribute__ ((aligned(16))); +__m128i psi_i_m1_m7 __attribute__ ((aligned(16))); +__m128i psi_i_m1_m5 __attribute__ ((aligned(16))); 
+__m128i psi_i_m1_m3 __attribute__ ((aligned(16))); +__m128i psi_i_m1_m1 __attribute__ ((aligned(16))); +__m128i psi_i_m1_p1 __attribute__ ((aligned(16))); +__m128i psi_i_m1_p3 __attribute__ ((aligned(16))); +__m128i psi_i_m1_p5 __attribute__ ((aligned(16))); +__m128i psi_i_m1_p7 __attribute__ ((aligned(16))); +__m128i psi_i_p1_m7 __attribute__ ((aligned(16))); +__m128i psi_i_p1_m5 __attribute__ ((aligned(16))); +__m128i psi_i_p1_m3 __attribute__ ((aligned(16))); +__m128i psi_i_p1_m1 __attribute__ ((aligned(16))); +__m128i psi_i_p1_p1 __attribute__ ((aligned(16))); +__m128i psi_i_p1_p3 __attribute__ ((aligned(16))); +__m128i psi_i_p1_p5 __attribute__ ((aligned(16))); +__m128i psi_i_p1_p7 __attribute__ ((aligned(16))); +__m128i psi_i_p3_m7 __attribute__ ((aligned(16))); +__m128i psi_i_p3_m5 __attribute__ ((aligned(16))); +__m128i psi_i_p3_m3 __attribute__ ((aligned(16))); +__m128i psi_i_p3_m1 __attribute__ ((aligned(16))); +__m128i psi_i_p3_p1 __attribute__ ((aligned(16))); +__m128i psi_i_p3_p3 __attribute__ ((aligned(16))); +__m128i psi_i_p3_p5 __attribute__ ((aligned(16))); +__m128i psi_i_p3_p7 __attribute__ ((aligned(16))); +__m128i psi_i_p5_m7 __attribute__ ((aligned(16))); +__m128i psi_i_p5_m5 __attribute__ ((aligned(16))); +__m128i psi_i_p5_m3 __attribute__ ((aligned(16))); +__m128i psi_i_p5_m1 __attribute__ ((aligned(16))); +__m128i psi_i_p5_p1 __attribute__ ((aligned(16))); +__m128i psi_i_p5_p3 __attribute__ ((aligned(16))); +__m128i psi_i_p5_p5 __attribute__ ((aligned(16))); +__m128i psi_i_p5_p7 __attribute__ ((aligned(16))); +__m128i psi_i_p7_m7 __attribute__ ((aligned(16))); +__m128i psi_i_p7_m5 __attribute__ ((aligned(16))); +__m128i psi_i_p7_m3 __attribute__ ((aligned(16))); +__m128i psi_i_p7_m1 __attribute__ ((aligned(16))); +__m128i psi_i_p7_p1 __attribute__ ((aligned(16))); +__m128i psi_i_p7_p3 __attribute__ ((aligned(16))); +__m128i psi_i_p7_p5 __attribute__ ((aligned(16))); +__m128i psi_i_p7_p7 __attribute__ ((aligned(16))); + +__m128i a_r_m7_m7 __attribute__ ((aligned(16))); +__m128i a_r_m7_m5 __attribute__ ((aligned(16))); +__m128i a_r_m7_m3 __attribute__ ((aligned(16))); +__m128i a_r_m7_m1 __attribute__ ((aligned(16))); +__m128i a_r_m7_p1 __attribute__ ((aligned(16))); +__m128i a_r_m7_p3 __attribute__ ((aligned(16))); +__m128i a_r_m7_p5 __attribute__ ((aligned(16))); +__m128i a_r_m7_p7 __attribute__ ((aligned(16))); +__m128i a_r_m5_m7 __attribute__ ((aligned(16))); +__m128i a_r_m5_m5 __attribute__ ((aligned(16))); +__m128i a_r_m5_m3 __attribute__ ((aligned(16))); +__m128i a_r_m5_m1 __attribute__ ((aligned(16))); +__m128i a_r_m5_p1 __attribute__ ((aligned(16))); +__m128i a_r_m5_p3 __attribute__ ((aligned(16))); +__m128i a_r_m5_p5 __attribute__ ((aligned(16))); +__m128i a_r_m5_p7 __attribute__ ((aligned(16))); +__m128i a_r_m3_m7 __attribute__ ((aligned(16))); +__m128i a_r_m3_m5 __attribute__ ((aligned(16))); +__m128i a_r_m3_m3 __attribute__ ((aligned(16))); +__m128i a_r_m3_m1 __attribute__ ((aligned(16))); +__m128i a_r_m3_p1 __attribute__ ((aligned(16))); +__m128i a_r_m3_p3 __attribute__ ((aligned(16))); +__m128i a_r_m3_p5 __attribute__ ((aligned(16))); +__m128i a_r_m3_p7 __attribute__ ((aligned(16))); +__m128i a_r_m1_m7 __attribute__ ((aligned(16))); +__m128i a_r_m1_m5 __attribute__ ((aligned(16))); +__m128i a_r_m1_m3 __attribute__ ((aligned(16))); +__m128i a_r_m1_m1 __attribute__ ((aligned(16))); +__m128i a_r_m1_p1 __attribute__ ((aligned(16))); +__m128i a_r_m1_p3 __attribute__ ((aligned(16))); +__m128i a_r_m1_p5 __attribute__ ((aligned(16))); +__m128i a_r_m1_p7 
__attribute__ ((aligned(16))); +__m128i a_r_p1_m7 __attribute__ ((aligned(16))); +__m128i a_r_p1_m5 __attribute__ ((aligned(16))); +__m128i a_r_p1_m3 __attribute__ ((aligned(16))); +__m128i a_r_p1_m1 __attribute__ ((aligned(16))); +__m128i a_r_p1_p1 __attribute__ ((aligned(16))); +__m128i a_r_p1_p3 __attribute__ ((aligned(16))); +__m128i a_r_p1_p5 __attribute__ ((aligned(16))); +__m128i a_r_p1_p7 __attribute__ ((aligned(16))); +__m128i a_r_p3_m7 __attribute__ ((aligned(16))); +__m128i a_r_p3_m5 __attribute__ ((aligned(16))); +__m128i a_r_p3_m3 __attribute__ ((aligned(16))); +__m128i a_r_p3_m1 __attribute__ ((aligned(16))); +__m128i a_r_p3_p1 __attribute__ ((aligned(16))); +__m128i a_r_p3_p3 __attribute__ ((aligned(16))); +__m128i a_r_p3_p5 __attribute__ ((aligned(16))); +__m128i a_r_p3_p7 __attribute__ ((aligned(16))); +__m128i a_r_p5_m7 __attribute__ ((aligned(16))); +__m128i a_r_p5_m5 __attribute__ ((aligned(16))); +__m128i a_r_p5_m3 __attribute__ ((aligned(16))); +__m128i a_r_p5_m1 __attribute__ ((aligned(16))); +__m128i a_r_p5_p1 __attribute__ ((aligned(16))); +__m128i a_r_p5_p3 __attribute__ ((aligned(16))); +__m128i a_r_p5_p5 __attribute__ ((aligned(16))); +__m128i a_r_p5_p7 __attribute__ ((aligned(16))); +__m128i a_r_p7_m7 __attribute__ ((aligned(16))); +__m128i a_r_p7_m5 __attribute__ ((aligned(16))); +__m128i a_r_p7_m3 __attribute__ ((aligned(16))); +__m128i a_r_p7_m1 __attribute__ ((aligned(16))); +__m128i a_r_p7_p1 __attribute__ ((aligned(16))); +__m128i a_r_p7_p3 __attribute__ ((aligned(16))); +__m128i a_r_p7_p5 __attribute__ ((aligned(16))); +__m128i a_r_p7_p7 __attribute__ ((aligned(16))); + +__m128i a_i_m7_m7 __attribute__ ((aligned(16))); +__m128i a_i_m7_m5 __attribute__ ((aligned(16))); +__m128i a_i_m7_m3 __attribute__ ((aligned(16))); +__m128i a_i_m7_m1 __attribute__ ((aligned(16))); +__m128i a_i_m7_p1 __attribute__ ((aligned(16))); +__m128i a_i_m7_p3 __attribute__ ((aligned(16))); +__m128i a_i_m7_p5 __attribute__ ((aligned(16))); +__m128i a_i_m7_p7 __attribute__ ((aligned(16))); +__m128i a_i_m5_m7 __attribute__ ((aligned(16))); +__m128i a_i_m5_m5 __attribute__ ((aligned(16))); +__m128i a_i_m5_m3 __attribute__ ((aligned(16))); +__m128i a_i_m5_m1 __attribute__ ((aligned(16))); +__m128i a_i_m5_p1 __attribute__ ((aligned(16))); +__m128i a_i_m5_p3 __attribute__ ((aligned(16))); +__m128i a_i_m5_p5 __attribute__ ((aligned(16))); +__m128i a_i_m5_p7 __attribute__ ((aligned(16))); +__m128i a_i_m3_m7 __attribute__ ((aligned(16))); +__m128i a_i_m3_m5 __attribute__ ((aligned(16))); +__m128i a_i_m3_m3 __attribute__ ((aligned(16))); +__m128i a_i_m3_m1 __attribute__ ((aligned(16))); +__m128i a_i_m3_p1 __attribute__ ((aligned(16))); +__m128i a_i_m3_p3 __attribute__ ((aligned(16))); +__m128i a_i_m3_p5 __attribute__ ((aligned(16))); +__m128i a_i_m3_p7 __attribute__ ((aligned(16))); +__m128i a_i_m1_m7 __attribute__ ((aligned(16))); +__m128i a_i_m1_m5 __attribute__ ((aligned(16))); +__m128i a_i_m1_m3 __attribute__ ((aligned(16))); +__m128i a_i_m1_m1 __attribute__ ((aligned(16))); +__m128i a_i_m1_p1 __attribute__ ((aligned(16))); +__m128i a_i_m1_p3 __attribute__ ((aligned(16))); +__m128i a_i_m1_p5 __attribute__ ((aligned(16))); +__m128i a_i_m1_p7 __attribute__ ((aligned(16))); +__m128i a_i_p1_m7 __attribute__ ((aligned(16))); +__m128i a_i_p1_m5 __attribute__ ((aligned(16))); +__m128i a_i_p1_m3 __attribute__ ((aligned(16))); +__m128i a_i_p1_m1 __attribute__ ((aligned(16))); +__m128i a_i_p1_p1 __attribute__ ((aligned(16))); +__m128i a_i_p1_p3 __attribute__ ((aligned(16))); +__m128i a_i_p1_p5 
__attribute__ ((aligned(16))); +__m128i a_i_p1_p7 __attribute__ ((aligned(16))); +__m128i a_i_p3_m7 __attribute__ ((aligned(16))); +__m128i a_i_p3_m5 __attribute__ ((aligned(16))); +__m128i a_i_p3_m3 __attribute__ ((aligned(16))); +__m128i a_i_p3_m1 __attribute__ ((aligned(16))); +__m128i a_i_p3_p1 __attribute__ ((aligned(16))); +__m128i a_i_p3_p3 __attribute__ ((aligned(16))); +__m128i a_i_p3_p5 __attribute__ ((aligned(16))); +__m128i a_i_p3_p7 __attribute__ ((aligned(16))); +__m128i a_i_p5_m7 __attribute__ ((aligned(16))); +__m128i a_i_p5_m5 __attribute__ ((aligned(16))); +__m128i a_i_p5_m3 __attribute__ ((aligned(16))); +__m128i a_i_p5_m1 __attribute__ ((aligned(16))); +__m128i a_i_p5_p1 __attribute__ ((aligned(16))); +__m128i a_i_p5_p3 __attribute__ ((aligned(16))); +__m128i a_i_p5_p5 __attribute__ ((aligned(16))); +__m128i a_i_p5_p7 __attribute__ ((aligned(16))); +__m128i a_i_p7_m7 __attribute__ ((aligned(16))); +__m128i a_i_p7_m5 __attribute__ ((aligned(16))); +__m128i a_i_p7_m3 __attribute__ ((aligned(16))); +__m128i a_i_p7_m1 __attribute__ ((aligned(16))); +__m128i a_i_p7_p1 __attribute__ ((aligned(16))); +__m128i a_i_p7_p3 __attribute__ ((aligned(16))); +__m128i a_i_p7_p5 __attribute__ ((aligned(16))); +__m128i a_i_p7_p7 __attribute__ ((aligned(16))); + +__m128i psi_a_m7_m7 __attribute__ ((aligned(16))); +__m128i psi_a_m7_m5 __attribute__ ((aligned(16))); +__m128i psi_a_m7_m3 __attribute__ ((aligned(16))); +__m128i psi_a_m7_m1 __attribute__ ((aligned(16))); +__m128i psi_a_m7_p1 __attribute__ ((aligned(16))); +__m128i psi_a_m7_p3 __attribute__ ((aligned(16))); +__m128i psi_a_m7_p5 __attribute__ ((aligned(16))); +__m128i psi_a_m7_p7 __attribute__ ((aligned(16))); +__m128i psi_a_m5_m7 __attribute__ ((aligned(16))); +__m128i psi_a_m5_m5 __attribute__ ((aligned(16))); +__m128i psi_a_m5_m3 __attribute__ ((aligned(16))); +__m128i psi_a_m5_m1 __attribute__ ((aligned(16))); +__m128i psi_a_m5_p1 __attribute__ ((aligned(16))); +__m128i psi_a_m5_p3 __attribute__ ((aligned(16))); +__m128i psi_a_m5_p5 __attribute__ ((aligned(16))); +__m128i psi_a_m5_p7 __attribute__ ((aligned(16))); +__m128i psi_a_m3_m7 __attribute__ ((aligned(16))); +__m128i psi_a_m3_m5 __attribute__ ((aligned(16))); +__m128i psi_a_m3_m3 __attribute__ ((aligned(16))); +__m128i psi_a_m3_m1 __attribute__ ((aligned(16))); +__m128i psi_a_m3_p1 __attribute__ ((aligned(16))); +__m128i psi_a_m3_p3 __attribute__ ((aligned(16))); +__m128i psi_a_m3_p5 __attribute__ ((aligned(16))); +__m128i psi_a_m3_p7 __attribute__ ((aligned(16))); +__m128i psi_a_m1_m7 __attribute__ ((aligned(16))); +__m128i psi_a_m1_m5 __attribute__ ((aligned(16))); +__m128i psi_a_m1_m3 __attribute__ ((aligned(16))); +__m128i psi_a_m1_m1 __attribute__ ((aligned(16))); +__m128i psi_a_m1_p1 __attribute__ ((aligned(16))); +__m128i psi_a_m1_p3 __attribute__ ((aligned(16))); +__m128i psi_a_m1_p5 __attribute__ ((aligned(16))); +__m128i psi_a_m1_p7 __attribute__ ((aligned(16))); +__m128i psi_a_p1_m7 __attribute__ ((aligned(16))); +__m128i psi_a_p1_m5 __attribute__ ((aligned(16))); +__m128i psi_a_p1_m3 __attribute__ ((aligned(16))); +__m128i psi_a_p1_m1 __attribute__ ((aligned(16))); +__m128i psi_a_p1_p1 __attribute__ ((aligned(16))); +__m128i psi_a_p1_p3 __attribute__ ((aligned(16))); +__m128i psi_a_p1_p5 __attribute__ ((aligned(16))); +__m128i psi_a_p1_p7 __attribute__ ((aligned(16))); +__m128i psi_a_p3_m7 __attribute__ ((aligned(16))); +__m128i psi_a_p3_m5 __attribute__ ((aligned(16))); +__m128i psi_a_p3_m3 __attribute__ ((aligned(16))); +__m128i psi_a_p3_m1 __attribute__ 
((aligned(16))); +__m128i psi_a_p3_p1 __attribute__ ((aligned(16))); +__m128i psi_a_p3_p3 __attribute__ ((aligned(16))); +__m128i psi_a_p3_p5 __attribute__ ((aligned(16))); +__m128i psi_a_p3_p7 __attribute__ ((aligned(16))); +__m128i psi_a_p5_m7 __attribute__ ((aligned(16))); +__m128i psi_a_p5_m5 __attribute__ ((aligned(16))); +__m128i psi_a_p5_m3 __attribute__ ((aligned(16))); +__m128i psi_a_p5_m1 __attribute__ ((aligned(16))); +__m128i psi_a_p5_p1 __attribute__ ((aligned(16))); +__m128i psi_a_p5_p3 __attribute__ ((aligned(16))); +__m128i psi_a_p5_p5 __attribute__ ((aligned(16))); +__m128i psi_a_p5_p7 __attribute__ ((aligned(16))); +__m128i psi_a_p7_m7 __attribute__ ((aligned(16))); +__m128i psi_a_p7_m5 __attribute__ ((aligned(16))); +__m128i psi_a_p7_m3 __attribute__ ((aligned(16))); +__m128i psi_a_p7_m1 __attribute__ ((aligned(16))); +__m128i psi_a_p7_p1 __attribute__ ((aligned(16))); +__m128i psi_a_p7_p3 __attribute__ ((aligned(16))); +__m128i psi_a_p7_p5 __attribute__ ((aligned(16))); +__m128i psi_a_p7_p7 __attribute__ ((aligned(16))); + +__m128i a_sq_m7_m7 __attribute__ ((aligned(16))); +__m128i a_sq_m7_m5 __attribute__ ((aligned(16))); +__m128i a_sq_m7_m3 __attribute__ ((aligned(16))); +__m128i a_sq_m7_m1 __attribute__ ((aligned(16))); +__m128i a_sq_m7_p1 __attribute__ ((aligned(16))); +__m128i a_sq_m7_p3 __attribute__ ((aligned(16))); +__m128i a_sq_m7_p5 __attribute__ ((aligned(16))); +__m128i a_sq_m7_p7 __attribute__ ((aligned(16))); +__m128i a_sq_m5_m7 __attribute__ ((aligned(16))); +__m128i a_sq_m5_m5 __attribute__ ((aligned(16))); +__m128i a_sq_m5_m3 __attribute__ ((aligned(16))); +__m128i a_sq_m5_m1 __attribute__ ((aligned(16))); +__m128i a_sq_m5_p1 __attribute__ ((aligned(16))); +__m128i a_sq_m5_p3 __attribute__ ((aligned(16))); +__m128i a_sq_m5_p5 __attribute__ ((aligned(16))); +__m128i a_sq_m5_p7 __attribute__ ((aligned(16))); +__m128i a_sq_m3_m7 __attribute__ ((aligned(16))); +__m128i a_sq_m3_m5 __attribute__ ((aligned(16))); +__m128i a_sq_m3_m3 __attribute__ ((aligned(16))); +__m128i a_sq_m3_m1 __attribute__ ((aligned(16))); +__m128i a_sq_m3_p1 __attribute__ ((aligned(16))); +__m128i a_sq_m3_p3 __attribute__ ((aligned(16))); +__m128i a_sq_m3_p5 __attribute__ ((aligned(16))); +__m128i a_sq_m3_p7 __attribute__ ((aligned(16))); +__m128i a_sq_m1_m7 __attribute__ ((aligned(16))); +__m128i a_sq_m1_m5 __attribute__ ((aligned(16))); +__m128i a_sq_m1_m3 __attribute__ ((aligned(16))); +__m128i a_sq_m1_m1 __attribute__ ((aligned(16))); +__m128i a_sq_m1_p1 __attribute__ ((aligned(16))); +__m128i a_sq_m1_p3 __attribute__ ((aligned(16))); +__m128i a_sq_m1_p5 __attribute__ ((aligned(16))); +__m128i a_sq_m1_p7 __attribute__ ((aligned(16))); +__m128i a_sq_p1_m7 __attribute__ ((aligned(16))); +__m128i a_sq_p1_m5 __attribute__ ((aligned(16))); +__m128i a_sq_p1_m3 __attribute__ ((aligned(16))); +__m128i a_sq_p1_m1 __attribute__ ((aligned(16))); +__m128i a_sq_p1_p1 __attribute__ ((aligned(16))); +__m128i a_sq_p1_p3 __attribute__ ((aligned(16))); +__m128i a_sq_p1_p5 __attribute__ ((aligned(16))); +__m128i a_sq_p1_p7 __attribute__ ((aligned(16))); +__m128i a_sq_p3_m7 __attribute__ ((aligned(16))); +__m128i a_sq_p3_m5 __attribute__ ((aligned(16))); +__m128i a_sq_p3_m3 __attribute__ ((aligned(16))); +__m128i a_sq_p3_m1 __attribute__ ((aligned(16))); +__m128i a_sq_p3_p1 __attribute__ ((aligned(16))); +__m128i a_sq_p3_p3 __attribute__ ((aligned(16))); +__m128i a_sq_p3_p5 __attribute__ ((aligned(16))); +__m128i a_sq_p3_p7 __attribute__ ((aligned(16))); +__m128i a_sq_p5_m7 __attribute__ 
((aligned(16))); +__m128i a_sq_p5_m5 __attribute__ ((aligned(16))); +__m128i a_sq_p5_m3 __attribute__ ((aligned(16))); +__m128i a_sq_p5_m1 __attribute__ ((aligned(16))); +__m128i a_sq_p5_p1 __attribute__ ((aligned(16))); +__m128i a_sq_p5_p3 __attribute__ ((aligned(16))); +__m128i a_sq_p5_p5 __attribute__ ((aligned(16))); +__m128i a_sq_p5_p7 __attribute__ ((aligned(16))); +__m128i a_sq_p7_m7 __attribute__ ((aligned(16))); +__m128i a_sq_p7_m5 __attribute__ ((aligned(16))); +__m128i a_sq_p7_m3 __attribute__ ((aligned(16))); +__m128i a_sq_p7_m1 __attribute__ ((aligned(16))); +__m128i a_sq_p7_p1 __attribute__ ((aligned(16))); +__m128i a_sq_p7_p3 __attribute__ ((aligned(16))); +__m128i a_sq_p7_p5 __attribute__ ((aligned(16))); +__m128i a_sq_p7_p7 __attribute__ ((aligned(16))); + +__m128i bit_met_m7_m7 __attribute__ ((aligned(16))); +__m128i bit_met_m7_m5 __attribute__ ((aligned(16))); +__m128i bit_met_m7_m3 __attribute__ ((aligned(16))); +__m128i bit_met_m7_m1 __attribute__ ((aligned(16))); +__m128i bit_met_m7_p1 __attribute__ ((aligned(16))); +__m128i bit_met_m7_p3 __attribute__ ((aligned(16))); +__m128i bit_met_m7_p5 __attribute__ ((aligned(16))); +__m128i bit_met_m7_p7 __attribute__ ((aligned(16))); +__m128i bit_met_m5_m7 __attribute__ ((aligned(16))); +__m128i bit_met_m5_m5 __attribute__ ((aligned(16))); +__m128i bit_met_m5_m3 __attribute__ ((aligned(16))); +__m128i bit_met_m5_m1 __attribute__ ((aligned(16))); +__m128i bit_met_m5_p1 __attribute__ ((aligned(16))); +__m128i bit_met_m5_p3 __attribute__ ((aligned(16))); +__m128i bit_met_m5_p5 __attribute__ ((aligned(16))); +__m128i bit_met_m5_p7 __attribute__ ((aligned(16))); +__m128i bit_met_m3_m7 __attribute__ ((aligned(16))); +__m128i bit_met_m3_m5 __attribute__ ((aligned(16))); +__m128i bit_met_m3_m3 __attribute__ ((aligned(16))); +__m128i bit_met_m3_m1 __attribute__ ((aligned(16))); +__m128i bit_met_m3_p1 __attribute__ ((aligned(16))); +__m128i bit_met_m3_p3 __attribute__ ((aligned(16))); +__m128i bit_met_m3_p5 __attribute__ ((aligned(16))); +__m128i bit_met_m3_p7 __attribute__ ((aligned(16))); +__m128i bit_met_m1_m7 __attribute__ ((aligned(16))); +__m128i bit_met_m1_m5 __attribute__ ((aligned(16))); +__m128i bit_met_m1_m3 __attribute__ ((aligned(16))); +__m128i bit_met_m1_m1 __attribute__ ((aligned(16))); +__m128i bit_met_m1_p1 __attribute__ ((aligned(16))); +__m128i bit_met_m1_p3 __attribute__ ((aligned(16))); +__m128i bit_met_m1_p5 __attribute__ ((aligned(16))); +__m128i bit_met_m1_p7 __attribute__ ((aligned(16))); +__m128i bit_met_p1_m7 __attribute__ ((aligned(16))); +__m128i bit_met_p1_m5 __attribute__ ((aligned(16))); +__m128i bit_met_p1_m3 __attribute__ ((aligned(16))); +__m128i bit_met_p1_m1 __attribute__ ((aligned(16))); +__m128i bit_met_p1_p1 __attribute__ ((aligned(16))); +__m128i bit_met_p1_p3 __attribute__ ((aligned(16))); +__m128i bit_met_p1_p5 __attribute__ ((aligned(16))); +__m128i bit_met_p1_p7 __attribute__ ((aligned(16))); +__m128i bit_met_p3_m7 __attribute__ ((aligned(16))); +__m128i bit_met_p3_m5 __attribute__ ((aligned(16))); +__m128i bit_met_p3_m3 __attribute__ ((aligned(16))); +__m128i bit_met_p3_m1 __attribute__ ((aligned(16))); +__m128i bit_met_p3_p1 __attribute__ ((aligned(16))); +__m128i bit_met_p3_p3 __attribute__ ((aligned(16))); +__m128i bit_met_p3_p5 __attribute__ ((aligned(16))); +__m128i bit_met_p3_p7 __attribute__ ((aligned(16))); +__m128i bit_met_p5_m7 __attribute__ ((aligned(16))); +__m128i bit_met_p5_m5 __attribute__ ((aligned(16))); +__m128i bit_met_p5_m3 __attribute__ ((aligned(16))); +__m128i 
bit_met_p5_m1 __attribute__ ((aligned(16))); +__m128i bit_met_p5_p1 __attribute__ ((aligned(16))); +__m128i bit_met_p5_p3 __attribute__ ((aligned(16))); +__m128i bit_met_p5_p5 __attribute__ ((aligned(16))); +__m128i bit_met_p5_p7 __attribute__ ((aligned(16))); +__m128i bit_met_p7_m7 __attribute__ ((aligned(16))); +__m128i bit_met_p7_m5 __attribute__ ((aligned(16))); +__m128i bit_met_p7_m3 __attribute__ ((aligned(16))); +__m128i bit_met_p7_m1 __attribute__ ((aligned(16))); +__m128i bit_met_p7_p1 __attribute__ ((aligned(16))); +__m128i bit_met_p7_p3 __attribute__ ((aligned(16))); +__m128i bit_met_p7_p5 __attribute__ ((aligned(16))); +__m128i bit_met_p7_p7 __attribute__ ((aligned(16))); + +__m128i y0_p_1_1 __attribute__ ((aligned(16))); +__m128i y0_p_1_3 __attribute__ ((aligned(16))); +__m128i y0_p_1_5 __attribute__ ((aligned(16))); +__m128i y0_p_1_7 __attribute__ ((aligned(16))); +__m128i y0_p_3_1 __attribute__ ((aligned(16))); +__m128i y0_p_3_3 __attribute__ ((aligned(16))); +__m128i y0_p_3_5 __attribute__ ((aligned(16))); +__m128i y0_p_3_7 __attribute__ ((aligned(16))); +__m128i y0_p_5_1 __attribute__ ((aligned(16))); +__m128i y0_p_5_3 __attribute__ ((aligned(16))); +__m128i y0_p_5_5 __attribute__ ((aligned(16))); +__m128i y0_p_5_7 __attribute__ ((aligned(16))); +__m128i y0_p_7_1 __attribute__ ((aligned(16))); +__m128i y0_p_7_3 __attribute__ ((aligned(16))); +__m128i y0_p_7_5 __attribute__ ((aligned(16))); +__m128i y0_p_7_7 __attribute__ ((aligned(16))); +__m128i y0_m_1_1 __attribute__ ((aligned(16))); +__m128i y0_m_1_3 __attribute__ ((aligned(16))); +__m128i y0_m_1_5 __attribute__ ((aligned(16))); +__m128i y0_m_1_7 __attribute__ ((aligned(16))); +__m128i y0_m_3_1 __attribute__ ((aligned(16))); +__m128i y0_m_3_3 __attribute__ ((aligned(16))); +__m128i y0_m_3_5 __attribute__ ((aligned(16))); +__m128i y0_m_3_7 __attribute__ ((aligned(16))); +__m128i y0_m_5_1 __attribute__ ((aligned(16))); +__m128i y0_m_5_3 __attribute__ ((aligned(16))); +__m128i y0_m_5_5 __attribute__ ((aligned(16))); +__m128i y0_m_5_7 __attribute__ ((aligned(16))); +__m128i y0_m_7_1 __attribute__ ((aligned(16))); +__m128i y0_m_7_3 __attribute__ ((aligned(16))); +__m128i y0_m_7_5 __attribute__ ((aligned(16))); +__m128i y0_m_7_7 __attribute__ ((aligned(16))); + +__m128i xmm0 __attribute__ ((aligned(16))); +__m128i xmm1 __attribute__ ((aligned(16))); +__m128i xmm2 __attribute__ ((aligned(16))); +__m128i xmm3 __attribute__ ((aligned(16))); +__m128i xmm4 __attribute__ ((aligned(16))); +__m128i xmm5 __attribute__ ((aligned(16))); +__m128i xmm6 __attribute__ ((aligned(16))); +__m128i xmm7 __attribute__ ((aligned(16))); +__m128i xmm8 __attribute__ ((aligned(16))); + +__m128i y0r __attribute__ ((aligned(16))); +__m128i y0i __attribute__ ((aligned(16))); +__m128i y1r __attribute__ ((aligned(16))); +__m128i y1i __attribute__ ((aligned(16))); +__m128i y2r __attribute__ ((aligned(16))); +__m128i y2i __attribute__ ((aligned(16))); + +__m128i logmax_num_re0 __attribute__ ((aligned(16))); +__m128i logmax_num_im0 __attribute__ ((aligned(16))); +__m128i logmax_den_re0 __attribute__ ((aligned(16))); +__m128i logmax_den_im0 __attribute__ ((aligned(16))); +__m128i logmax_num_re1 __attribute__ ((aligned(16))); +__m128i logmax_num_im1 __attribute__ ((aligned(16))); +__m128i logmax_den_re1 __attribute__ ((aligned(16))); +__m128i logmax_den_im1 __attribute__ ((aligned(16))); + +__m128i tmp_result __attribute__ ((aligned(16))); +__m128i tmp_result2 __attribute__ ((aligned(16))); +__m128i tmp_result3 __attribute__ ((aligned(16))); +__m128i 
tmp_result4 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC short zero[8] __attribute__ ((aligned(16))) = {0,0,0,0,0,0,0,0}; -NOCYGWIN_STATIC short ones[8] __attribute__ ((aligned(16))) = {0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff}; -NOCYGWIN_STATIC __m128i rho_rpi __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rmi __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rpi_1_1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rpi_1_3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rpi_1_5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rpi_1_7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rpi_3_1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rpi_3_3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rpi_3_5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rpi_3_7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rpi_5_1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rpi_5_3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rpi_5_5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rpi_5_7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rpi_7_1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rpi_7_3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rpi_7_5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rpi_7_7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rmi_1_1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rmi_1_3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rmi_1_5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rmi_1_7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rmi_3_1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rmi_3_3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rmi_3_5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rmi_3_7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rmi_5_1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rmi_5_3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rmi_5_5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rmi_5_7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rmi_7_1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rmi_7_3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rmi_7_5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i rho_rmi_7_7 __attribute__ ((aligned(16))); - -NOCYGWIN_STATIC __m128i psi_r_m7_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m7_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m7_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m7_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m7_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m7_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m7_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m7_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m5_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m5_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m5_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m5_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m5_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m5_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m5_p5 
__attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m5_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m3_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m3_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m3_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m3_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m3_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m3_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m3_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m3_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m1_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m1_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m1_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m1_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m1_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m1_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m1_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_m1_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p1_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p1_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p1_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p1_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p1_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p1_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p1_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p1_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p3_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p3_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p3_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p3_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p3_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p3_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p3_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p3_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p5_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p5_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p5_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p5_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p5_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p5_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p5_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p5_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p7_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p7_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p7_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p7_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p7_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p7_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p7_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_r_p7_p7 __attribute__ ((aligned(16))); - -NOCYGWIN_STATIC __m128i psi_i_m7_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m7_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m7_m3 __attribute__ 
((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m7_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m7_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m7_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m7_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m7_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m5_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m5_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m5_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m5_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m5_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m5_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m5_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m5_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m3_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m3_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m3_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m3_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m3_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m3_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m3_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m3_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m1_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m1_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m1_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m1_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m1_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m1_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m1_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_m1_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p1_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p1_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p1_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p1_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p1_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p1_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p1_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p1_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p3_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p3_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p3_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p3_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p3_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p3_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p3_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p3_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p5_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p5_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p5_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p5_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p5_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p5_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p5_p5 __attribute__ ((aligned(16))); 
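The long run of deletions above and below retires the file-scope SSE scratch variables (rho_rpi_*, psi_r_*, psi_i_*, a_r_*, and so on); later hunks in this patch re-create what each kernel still needs as function-local, architecture-guarded variables. A minimal sketch of that dispatch idiom, assuming GCC-style predefined macros; the names simd_q15 and set1_q15 are illustrative helpers, not identifiers from the patch:

#if defined(__x86_64__) || defined(__i386__)
#include <emmintrin.h>
typedef __m128i simd_q15;              /* 8 x 16-bit lanes via SSE2 */
#define set1_q15(c) _mm_set1_epi16(c)
#elif defined(__arm__)
#include <arm_neon.h>
typedef int16x8_t simd_q15;            /* 8 x 16-bit lanes via NEON */
#define set1_q15(c) vdupq_n_s16(c)
#endif

static inline simd_q15 one_over_sqrt2_q15(void)
{
  /* formerly a NOCYGWIN_STATIC file-scope __m128i assigned inside each
     kernel; a local initialized at declaration is also reentrant */
  return set1_q15(23170); /* round(2^15/sqrt(2)) in Q15 */
}

Making these locals rather than globals also lets the compiler keep them in registers instead of reloading them from memory on every use.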
-NOCYGWIN_STATIC __m128i psi_i_p5_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p7_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p7_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p7_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p7_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p7_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p7_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p7_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_i_p7_p7 __attribute__ ((aligned(16))); - -NOCYGWIN_STATIC __m128i a_r_m7_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m7_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m7_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m7_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m7_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m7_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m7_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m7_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m5_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m5_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m5_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m5_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m5_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m5_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m5_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m5_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m3_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m3_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m3_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m3_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m3_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m3_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m3_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m3_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m1_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m1_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m1_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m1_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m1_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m1_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m1_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_m1_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p1_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p1_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p1_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p1_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p1_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p1_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p1_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p1_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p3_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p3_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p3_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p3_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p3_p1 
__attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p3_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p3_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p3_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p5_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p5_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p5_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p5_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p5_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p5_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p5_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p5_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p7_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p7_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p7_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p7_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p7_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p7_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p7_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_r_p7_p7 __attribute__ ((aligned(16))); - -NOCYGWIN_STATIC __m128i a_i_m7_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m7_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m7_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m7_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m7_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m7_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m7_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m7_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m5_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m5_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m5_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m5_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m5_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m5_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m5_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m5_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m3_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m3_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m3_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m3_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m3_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m3_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m3_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m3_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m1_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m1_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m1_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m1_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m1_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m1_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m1_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_m1_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p1_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p1_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC 
__m128i a_i_p1_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p1_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p1_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p1_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p1_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p1_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p3_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p3_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p3_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p3_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p3_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p3_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p3_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p3_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p5_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p5_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p5_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p5_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p5_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p5_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p5_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p5_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p7_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p7_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p7_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p7_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p7_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p7_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p7_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_i_p7_p7 __attribute__ ((aligned(16))); - -NOCYGWIN_STATIC __m128i psi_a_m7_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m7_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m7_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m7_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m7_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m7_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m7_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m7_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m5_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m5_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m5_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m5_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m5_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m5_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m5_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m5_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m3_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m3_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m3_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m3_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m3_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m3_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m3_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i 
psi_a_m3_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m1_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m1_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m1_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m1_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m1_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m1_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m1_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_m1_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p1_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p1_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p1_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p1_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p1_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p1_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p1_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p1_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p3_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p3_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p3_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p3_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p3_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p3_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p3_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p3_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p5_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p5_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p5_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p5_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p5_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p5_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p5_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p5_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p7_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p7_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p7_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p7_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p7_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p7_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p7_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i psi_a_p7_p7 __attribute__ ((aligned(16))); - -NOCYGWIN_STATIC __m128i a_sq_m7_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m7_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m7_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m7_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m7_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m7_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m7_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m7_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m5_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m5_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m5_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m5_m1 __attribute__ 
((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m5_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m5_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m5_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m5_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m3_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m3_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m3_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m3_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m3_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m3_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m3_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m3_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m1_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m1_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m1_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m1_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m1_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m1_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m1_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_m1_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p1_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p1_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p1_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p1_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p1_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p1_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p1_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p1_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p3_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p3_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p3_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p3_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p3_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p3_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p3_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p3_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p5_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p5_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p5_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p5_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p5_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p5_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p5_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p5_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p7_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p7_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p7_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p7_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p7_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p7_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p7_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i a_sq_p7_p7 __attribute__ ((aligned(16))); - -NOCYGWIN_STATIC __m128i bit_met_m7_m7 
__attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m7_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m7_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m7_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m7_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m7_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m7_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m7_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m5_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m5_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m5_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m5_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m5_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m5_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m5_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m5_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m3_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m3_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m3_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m3_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m3_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m3_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m3_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m3_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m1_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m1_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m1_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m1_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m1_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m1_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m1_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_m1_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p1_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p1_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p1_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p1_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p1_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p1_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p1_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p1_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p3_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p3_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p3_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p3_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p3_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p3_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p3_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p3_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p5_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p5_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p5_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC 
__m128i bit_met_p5_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p5_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p5_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p5_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p5_p7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p7_m7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p7_m5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p7_m3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p7_m1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p7_p1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p7_p3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p7_p5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i bit_met_p7_p7 __attribute__ ((aligned(16))); - -NOCYGWIN_STATIC __m128i y0_p_1_1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_p_1_3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_p_1_5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_p_1_7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_p_3_1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_p_3_3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_p_3_5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_p_3_7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_p_5_1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_p_5_3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_p_5_5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_p_5_7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_p_7_1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_p_7_3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_p_7_5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_p_7_7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_m_1_1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_m_1_3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_m_1_5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_m_1_7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_m_3_1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_m_3_3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_m_3_5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_m_3_7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_m_5_1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_m_5_3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_m_5_5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_m_5_7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_m_7_1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_m_7_3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_m_7_5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0_m_7_7 __attribute__ ((aligned(16))); - -NOCYGWIN_STATIC __m128i xmm0 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i xmm1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i xmm2 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i xmm3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i xmm4 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i xmm5 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i xmm6 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i xmm7 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i xmm8 __attribute__ ((aligned(16))); - -NOCYGWIN_STATIC __m128i 
y0r __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0i __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y1r __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y1i __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y2r __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y2i __attribute__ ((aligned(16))); - -NOCYGWIN_STATIC __m128i logmax_num_re0 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i logmax_num_im0 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i logmax_den_re0 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i logmax_den_im0 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i logmax_num_re1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i logmax_num_im1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i logmax_den_re1 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i logmax_den_im1 __attribute__ ((aligned(16))); - -NOCYGWIN_STATIC __m128i tmp_result __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i tmp_result2 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i tmp_result3 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i tmp_result4 __attribute__ ((aligned(16))); //============================================================================================== // Auxiliary Makros @@ -625,6 +621,10 @@ NOCYGWIN_STATIC __m128i tmp_result4 __attribute__ ((aligned(16))); // calculates a_sq = int_ch_mag*(a_r^2 + a_i^2)*scale_factor for 64-QAM #define square_a_64qam_epi16(a_r,a_i,int_ch_mag,scale_factor,a_sq) tmp_result = _mm_mulhi_epi16(a_r,a_r); tmp_result = _mm_slli_epi16(tmp_result,1); tmp_result = _mm_mulhi_epi16(tmp_result,scale_factor); tmp_result = _mm_slli_epi16(tmp_result,3); tmp_result = _mm_mulhi_epi16(tmp_result,int_ch_mag); tmp_result = _mm_slli_epi16(tmp_result,1); tmp_result2 = _mm_mulhi_epi16(a_i,a_i); tmp_result2 = _mm_slli_epi16(tmp_result2,1); tmp_result2 = _mm_mulhi_epi16(tmp_result2,scale_factor); tmp_result2 = _mm_slli_epi16(tmp_result2,3); tmp_result2 = _mm_mulhi_epi16(tmp_result2,int_ch_mag); tmp_result2 = _mm_slli_epi16(tmp_result2,1); a_sq = _mm_adds_epi16(tmp_result,tmp_result2); +#elif defined(__arm__) + +#endif + //============================================================================================== // SINGLE-STREAM //============================================================================================== @@ -634,16 +634,16 @@ NOCYGWIN_STATIC __m128i tmp_result4 __attribute__ ((aligned(16))); //---------------------------------------------------------------------------------------------- int dlsch_qpsk_llr(LTE_DL_FRAME_PARMS *frame_parms, - int **rxdataF_comp, - short *dlsch_llr, - unsigned char symbol, + int32_t **rxdataF_comp, + int16_t *dlsch_llr, + uint8_t symbol, uint8_t first_symbol_flag, uint16_t nb_rb, uint16_t pbch_pss_sss_adjust, - short **llr32p) + int16_t **llr32p) { - uint32_t *rxF = (uint32_t*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; + uint32_t *rxF = (uint32_t*)&rxdataF_comp[0][((int32_t)symbol*frame_parms->N_RB_DL*12)]; uint32_t *llr32; int i,len; uint8_t symbol_mod = (symbol >= (7-frame_parms->Ncp))? 
(symbol-(7-frame_parms->Ncp)) : symbol; @@ -669,17 +669,14 @@ int dlsch_qpsk_llr(LTE_DL_FRAME_PARMS *frame_parms, len = (nb_rb*12);// - pbch_pss_sss_adjust; } - // printf("dlsch_qpsk_llr: symbol %d,nb_rb %d, len %d,pbch_pss_sss_adjust %d\n",symbol,nb_rb,len,pbch_pss_sss_adjust); +// printf("dlsch_qpsk_llr: symbol %d,nb_rb %d, len %d,pbch_pss_sss_adjust %d\n",symbol,nb_rb,len,pbch_pss_sss_adjust); for (i=0; i<len; i++) { *llr32 = *rxF; rxF++; llr32++; } - *llr32p = (short *)llr32; - - _mm_empty(); - _m_empty(); + *llr32p = (int16_t *)llr32; return(0); } @@ -689,34 +686,54 @@ int dlsch_qpsk_llr(LTE_DL_FRAME_PARMS *frame_parms, //---------------------------------------------------------------------------------------------- void dlsch_16qam_llr(LTE_DL_FRAME_PARMS *frame_parms, - int **rxdataF_comp, - short *dlsch_llr, - int **dl_ch_mag, - unsigned char symbol, + int32_t **rxdataF_comp, + int16_t *dlsch_llr, + int32_t **dl_ch_mag, + uint8_t symbol, uint8_t first_symbol_flag, - unsigned short nb_rb, + uint16_t nb_rb, uint16_t pbch_pss_sss_adjust, int16_t **llr32p) { +#if defined(__x86_64__) || defined(__i386__) __m128i *rxF = (__m128i*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; __m128i *ch_mag; __m128i llr128[2]; + uint32_t *llr32; +#elif defined(__arm__) + int16x8_t *rxF = (int16x8_t*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; + int16x8_t *ch_mag; + int16x8_t xmm0; + int16_t *llr16; +#endif + + int i,len; unsigned char symbol_mod,len_mod4=0; - uint32_t *llr32; +#if defined(__x86_64__) || defined(__i386__) if (first_symbol_flag==1) { llr32 = (uint32_t*)dlsch_llr; } else { llr32 = (uint32_t*)*llr32p; } +#elif defined(__arm__) + if (first_symbol_flag==1) { + llr16 = (int16_t*)dlsch_llr; + } else { + llr16 = (int16_t*)*llr32p; + } +#endif symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? 
symbol-(7-frame_parms->Ncp) : symbol; +#if defined(__x86_64__) || defined(__i386__) ch_mag = (__m128i*)&dl_ch_mag[0][(symbol*frame_parms->N_RB_DL*12)]; - +#elif defined(__arm__) + ch_mag = (int16x8_t*)&dl_ch_mag[0][(symbol*frame_parms->N_RB_DL*12)]; +#endif if ((symbol_mod==0) || (symbol_mod==(4-frame_parms->Ncp))) { if (frame_parms->mode1_flag==0) len = nb_rb*8 - (2*pbch_pss_sss_adjust/3); @@ -738,6 +755,7 @@ void dlsch_16qam_llr(LTE_DL_FRAME_PARMS *frame_parms, for (i=0; i<len; i++) { +#if defined(__x86_64__) || defined(__i386__) xmm0 = _mm_abs_epi16(rxF[i]); xmm0 = _mm_subs_epi16(ch_mag[i],xmm0); @@ -753,10 +771,36 @@ void dlsch_16qam_llr(LTE_DL_FRAME_PARMS *frame_parms, llr32[6] = _mm_extract_epi32(llr128[1],2); //((uint32_t *)&llr128[1])[2]; llr32[7] = _mm_extract_epi32(llr128[1],3); //((uint32_t *)&llr128[1])[3]; llr32+=8; +#elif defined(__arm__) + xmm0 = vabsq_s16(rxF[i]); + xmm0 = vqsubq_s16(ch_mag[i],xmm0); + // lambda_1=y_R, lambda_2=|y_R|-|h|^2, lambda_3=y_I, lambda_4=|y_I|-|h|^2 + + llr16[0] = vgetq_lane_s16(rxF[i],0); + llr16[1] = vgetq_lane_s16(rxF[i],1); + llr16[2] = vgetq_lane_s16(xmm0,0); + llr16[3] = vgetq_lane_s16(xmm0,1); + llr16[4] = vgetq_lane_s16(rxF[i],2); + llr16[5] = vgetq_lane_s16(rxF[i],3); + llr16[6] = vgetq_lane_s16(xmm0,2); + llr16[7] = vgetq_lane_s16(xmm0,3); + llr16[8] = vgetq_lane_s16(rxF[i],4); + llr16[9] = vgetq_lane_s16(rxF[i],5); + llr16[10] = vgetq_lane_s16(xmm0,4); + llr16[11] = vgetq_lane_s16(xmm0,5); + llr16[12] = vgetq_lane_s16(rxF[i],6); + llr16[13] = vgetq_lane_s16(rxF[i],7); + llr16[14] = vgetq_lane_s16(xmm0,6); + llr16[15] = vgetq_lane_s16(xmm0,7); + llr16+=16; +#endif + } +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); +#endif } //---------------------------------------------------------------------------------------------- @@ -764,19 +808,23 @@ void dlsch_16qam_llr(LTE_DL_FRAME_PARMS *frame_parms, //---------------------------------------------------------------------------------------------- void dlsch_64qam_llr(LTE_DL_FRAME_PARMS *frame_parms, - int **rxdataF_comp, - short *dlsch_llr, - int **dl_ch_mag, - int **dl_ch_magb, - unsigned char symbol, + int32_t **rxdataF_comp, + int16_t *dlsch_llr, + int32_t **dl_ch_mag, + int32_t **dl_ch_magb, + uint8_t symbol, uint8_t first_symbol_flag, - unsigned short nb_rb, + uint16_t nb_rb, uint16_t pbch_pss_sss_adjust, - short **llr_save) + int16_t **llr_save) { - +#if defined(__x86_64__) || defined(__i386__) __m128i *rxF = (__m128i*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; __m128i *ch_mag,*ch_magb; +#elif defined(__arm__) + int16x8_t *rxF = (int16x8_t*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; + int16x8_t *ch_mag,*ch_magb,xmm1,xmm2; +#endif int i,len,len2; unsigned char symbol_mod,len_mod4; short *llr; @@ -789,9 +837,13 @@ void dlsch_64qam_llr(LTE_DL_FRAME_PARMS *frame_parms, symbol_mod = (symbol>=(7-frame_parms->Ncp)) ?
symbol-(7-frame_parms->Ncp) : symbol; +#if defined(__x86_64__) || defined(__i386__) ch_mag = (__m128i*)&dl_ch_mag[0][(symbol*frame_parms->N_RB_DL*12)]; ch_magb = (__m128i*)&dl_ch_magb[0][(symbol*frame_parms->N_RB_DL*12)]; - +#elif defined(__arm__) + ch_mag = (int16x8_t*)&dl_ch_mag[0][(symbol*frame_parms->N_RB_DL*12)]; + ch_magb = (int16x8_t*)&dl_ch_magb[0][(symbol*frame_parms->N_RB_DL*12)]; +#endif if ((symbol_mod==0) || (symbol_mod==(4-frame_parms->Ncp))) { if (frame_parms->mode1_flag==0) len = nb_rb*8 - (2*pbch_pss_sss_adjust/3); @@ -810,62 +862,96 @@ void dlsch_64qam_llr(LTE_DL_FRAME_PARMS *frame_parms, for (i=0; i<len2; i++) { +#if defined(__x86_64__) || defined(__i386__) xmm1 = _mm_abs_epi16(rxF[i]); xmm1 = _mm_subs_epi16(ch_mag[i],xmm1); xmm2 = _mm_abs_epi16(xmm1); xmm2 = _mm_subs_epi16(ch_magb[i],xmm2); - +#elif defined(__arm__) + xmm1 = vabsq_s16(rxF[i]); + xmm1 = vsubq_s16(ch_mag[i],xmm1); + xmm2 = vabsq_s16(xmm1); + xmm2 = vsubq_s16(ch_magb[i],xmm2); +#endif // loop over all LLRs in quad word (24 coded bits) /* - for (j=0;j<8;j+=2) { - llr2[0] = ((short *)&rxF[i])[j]; - llr2[1] = ((short *)&rxF[i])[j+1]; - llr2[2] = ((short *)&xmm1)[j]; - llr2[3] = ((short *)&xmm1)[j+1]; - llr2[4] = ((short *)&xmm2)[j]; - llr2[5] = ((short *)&xmm2)[j+1]; - - llr2+=6; - } + for (j=0;j<8;j+=2) { + llr2[0] = ((short *)&rxF[i])[j]; + llr2[1] = ((short *)&rxF[i])[j+1]; + llr2[2] = ((short *)&xmm1)[j]; + llr2[3] = ((short *)&xmm1)[j+1]; + llr2[4] = ((short *)&xmm2)[j]; + llr2[5] = ((short *)&xmm2)[j+1]; + + llr2+=6; + } */ llr2[0] = ((short *)&rxF[i])[0]; llr2[1] = ((short *)&rxF[i])[1]; +#if defined(__x86_64__) || defined(__i386__) llr2[2] = _mm_extract_epi16(xmm1,0); llr2[3] = _mm_extract_epi16(xmm1,1);//((short *)&xmm1)[j+1]; llr2[4] = _mm_extract_epi16(xmm2,0);//((short *)&xmm2)[j]; llr2[5] = _mm_extract_epi16(xmm2,1);//((short *)&xmm2)[j+1]; +#elif defined(__arm__) + llr2[2] = vgetq_lane_s16(xmm1,0); + llr2[3] = vgetq_lane_s16(xmm1,1);//((short *)&xmm1)[j+1]; + llr2[4] = vgetq_lane_s16(xmm2,0);//((short *)&xmm2)[j]; + llr2[5] = vgetq_lane_s16(xmm2,1);//((short *)&xmm2)[j+1]; +#endif llr2+=6; llr2[0] = ((short *)&rxF[i])[2]; llr2[1] = ((short *)&rxF[i])[3]; +#if defined(__x86_64__) || defined(__i386__) llr2[2] = _mm_extract_epi16(xmm1,2); llr2[3] = _mm_extract_epi16(xmm1,3);//((short *)&xmm1)[j+1]; llr2[4] = _mm_extract_epi16(xmm2,2);//((short *)&xmm2)[j]; llr2[5] = _mm_extract_epi16(xmm2,3);//((short *)&xmm2)[j+1]; +#elif defined(__arm__) + llr2[2] = vgetq_lane_s16(xmm1,2); + llr2[3] = vgetq_lane_s16(xmm1,3);//((short *)&xmm1)[j+1]; + llr2[4] = vgetq_lane_s16(xmm2,2);//((short *)&xmm2)[j]; + llr2[5] = vgetq_lane_s16(xmm2,3);//((short *)&xmm2)[j+1]; +#endif llr2+=6; llr2[0] = ((short *)&rxF[i])[4]; llr2[1] = ((short *)&rxF[i])[5]; +#if defined(__x86_64__) || defined(__i386__) llr2[2] = _mm_extract_epi16(xmm1,4); llr2[3] = _mm_extract_epi16(xmm1,5);//((short *)&xmm1)[j+1]; llr2[4] = _mm_extract_epi16(xmm2,4);//((short *)&xmm2)[j]; llr2[5] = _mm_extract_epi16(xmm2,5);//((short *)&xmm2)[j+1]; - +#elif defined(__arm__) + llr2[2] = vgetq_lane_s16(xmm1,4); + llr2[3] = vgetq_lane_s16(xmm1,5);//((short *)&xmm1)[j+1]; + llr2[4] = vgetq_lane_s16(xmm2,4);//((short *)&xmm2)[j]; + llr2[5] = vgetq_lane_s16(xmm2,5);//((short *)&xmm2)[j+1]; +#endif llr2+=6; llr2[0] = ((short *)&rxF[i])[6]; llr2[1] = ((short *)&rxF[i])[7]; +#if defined(__x86_64__) || defined(__i386__) llr2[2] = _mm_extract_epi16(xmm1,6); llr2[3] = _mm_extract_epi16(xmm1,7);//((short *)&xmm1)[j+1]; llr2[4] = _mm_extract_epi16(xmm2,6);//((short 
*)&xmm2)[j]; llr2[5] = _mm_extract_epi16(xmm2,7);//((short *)&xmm2)[j+1]; - +#elif defined(__arm__) + llr2[2] = vgetq_lane_s16(xmm1,6); + llr2[3] = vgetq_lane_s16(xmm1,7);//((short *)&xmm1)[j+1]; + llr2[4] = vgetq_lane_s16(xmm2,6);//((short *)&xmm2)[j]; + llr2[5] = vgetq_lane_s16(xmm2,7);//((short *)&xmm2)[j+1]; +#endif llr2+=6; } *llr_save = llr; +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); +#endif } @@ -877,20 +963,22 @@ void dlsch_64qam_llr(LTE_DL_FRAME_PARMS *frame_parms, // QPSK //---------------------------------------------------------------------------------------------- -NOCYGWIN_STATIC __m128i y0r_over2 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0i_over2 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y1r_over2 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y1i_over2 __attribute__ ((aligned(16))); - -NOCYGWIN_STATIC __m128i A __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i B __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i C __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i D __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i E __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i F __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i G __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i H __attribute__ ((aligned(16))); +#if defined(__x86_64__) || defined(__i386) +__m128i y0r_over2 __attribute__ ((aligned(16))); +__m128i y0i_over2 __attribute__ ((aligned(16))); +__m128i y1r_over2 __attribute__ ((aligned(16))); +__m128i y1i_over2 __attribute__ ((aligned(16))); + +__m128i A __attribute__ ((aligned(16))); +__m128i B __attribute__ ((aligned(16))); +__m128i C __attribute__ ((aligned(16))); +__m128i D __attribute__ ((aligned(16))); +__m128i E __attribute__ ((aligned(16))); +__m128i F __attribute__ ((aligned(16))); +__m128i G __attribute__ ((aligned(16))); +__m128i H __attribute__ ((aligned(16))); +#endif int dlsch_qpsk_qpsk_llr(LTE_DL_FRAME_PARMS *frame_parms, int **rxdataF_comp, @@ -948,47 +1036,53 @@ int dlsch_qpsk_qpsk_llr(LTE_DL_FRAME_PARMS *frame_parms, return(0); } -NOCYGWIN_STATIC __m128i ONE_OVER_SQRT_8 __attribute__((aligned(16))); +//__m128i ONE_OVER_SQRT_8 __attribute__((aligned(16))); void qpsk_qpsk(short *stream0_in, short *stream1_in, short *stream0_out, short *rho01, int length - ) + ) { /* - This function computes the LLRs of stream 0 (s_0) in presence of the interfering stream 1 (s_1) assuming that both symbols are QPSK. It can be used for both MU-MIMO interference-aware receiver or for SU-MIMO receivers. - - Parameters: - stream0_in = Matched filter output y0' = (h0*g0)*y0 - stream1_in = Matched filter output y1' = (h0*g1)*y0 - stream0_out = LLRs - rho01 = Correlation between the two effective channels \rho_{10} = (h1*g1)*(h0*g0) - length = number of resource elements + This function computes the LLRs of stream 0 (s_0) in presence of the interfering stream 1 (s_1) assuming that both symbols are QPSK. It can be used for both MU-MIMO interference-aware receiver or for SU-MIMO receivers. 
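Before the parameter list resumes: the loop body below divides by sqrt(8) with a single _mm_mulhi_epi16, which keeps only the high 16 bits of each 32-bit product, so a Q1.16 reciprocal constant needs no follow-up shift. A standalone arithmetic check of the 23170 constant (plain C; only the constant itself is taken from the patch):

#include <math.h>
#include <stdio.h>

int main(void)
{
  /* mulhi(x,c) computes (x*c)>>16, so choosing c = round(2^16/sqrt(8))
     turns the multiply into an approximate division by sqrt(8) */
  printf("%ld\n", lround(65536.0 / sqrt(8.0)));             /* prints 23170 */
  int x = 1000;
  printf("%d vs %.2f\n", (x * 23170) >> 16, x / sqrt(8.0)); /* 353 vs 353.55 */
  return 0;
}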
+ + Parameters: + stream0_in = Matched filter output y0' = (h0*g0)*y0 + stream1_in = Matched filter output y1' = (h0*g1)*y0 + stream0_out = LLRs + rho01 = Correlation between the two effective channels \rho_{10} = (h1*g1)*(h0*g0) + length = number of resource elements */ +#if defined(__x86_64__) || defined(__i386__) __m128i *rho01_128i = (__m128i *)rho01; __m128i *stream0_128i_in = (__m128i *)stream0_in; __m128i *stream1_128i_in = (__m128i *)stream1_in; __m128i *stream0_128i_out = (__m128i *)stream0_out; - -#ifdef DEBUG_LLR - print_shorts2("rho01_128i:\n",rho01_128i); + __m128i ONE_OVER_SQRT_8 = _mm_set1_epi16(23170); //round(2^16/sqrt(8)) +#elif defined(__arm__) + int16x8_t *rho01_128i = (int16x8_t *)rho01; + int16x8_t *stream0_128i_in = (int16x8_t *)stream0_in; + int16x8_t *stream1_128i_in = (int16x8_t *)stream1_in; + int16x8_t *stream0_128i_out = (int16x8_t *)stream0_out; + int16x8_t ONE_OVER_SQRT_8 = vdupq_n_s16(23170); //round(2^16/sqrt(8)) #endif int i; - ONE_OVER_SQRT_8 = _mm_set1_epi16(23170); //round(2^16/sqrt(8)) + for (i=0; i<length>>2; i+=2) { // in each iteration, we take 8 complex samples - +#if defined(__x86_64__) || defined(__i386__) xmm0 = rho01_128i[i]; // 4 symbols xmm1 = rho01_128i[i+1]; // put (rho_r + rho_i)/2sqrt2 in rho_rpi // put (rho_r - rho_i)/2sqrt2 in rho_rmi + xmm0 = _mm_shufflelo_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3)); xmm0 = _mm_shufflehi_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3)); xmm0 = _mm_shuffle_epi32(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3)); @@ -1005,10 +1099,14 @@ void qpsk_qpsk(short *stream0_in, // divide by sqrt(8), no shift needed ONE_OVER_SQRT_8 = Q1.16 rho_rpi = _mm_mulhi_epi16(rho_rpi,ONE_OVER_SQRT_8); rho_rmi = _mm_mulhi_epi16(rho_rmi,ONE_OVER_SQRT_8); +#elif defined(__arm__) + +#endif // Compute LLR for first bit of stream 0 // Compute real and imaginary parts of MF output for stream 0 +#if defined(__x86_64__) || defined(__i386__) xmm0 = stream0_128i_in[i]; xmm1 = stream0_128i_in[i+1]; @@ -1025,8 +1123,12 @@ void qpsk_qpsk(short *stream0_in, y0r_over2 = _mm_srai_epi16(y0r,1); // divide by 2 y0i_over2 = _mm_srai_epi16(y0i,1); // divide by 2 +#elif defined(__arm__) + +#endif // Compute real and imaginary parts of MF output for stream 1 +#if defined(__x86_64__) || defined(__i386__) xmm0 = stream1_128i_in[i]; xmm1 = stream1_128i_in[i+1]; @@ -1116,23 +1218,28 @@ void qpsk_qpsk(short *stream0_in, if (i<((length>>1) - 1)) // false if only 2 REs remain _mm_storeu_si128(&stream0_128i_out[i+1],_mm_unpackhi_epi16(y0r,y0i)); +#elif defined(__arm__) + +#endif } +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); +#endif } int dlsch_qpsk_16qam_llr(LTE_DL_FRAME_PARMS *frame_parms, - int **rxdataF_comp, - int **rxdataF_comp_i, - int **dl_ch_mag_i, //|h_1|^2*(2/sqrt{10}) - int **rho_i, - short *dlsch_llr, - unsigned char symbol, - unsigned char first_symbol_flag, - unsigned short nb_rb, + int32_t **rxdataF_comp, + int32_t **rxdataF_comp_i, + int32_t **dl_ch_mag_i, //|h_1|^2*(2/sqrt{10}) + int32_t **rho_i, + int16_t *dlsch_llr, + uint8_t symbol, + uint8_t first_symbol_flag, + uint16_t nb_rb, uint16_t pbch_pss_sss_adjust, - short **llr16p) + int16_t **llr16p) { int16_t *rxF=(int16_t*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; @@ -1180,38 +1287,58 @@ int dlsch_qpsk_16qam_llr(LTE_DL_FRAME_PARMS *frame_parms, return(0); } -NOCYGWIN_STATIC __m128i ONE_OVER_SQRT_2 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i ONE_OVER_SQRT_10 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i THREE_OVER_SQRT_10 __attribute__((aligned(16)));
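The deletions continuing below remove the Q15/Q16 constant globals used by the 16-QAM detectors; the same literals reappear further down as function-local constants. Their values can be re-derived as a sanity check (a standalone sketch; the expected outputs are the literals and comments from the patch):

#include <math.h>
#include <stdio.h>

int main(void)
{
  printf("%ld\n", lround(32768.0 / sqrt(2.0)));        /* 23170, ONE_OVER_SQRT_2 */
  printf("%ld\n", lround(32768.0 / sqrt(10.0)));       /* 10362, ONE_OVER_SQRT_10_Q15 */
  printf("%ld\n", lround(3.0 * 32768.0 / sqrt(10.0))); /* 31086, THREE_OVER_SQRT_10 */
  printf("%ld\n", lround(sqrt(10.0) * 32768.0 / 4.0)); /* 25905, SQRT_10_OVER_FOUR */
  return 0;
}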
-NOCYGWIN_STATIC __m128i ONE_OVER_SQRT_10_Q15 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i SQRT_10_OVER_FOUR __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i ch_mag_int __attribute__((aligned(16))); - -void qpsk_qam16(short *stream0_in, - short *stream1_in, - short *ch_mag_i, - short *stream0_out, - short *rho01, - int length - ) +/* +#if defined(__x86_64__) || defined(__i386__) +__m128i ONE_OVER_SQRT_2 __attribute__((aligned(16))); +__m128i ONE_OVER_SQRT_10 __attribute__((aligned(16))); +__m128i THREE_OVER_SQRT_10 __attribute__((aligned(16))); +__m128i ONE_OVER_SQRT_10_Q15 __attribute__((aligned(16))); +__m128i SQRT_10_OVER_FOUR __attribute__((aligned(16))); +__m128i ch_mag_int; +#endif +*/ +void qpsk_qam16(int16_t *stream0_in, + int16_t *stream1_in, + int16_t *ch_mag_i, + int16_t *stream0_out, + int16_t *rho01, + int32_t length + ) { - /* - This function computes the LLRs of stream 0 (s_0) in presence of the interfering stream 1 (s_1) assuming that both symbols are QPSK. It can be used for both MU-MIMO interference-aware receiver or for SU-MIMO receivers. - - Parameters: - stream0_in = Matched filter output y0' = (h0*g0)*y0 - stream1_in = Matched filter output y1' = (h0*g1)*y0 - stream0_out = LLRs - rho01 = Correlation between the two effective channels \rho_{10} = (h1*g1)*(h0*g0) - length = number of resource elements + This function computes the LLRs of stream 0 (s_0) in presence of the interfering stream 1 (s_1) assuming that both symbols are QPSK. It can be used for both MU-MIMO interference-aware receiver or for SU-MIMO receivers. + + Parameters: + stream0_in = Matched filter output y0' = (h0*g0)*y0 + stream1_in = Matched filter output y1' = (h0*g1)*y0 + stream0_out = LLRs + rho01 = Correlation between the two effective channels \rho_{10} = (h1*g1)*(h0*g0) + length = number of resource elements */ +#if defined(__x86_64__) || defined(__i386__) __m128i *rho01_128i = (__m128i *)rho01; __m128i *stream0_128i_in = (__m128i *)stream0_in; __m128i *stream1_128i_in = (__m128i *)stream1_in; __m128i *stream0_128i_out = (__m128i *)stream0_out; __m128i *ch_mag_128i_i = (__m128i *)ch_mag_i; + __m128i ONE_OVER_SQRT_2 = _mm_set1_epi16(23170); // round(1/sqrt(2)*2^15) + __m128i ONE_OVER_SQRT_10_Q15 = _mm_set1_epi16(10362); // round(1/sqrt(10)*2^15) + __m128i THREE_OVER_SQRT_10 = _mm_set1_epi16(31086); // round(3/sqrt(10)*2^15) + __m128i SQRT_10_OVER_FOUR = _mm_set1_epi16(25905); // round(sqrt(10)/4*2^15) + __m128i ch_mag_int __attribute__((aligned(16))); +#elif defined(__arm__) + int16x8_t *rho01_128i = (int16x8_t *)rho01; + int16x8_t *stream0_128i_in = (int16x8_t *)stream0_in; + int16x8_t *stream1_128i_in = (int16x8_t *)stream1_in; + int16x8_t *stream0_128i_out = (int16x8_t *)stream0_out; + int16x8_t *ch_mag_128i_i = (int16x8_t *)ch_mag_i; + int16x8_t ONE_OVER_SQRT_2 = vdupq_n_s16(23170); // round(1/sqrt(2)*2^15) + int16x8_t ONE_OVER_SQRT_10_Q15 = vdupq_n_s16(10362); // round(1/sqrt(10)*2^15) + int16x8_t THREE_OVER_SQRT_10 = vdupq_n_s16(31086); // round(3/sqrt(10)*2^15) + int16x8_t SQRT_10_OVER_FOUR = vdupq_n_s16(25905); // round(sqrt(10)/4*2^15) + int16x8_t ch_mag_int __attribute__((aligned(16))); +#endif #ifdef DEBUG_LLR print_shorts2("rho01_128i:\n",rho01_128i); @@ -1219,14 +1346,12 @@ void qpsk_qam16(short *stream0_in, int i; - ONE_OVER_SQRT_2 = _mm_set1_epi16(23170); // round(1/sqrt(2)*2^15) - ONE_OVER_SQRT_10_Q15 = _mm_set1_epi16(10362); // round(1/sqrt(10)*2^15) - THREE_OVER_SQRT_10 = _mm_set1_epi16(31086); // round(3/sqrt(10)*2^15) - SQRT_10_OVER_FOUR = _mm_set1_epi16(25905); 
// round(sqrt(10)/4*2^15) for (i=0; i<length>>2; i+=2) { // in each iteration, we take 8 complex samples +#if defined(__x86_64__) || defined(__i386__) + xmm0 = rho01_128i[i]; // 4 symbols xmm1 = rho01_128i[i+1]; @@ -1377,23 +1502,28 @@ void qpsk_qam16(short *stream0_in, if (i<((length>>1) - 1)) // false if only 2 REs remain stream0_128i_out[i+1] = _mm_unpackhi_epi16(y0r,y0i); +#elif defined(__arm__) + +#endif } +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); +#endif } int dlsch_qpsk_64qam_llr(LTE_DL_FRAME_PARMS *frame_parms, - int **rxdataF_comp, - int **rxdataF_comp_i, - int **dl_ch_mag_i, //|h_1|^2*(2/sqrt{10}) - int **rho_i, - short *dlsch_llr, - unsigned char symbol, - unsigned char first_symbol_flag, - unsigned short nb_rb, + int32_t **rxdataF_comp, + int32_t **rxdataF_comp_i, + int32_t **dl_ch_mag_i, //|h_1|^2*(2/sqrt{10}) + int32_t **rho_i, + int16_t *dlsch_llr, + uint8_t symbol, + uint8_t first_symbol_flag, + uint16_t nb_rb, uint16_t pbch_pss_sss_adjust, - short **llr16p) + int16_t **llr16p) { int16_t *rxF=(int16_t*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; @@ -1440,42 +1570,56 @@ int dlsch_qpsk_64qam_llr(LTE_DL_FRAME_PARMS *frame_parms, return(0); } - -NOCYGWIN_STATIC __m128i ONE_OVER_SQRT_2_42 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i THREE_OVER_SQRT_2_42 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i FIVE_OVER_SQRT_2_42 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i SEVEN_OVER_SQRT_2_42 __attribute__((aligned(16))); - -NOCYGWIN_STATIC __m128i ch_mag_int_with_sigma2 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i two_ch_mag_int_with_sigma2 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i three_ch_mag_int_with_sigma2 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i SQRT_42_OVER_FOUR __attribute__((aligned(16))); - +/* +__m128i ONE_OVER_SQRT_2_42 __attribute__((aligned(16))); +__m128i THREE_OVER_SQRT_2_42 __attribute__((aligned(16))); +__m128i FIVE_OVER_SQRT_2_42 __attribute__((aligned(16))); +__m128i SEVEN_OVER_SQRT_2_42 __attribute__((aligned(16))); + +__m128i ch_mag_int_with_sigma2 __attribute__((aligned(16))); +__m128i two_ch_mag_int_with_sigma2 __attribute__((aligned(16))); +__m128i three_ch_mag_int_with_sigma2 __attribute__((aligned(16))); +__m128i SQRT_42_OVER_FOUR __attribute__((aligned(16))); +*/ void qpsk_qam64(short *stream0_in, short *stream1_in, short *ch_mag_i, short *stream0_out, short *rho01, int length - ) + ) { /* - This function computes the LLRs of stream 0 (s_0) in presence of the interfering stream 1 (s_1) assuming that both symbols are QPSK. It can be used for both MU-MIMO interference-aware receiver or for SU-MIMO receivers. - - Parameters: - stream0_in = Matched filter output y0' = (h0*g0)*y0 - stream1_in = Matched filter output y1' = (h0*g1)*y0 - stream0_out = LLRs - rho01 = Correlation between the two effective channels \rho_{10} = (h1*g1)*(h0*g0) - length = number of resource elements + This function computes the LLRs of stream 0 (s_0) in presence of the interfering stream 1 (s_1) assuming that both symbols are QPSK. It can be used for both MU-MIMO interference-aware receiver or for SU-MIMO receivers. 
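A side note before the parameter list: the ARM branch of this function is still an empty stub in this revision. For reference, the SSE operations its x86 body relies on map onto NEON as vabsq_s16 for _mm_abs_epi16, vqaddq_s16/vqsubq_s16 for the saturating add and subtract, vdupq_n_s16 for _mm_set1_epi16, vgetq_lane_s16 for _mm_extract_epi16, and vshrq_n_s16 for _mm_srai_epi16. Only _mm_mulhi_epi16 lacks a one-instruction counterpart; a plausible emulation (an assumption, not code from the patch):

#include <arm_neon.h>

/* vqdmulhq_s16 returns the saturated high half of 2*a*b, i.e. (2*a*b)>>16,
   so one extra halving shift yields (a*b)>>16 like _mm_mulhi_epi16 does;
   the result differs only at the INT16_MIN*INT16_MIN saturation edge case */
static inline int16x8_t mulhi_s16(int16x8_t a, int16x8_t b)
{
  return vshrq_n_s16(vqdmulhq_s16(a, b), 1);
}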
+ + Parameters: + stream0_in = Matched filter output y0' = (h0*g0)*y0 + stream1_in = Matched filter output y1' = (h0*g1)*y0 + stream0_out = LLRs + rho01 = Correlation between the two effective channels \rho_{10} = (h1*g1)*(h0*g0) + length = number of resource elements */ +#if defined(__x86_64__) || defined(__i386__) __m128i *rho01_128i = (__m128i *)rho01; __m128i *stream0_128i_in = (__m128i *)stream0_in; __m128i *stream1_128i_in = (__m128i *)stream1_in; __m128i *stream0_128i_out = (__m128i *)stream0_out; __m128i *ch_mag_128i_i = (__m128i *)ch_mag_i; + __m128i ONE_OVER_SQRT_2 = _mm_set1_epi16(23170); // round(1/sqrt(2)*2^15) + __m128i ONE_OVER_SQRT_2_42 = _mm_set1_epi16(3575); // round(1/sqrt(2*42)*2^15) + __m128i THREE_OVER_SQRT_2_42 = _mm_set1_epi16(10726); // round(3/sqrt(2*42)*2^15) + __m128i FIVE_OVER_SQRT_2_42 = _mm_set1_epi16(17876); // round(5/sqrt(2*42)*2^15) + __m128i SEVEN_OVER_SQRT_2_42 = _mm_set1_epi16(25027); // round(7/sqrt(2*42)*2^15) + __m128i SQRT_42_OVER_FOUR = _mm_set1_epi16(13272); // round(sqrt(42)/4*2^13), Q3.1 + __m128i ch_mag_int; + __m128i ch_mag_int_with_sigma2; + __m128i two_ch_mag_int_with_sigma2; + __m128i three_ch_mag_int_with_sigma2; +#elif defined(__arm__) + +#endif #ifdef DEBUG_LLR print_shorts2("rho01_128i:\n",rho01_128i); @@ -1483,16 +1627,12 @@ void qpsk_qam64(short *stream0_in, int i; - ONE_OVER_SQRT_2 = _mm_set1_epi16(23170); // round(1/sqrt(2)*2^15) - ONE_OVER_SQRT_2_42 = _mm_set1_epi16(3575); // round(1/sqrt(2*42)*2^15) - THREE_OVER_SQRT_2_42 = _mm_set1_epi16(10726); // round(3/sqrt(2*42)*2^15) - FIVE_OVER_SQRT_2_42 = _mm_set1_epi16(17876); // round(5/sqrt(2*42)*2^15) - SEVEN_OVER_SQRT_2_42 = _mm_set1_epi16(25027); // round(7/sqrt(2*42)*2^15) - SQRT_42_OVER_FOUR = _mm_set1_epi16(13272); // round(sqrt(42)/4*2^13), Q3.12 for (i=0; i<length>>2; i+=2) { // in each iteration, we take 8 complex samples +#if defined(__x86_64__) || defined(__i386__) + xmm0 = rho01_128i[i]; // 4 symbols xmm1 = rho01_128i[i+1]; @@ -1662,10 +1802,15 @@ void qpsk_qam64(short *stream0_in, if (i<((length>>1) - 1)) // false if only 2 REs remain stream0_128i_out[i+1] = _mm_unpackhi_epi16(y0r,y0i); +#elif defined(__arm__) + +#endif } +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); +#endif } @@ -1673,18 +1818,20 @@ void qpsk_qam64(short *stream0_in, // 16-QAM //---------------------------------------------------------------------------------------------- -NOCYGWIN_STATIC __m128i ONE_OVER_TWO_SQRT_10 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i NINE_OVER_TWO_SQRT_10 __attribute__((aligned(16))); +/* +__m128i ONE_OVER_TWO_SQRT_10 __attribute__((aligned(16))); +__m128i NINE_OVER_TWO_SQRT_10 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i y0r_over_sqrt10 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0i_over_sqrt10 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0r_three_over_sqrt10 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i y0i_three_over_sqrt10 __attribute__ ((aligned(16))); +__m128i y0r_over_sqrt10 __attribute__ ((aligned(16))); +__m128i y0i_over_sqrt10 __attribute__ ((aligned(16))); +__m128i y0r_three_over_sqrt10 __attribute__ ((aligned(16))); +__m128i y0i_three_over_sqrt10 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i ch_mag_des __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i ch_mag_over_10 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i ch_mag_over_2 __attribute__ ((aligned(16))); -NOCYGWIN_STATIC __m128i ch_mag_9_over_10 __attribute__ ((aligned(16))); +__m128i ch_mag_des 
__attribute__((aligned(16))); +__m128i ch_mag_over_10 __attribute__ ((aligned(16))); +__m128i ch_mag_over_2 __attribute__ ((aligned(16))); +__m128i ch_mag_9_over_10 __attribute__ ((aligned(16))); +*/ void qam16_qpsk(short *stream0_in, short *stream1_in, @@ -1692,42 +1839,56 @@ void qam16_qpsk(short *stream0_in, short *stream0_out, short *rho01, int length - ) + ) { /* - Author: Sebastian Wagner - Date: 2012-06-04 - - Input: - stream0_in: MF filter for 1st stream, i.e., y0=h0'*y - stream!_in: MF filter for 2nd stream, i.e., y1=h1'*y - ch_mag: 2*h0/sqrt(00), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc - ch_mag_i: 2*h1/sqrt(00), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc - rho01: Channel cross correlation, i.e., h1'*h0 - - Output: - stream0_out: output LLRs for 1st stream + Author: Sebastian Wagner + Date: 2012-06-04 + + Input: + stream0_in: MF filter for 1st stream, i.e., y0=h0'*y + stream1_in: MF filter for 2nd stream, i.e., y1=h1'*y + ch_mag: 2*h0/sqrt(10), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc + ch_mag_i: 2*h1/sqrt(10), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc + rho01: Channel cross correlation, i.e., h1'*h0 + + Output: + stream0_out: output LLRs for 1st stream */ +#if defined(__x86_64__) || defined(__i386__) __m128i *rho01_128i = (__m128i *)rho01; __m128i *stream0_128i_in = (__m128i *)stream0_in; __m128i *stream1_128i_in = (__m128i *)stream1_in; __m128i *stream0_128i_out = (__m128i *)stream0_out; __m128i *ch_mag_128i = (__m128i *)ch_mag; + __m128i ONE_OVER_SQRT_2 = _mm_set1_epi16(23170); // round(1/sqrt(2)*2^15) + __m128i ONE_OVER_SQRT_10 = _mm_set1_epi16(20724); // round(1/sqrt(10)*2^16) + __m128i THREE_OVER_SQRT_10 = _mm_set1_epi16(31086); // round(3/sqrt(10)*2^15) + __m128i SQRT_10_OVER_FOUR = _mm_set1_epi16(25905); // round(sqrt(10)/4*2^15) + __m128i ONE_OVER_TWO_SQRT_10 = _mm_set1_epi16(10362); // round(1/2/sqrt(10)*2^16) + __m128i NINE_OVER_TWO_SQRT_10 = _mm_set1_epi16(23315); // round(9/2/sqrt(10)*2^14) + __m128i y0r_over_sqrt10; + __m128i y0i_over_sqrt10; + __m128i y0r_three_over_sqrt10; + __m128i y0i_three_over_sqrt10; + + __m128i ch_mag_des; + __m128i ch_mag_over_10; + __m128i ch_mag_over_2; + __m128i ch_mag_9_over_10; +#elif defined(__arm__) + +#endif int i; - ONE_OVER_SQRT_2 = _mm_set1_epi16(23170); // round(1/sqrt(2)*2^15) - ONE_OVER_SQRT_10 = _mm_set1_epi16(20724); // round(1/sqrt(10)*2^16) - THREE_OVER_SQRT_10 = _mm_set1_epi16(31086); // round(3/sqrt(10)*2^15) - SQRT_10_OVER_FOUR = _mm_set1_epi16(25905); // round(sqrt(10)/4*2^15) - ONE_OVER_TWO_SQRT_10 = _mm_set1_epi16(10362); // round(1/2/sqrt(10)*2^16) - NINE_OVER_TWO_SQRT_10 = _mm_set1_epi16(23315); // round(9/2/sqrt(10)*2^14) for (i=0; i<length>>2; i+=2) { // In one iteration, we deal with 8 REs +#if defined(__x86_64__) || defined(__i386__) // Get rho xmm0 = rho01_128i[i]; xmm1 = rho01_128i[i+1]; @@ -2107,23 +2268,30 @@ void qam16_qpsk(short *stream0_in, stream0_128i_out[2*i+1] = _mm_unpackhi_epi32(xmm0,xmm2); stream0_128i_out[2*i+2] = _mm_unpacklo_epi32(xmm1,xmm3); stream0_128i_out[2*i+3] = _mm_unpackhi_epi32(xmm1,xmm3); + +#elif defined(__arm__) + +#endif } +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); +#endif + } int dlsch_16qam_qpsk_llr(LTE_DL_FRAME_PARMS *frame_parms, - int **rxdataF_comp, - int **rxdataF_comp_i, - int **dl_ch_mag, //|h_0|^2*(2/sqrt{10}) - int **rho_i, - short *dlsch_llr, - unsigned char symbol, - unsigned char first_symbol_flag, - unsigned short nb_rb, + int32_t **rxdataF_comp, + int32_t **rxdataF_comp_i, + int32_t **dl_ch_mag, //|h_0|^2*(2/sqrt{10}) + 
int32_t **rho_i, + int16_t *dlsch_llr, + uint8_t symbol, + uint8_t first_symbol_flag, + uint16_t nb_rb, uint16_t pbch_pss_sss_adjust, - short **llr16p) + int16_t **llr16p) { int16_t *rxF = (int16_t*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; @@ -2182,23 +2350,24 @@ void qam16_qam16(short *stream0_in, short *stream0_out, short *rho01, int length - ) + ) { /* - Author: Sebastian Wagner - Date: 2012-06-04 - - Input: - stream0_in: MF filter for 1st stream, i.e., y0=h0'*y - stream!_in: MF filter for 2nd stream, i.e., y1=h1'*y - ch_mag: 2*h0/sqrt(00), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc - ch_mag_i: 2*h1/sqrt(00), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc - rho01: Channel cross correlation, i.e., h1'*h0 - - Output: - stream0_out: output LLRs for 1st stream + Author: Sebastian Wagner + Date: 2012-06-04 + + Input: + stream0_in: MF filter for 1st stream, i.e., y0=h0'*y + stream1_in: MF filter for 2nd stream, i.e., y1=h1'*y + ch_mag: 2*h0/sqrt(10), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc + ch_mag_i: 2*h1/sqrt(10), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc + rho01: Channel cross correlation, i.e., h1'*h0 + + Output: + stream0_out: output LLRs for 1st stream */ +#if defined(__x86_64__) || defined(__i386__) __m128i *rho01_128i = (__m128i *)rho01; __m128i *stream0_128i_in = (__m128i *)stream0_in; __m128i *stream1_128i_in = (__m128i *)stream1_in; @@ -2206,18 +2375,32 @@ void qam16_qam16(short *stream0_in, __m128i *ch_mag_128i = (__m128i *)ch_mag; __m128i *ch_mag_128i_i = (__m128i *)ch_mag_i; - int i; - ONE_OVER_SQRT_10 = _mm_set1_epi16(20724); // round(1/sqrt(10)*2^16) - ONE_OVER_SQRT_10_Q15 = _mm_set1_epi16(10362); // round(1/sqrt(10)*2^15) - THREE_OVER_SQRT_10 = _mm_set1_epi16(31086); // round(3/sqrt(10)*2^15) - SQRT_10_OVER_FOUR = _mm_set1_epi16(25905); // round(sqrt(10)/4*2^15) - ONE_OVER_TWO_SQRT_10 = _mm_set1_epi16(10362); // round(1/2/sqrt(10)*2^16) - NINE_OVER_TWO_SQRT_10 = _mm_set1_epi16(23315); // round(9/2/sqrt(10)*2^14) + + __m128i ONE_OVER_SQRT_10 = _mm_set1_epi16(20724); // round(1/sqrt(10)*2^16) + __m128i ONE_OVER_SQRT_10_Q15 = _mm_set1_epi16(10362); // round(1/sqrt(10)*2^15) + __m128i THREE_OVER_SQRT_10 = _mm_set1_epi16(31086); // round(3/sqrt(10)*2^15) + __m128i SQRT_10_OVER_FOUR = _mm_set1_epi16(25905); // round(sqrt(10)/4*2^15) + __m128i ONE_OVER_TWO_SQRT_10 = _mm_set1_epi16(10362); // round(1/2/sqrt(10)*2^16) + __m128i NINE_OVER_TWO_SQRT_10 = _mm_set1_epi16(23315); // round(9/2/sqrt(10)*2^14) + __m128i ch_mag_des,ch_mag_int; + __m128i y0r_over_sqrt10; + __m128i y0i_over_sqrt10; + __m128i y0r_three_over_sqrt10; + __m128i y0i_three_over_sqrt10; + __m128i ch_mag_over_10; + __m128i ch_mag_over_2; + __m128i ch_mag_9_over_10; +#elif defined(__arm__) + +#endif + + int i; for (i=0; i<length>>2; i+=2) { // In one iteration, we deal with 8 REs +#if defined(__x86_64__) || defined(__i386__) // Get rho xmm0 = rho01_128i[i]; xmm1 = rho01_128i[i+1]; @@ -2642,24 +2825,30 @@ void qam16_qam16(short *stream0_in, stream0_128i_out[2*i+1] = _mm_unpackhi_epi32(xmm0,xmm2); stream0_128i_out[2*i+2] = _mm_unpacklo_epi32(xmm1,xmm3); stream0_128i_out[2*i+3] = _mm_unpackhi_epi32(xmm1,xmm3); +#elif defined(__arm__) + +#endif + } +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); +#endif } int dlsch_16qam_16qam_llr(LTE_DL_FRAME_PARMS *frame_parms, - int **rxdataF_comp, - int **rxdataF_comp_i, - int **dl_ch_mag, //|h_0|^2*(2/sqrt{10}) - int **dl_ch_mag_i, //|h_1|^2*(2/sqrt{10}) - int **rho_i, - short *dlsch_llr, - unsigned char symbol, - unsigned char first_symbol_flag,
- unsigned short nb_rb, + int32_t **rxdataF_comp, + int32_t **rxdataF_comp_i, + int32_t **dl_ch_mag, //|h_0|^2*(2/sqrt{10}) + int32_t **dl_ch_mag_i, //|h_1|^2*(2/sqrt{10}) + int32_t **rho_i, + int16_t *dlsch_llr, + uint8_t symbol, + uint8_t first_symbol_flag, + uint16_t nb_rb, uint16_t pbch_pss_sss_adjust, - short **llr16p) + int16_t **llr16p) { int16_t *rxF = (int16_t*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; @@ -2713,30 +2902,32 @@ int dlsch_16qam_16qam_llr(LTE_DL_FRAME_PARMS *frame_parms, return(0); } -void qam16_qam64(short *stream0_in, - short *stream1_in, - short *ch_mag, - short *ch_mag_i, - short *stream0_out, - short *rho01, - int length - ) +void qam16_qam64(int16_t *stream0_in, + int16_t *stream1_in, + int16_t *ch_mag, + int16_t *ch_mag_i, + int16_t *stream0_out, + int16_t *rho01, + int32_t length + ) { /* - Author: Sebastian Wagner - Date: 2012-06-04 - - Input: - stream0_in: MF filter for 1st stream, i.e., y0=h0'*y - stream!_in: MF filter for 2nd stream, i.e., y1=h1'*y - ch_mag: 2*h0/sqrt(00), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc - ch_mag_i: 2*h1/sqrt(00), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc - rho01: Channel cross correlation, i.e., h1'*h0 - - Output: - stream0_out: output LLRs for 1st stream + Author: Sebastian Wagner + Date: 2012-06-04 + + Input: + stream0_in: MF filter for 1st stream, i.e., y0=h0'*y + stream1_in: MF filter for 2nd stream, i.e., y1=h1'*y + ch_mag: 2*h0/sqrt(10), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc + ch_mag_i: 2*h1/sqrt(10), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc + rho01: Channel cross correlation, i.e., h1'*h0 + + Output: + stream0_out: output LLRs for 1st stream */ + +#if defined(__x86_64__) || defined(__i386__) __m128i *rho01_128i = (__m128i *)rho01; __m128i *stream0_128i_in = (__m128i *)stream0_in; __m128i *stream1_128i_in = (__m128i *)stream1_in; @@ -2744,24 +2935,39 @@ void qam16_qam64(short *stream0_in, __m128i *ch_mag_128i = (__m128i *)ch_mag; __m128i *ch_mag_128i_i = (__m128i *)ch_mag_i; - int i; + + __m128i ONE_OVER_SQRT_2 = _mm_set1_epi16(23170); // round(1/sqrt(2)*2^15) + __m128i ONE_OVER_SQRT_10 = _mm_set1_epi16(20724); // round(1/sqrt(10)*2^16) + __m128i THREE_OVER_SQRT_10 = _mm_set1_epi16(31086); // round(3/sqrt(10)*2^15) + __m128i SQRT_10_OVER_FOUR = _mm_set1_epi16(25905); // round(sqrt(10)/4*2^15) + __m128i ONE_OVER_TWO_SQRT_10 = _mm_set1_epi16(10362); // round(1/2/sqrt(10)*2^16) + __m128i NINE_OVER_TWO_SQRT_10 = _mm_set1_epi16(23315); // round(9/2/sqrt(10)*2^14) + __m128i ONE_OVER_SQRT_2_42 = _mm_set1_epi16(3575); // round(1/sqrt(2*42)*2^15) + __m128i THREE_OVER_SQRT_2_42 = _mm_set1_epi16(10726); // round(3/sqrt(2*42)*2^15) + __m128i FIVE_OVER_SQRT_2_42 = _mm_set1_epi16(17876); // round(5/sqrt(2*42)*2^15) + __m128i SEVEN_OVER_SQRT_2_42 = _mm_set1_epi16(25027); // round(7/sqrt(2*42)*2^15) + __m128i SQRT_42_OVER_FOUR = _mm_set1_epi16(13272); // round(sqrt(42)/4*2^13), Q3.12
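+ /* Editor's note: the constants above all follow this file's fixed-point
+    convention c = round(x * 2^N), with N picked so that c fits a signed
+    16-bit lane at the Q-format noted in each comment. A minimal standalone
+    sketch of the convention (illustration only, not part of the patch; the
+    1/sqrt(10) scale factor is taken from the comments above):
+
+      #include <math.h>
+      #include <stdio.h>
+      int main(void) {
+        short c = (short)lround((1.0/sqrt(10.0)) * (1<<15)); // 10362, Q15
+        short a = 20000;
+        // _mm_mulhi_epi16(a,c) keeps the high 16 bits of a*c, i.e. (a*c)>>16,
+        // so a Q15 constant x effectively computes a*x/2:
+        printf("%d\n", (a * c) >> 16); // 3162 ~= 20000/(2*sqrt(10))
+        return 0;
+      }
+ */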
+ __m128i ch_mag_des,ch_mag_int; + __m128i y0r_over_sqrt10; + __m128i y0i_over_sqrt10; + __m128i y0r_three_over_sqrt10; + __m128i y0i_three_over_sqrt10; + __m128i ch_mag_over_10; + __m128i ch_mag_over_2; + __m128i ch_mag_9_over_10; + __m128i ch_mag_int_with_sigma2; + __m128i two_ch_mag_int_with_sigma2; + __m128i three_ch_mag_int_with_sigma2; + +#elif defined(__arm__) - ONE_OVER_SQRT_2 = _mm_set1_epi16(23170); // round(1/sqrt(2)*2^15) - ONE_OVER_SQRT_10 = _mm_set1_epi16(20724); // round(1/sqrt(10)*2^16) - ONE_OVER_SQRT_10_Q15 = _mm_set1_epi16(10362); // round(1/sqrt(10)*2^15) - THREE_OVER_SQRT_10 = _mm_set1_epi16(31086); // round(3/sqrt(10)*2^15) - SQRT_10_OVER_FOUR = _mm_set1_epi16(25905); // round(sqrt(10)/4*2^15) - ONE_OVER_TWO_SQRT_10 = _mm_set1_epi16(10362); // round(1/2/sqrt(10)*2^16) - NINE_OVER_TWO_SQRT_10 = _mm_set1_epi16(23315); // round(9/2/sqrt(10)*2^14) - ONE_OVER_SQRT_2_42 = _mm_set1_epi16(3575); // round(1/sqrt(2*42)*2^15) - THREE_OVER_SQRT_2_42 = _mm_set1_epi16(10726); // round(3/sqrt(2*42)*2^15) - FIVE_OVER_SQRT_2_42 = _mm_set1_epi16(17876); // round(5/sqrt(2*42)*2^15) - SEVEN_OVER_SQRT_2_42 = _mm_set1_epi16(25027); // round(7/sqrt(2*42)*2^15) - SQRT_42_OVER_FOUR = _mm_set1_epi16(13272); // round(sqrt(42)/4*2^13), Q3.12 +#endif + int i; for (i=0; i<length>>2; i+=2) { // In one iteration, we deal with 8 REs +#if defined(__x86_64__) || defined(__i386__) // Get rho xmm0 = rho01_128i[i]; xmm1 = rho01_128i[i+1]; @@ -3255,24 +3461,30 @@ void qam16_qam64(short *stream0_in, stream0_128i_out[2*i+1] = _mm_unpackhi_epi32(xmm0,xmm2); stream0_128i_out[2*i+2] = _mm_unpacklo_epi32(xmm1,xmm3); stream0_128i_out[2*i+3] = _mm_unpackhi_epi32(xmm1,xmm3); +#elif defined(__arm__) + +#endif + } +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); +#endif } int dlsch_16qam_64qam_llr(LTE_DL_FRAME_PARMS *frame_parms, - int **rxdataF_comp, - int **rxdataF_comp_i, - int **dl_ch_mag, //|h_0|^2*(2/sqrt{10}) - int **dl_ch_mag_i, //|h_1|^2*(2/sqrt{10}) - int **rho_i, - short *dlsch_llr, - unsigned char symbol, - unsigned char first_symbol_flag, - unsigned short nb_rb, + int32_t **rxdataF_comp, + int32_t **rxdataF_comp_i, + int32_t **dl_ch_mag, //|h_0|^2*(2/sqrt{10}) + int32_t **dl_ch_mag_i, //|h_1|^2*(2/sqrt{10}) + int32_t **rho_i, + int16_t *dlsch_llr, + uint8_t symbol, + uint8_t first_symbol_flag, + uint16_t nb_rb, uint16_t pbch_pss_sss_adjust, - short **llr16p) + int16_t **llr16p) { int16_t *rxF = (int16_t*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; @@ -3330,93 +3542,117 @@ int dlsch_16qam_64qam_llr(LTE_DL_FRAME_PARMS *frame_parms, // 64-QAM //---------------------------------------------------------------------------------------------- -NOCYGWIN_STATIC __m128i ONE_OVER_SQRT_42 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i THREE_OVER_SQRT_42 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i FIVE_OVER_SQRT_42 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i SEVEN_OVER_SQRT_42 __attribute__((aligned(16))); - -NOCYGWIN_STATIC __m128i FORTYNINE_OVER_FOUR_SQRT_42 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i THIRTYSEVEN_OVER_FOUR_SQRT_42 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i TWENTYNINE_OVER_FOUR_SQRT_42 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i TWENTYFIVE_OVER_FOUR_SQRT_42 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i SEVENTEEN_OVER_FOUR_SQRT_42 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i NINE_OVER_FOUR_SQRT_42 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i THIRTEEN_OVER_FOUR_SQRT_42 
__attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i FIVE_OVER_FOUR_SQRT_42 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i ONE_OVER_FOUR_SQRT_42 __attribute__((aligned(16))); - -NOCYGWIN_STATIC __m128i y0r_one_over_sqrt_21 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i y0r_three_over_sqrt_21 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i y0r_five_over_sqrt_21 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i y0r_seven_over_sqrt_21 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i y0i_one_over_sqrt_21 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i y0i_three_over_sqrt_21 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i y0i_five_over_sqrt_21 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i y0i_seven_over_sqrt_21 __attribute__((aligned(16))); - -NOCYGWIN_STATIC __m128i ch_mag_98_over_42_with_sigma2 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i ch_mag_74_over_42_with_sigma2 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i ch_mag_58_over_42_with_sigma2 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i ch_mag_50_over_42_with_sigma2 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i ch_mag_34_over_42_with_sigma2 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i ch_mag_18_over_42_with_sigma2 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i ch_mag_26_over_42_with_sigma2 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i ch_mag_10_over_42_with_sigma2 __attribute__((aligned(16))); -NOCYGWIN_STATIC __m128i ch_mag_2_over_42_with_sigma2 __attribute__((aligned(16))); - -void qam64_qpsk(short *stream0_in, - short *stream1_in, - short *ch_mag, - short *stream0_out, - short *rho01, - int length - ) +/* +__m128i ONE_OVER_SQRT_42 __attribute__((aligned(16))); +__m128i THREE_OVER_SQRT_42 __attribute__((aligned(16))); +__m128i FIVE_OVER_SQRT_42 __attribute__((aligned(16))); +__m128i SEVEN_OVER_SQRT_42 __attribute__((aligned(16))); + +__m128i FORTYNINE_OVER_FOUR_SQRT_42 __attribute__((aligned(16))); +__m128i THIRTYSEVEN_OVER_FOUR_SQRT_42 __attribute__((aligned(16))); +__m128i TWENTYNINE_OVER_FOUR_SQRT_42 __attribute__((aligned(16))); +__m128i TWENTYFIVE_OVER_FOUR_SQRT_42 __attribute__((aligned(16))); +__m128i SEVENTEEN_OVER_FOUR_SQRT_42 __attribute__((aligned(16))); +__m128i NINE_OVER_FOUR_SQRT_42 __attribute__((aligned(16))); +__m128i THIRTEEN_OVER_FOUR_SQRT_42 __attribute__((aligned(16))); +__m128i FIVE_OVER_FOUR_SQRT_42 __attribute__((aligned(16))); +__m128i ONE_OVER_FOUR_SQRT_42 __attribute__((aligned(16))); + +__m128i y0r_one_over_sqrt_21 __attribute__((aligned(16))); +__m128i y0r_three_over_sqrt_21 __attribute__((aligned(16))); +__m128i y0r_five_over_sqrt_21 __attribute__((aligned(16))); +__m128i y0r_seven_over_sqrt_21 __attribute__((aligned(16))); +__m128i y0i_one_over_sqrt_21 __attribute__((aligned(16))); +__m128i y0i_three_over_sqrt_21 __attribute__((aligned(16))); +__m128i y0i_five_over_sqrt_21 __attribute__((aligned(16))); +__m128i y0i_seven_over_sqrt_21 __attribute__((aligned(16))); + +__m128i ch_mag_98_over_42_with_sigma2 __attribute__((aligned(16))); +__m128i ch_mag_74_over_42_with_sigma2 __attribute__((aligned(16))); +__m128i ch_mag_58_over_42_with_sigma2 __attribute__((aligned(16))); +__m128i ch_mag_50_over_42_with_sigma2 __attribute__((aligned(16))); +__m128i ch_mag_34_over_42_with_sigma2 __attribute__((aligned(16))); +__m128i ch_mag_18_over_42_with_sigma2 __attribute__((aligned(16))); +__m128i ch_mag_26_over_42_with_sigma2 __attribute__((aligned(16))); +__m128i ch_mag_10_over_42_with_sigma2 
__attribute__((aligned(16))); +__m128i ch_mag_2_over_42_with_sigma2 __attribute__((aligned(16))); + +*/ + +void qam64_qpsk(int16_t *stream0_in, + int16_t *stream1_in, + int16_t *ch_mag, + int16_t *stream0_out, + int16_t *rho01, + int32_t length + ) { /* - Author: S. Wagner - Date: 31-07-12 - - Input: - stream0_in: MF filter for 1st stream, i.e., y0=h0'*y - stream1_in: MF filter for 2nd stream, i.e., y1=h1'*y - ch_mag: 4*h0/sqrt(42), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc - ch_mag_i: 4*h1/sqrt(42), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc - rho01: Channel cross correlation, i.e., h1'*h0 - - Output: - stream0_out: output LLRs for 1st stream + Author: S. Wagner + Date: 31-07-12 + + Input: + stream0_in: MF filter for 1st stream, i.e., y0=h0'*y + stream1_in: MF filter for 2nd stream, i.e., y1=h1'*y + ch_mag: 4*h0/sqrt(42), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc + ch_mag_i: 4*h1/sqrt(42), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc + rho01: Channel cross correlation, i.e., h1'*h0 + + Output: + stream0_out: output LLRs for 1st stream */ +#if defined(__x86_64__) || defined(__i386__) __m128i *rho01_128i = (__m128i *)rho01; __m128i *stream0_128i_in = (__m128i *)stream0_in; __m128i *stream1_128i_in = (__m128i *)stream1_in; __m128i *ch_mag_128i = (__m128i *)ch_mag; - int i,j; - ONE_OVER_SQRT_42 = _mm_set1_epi16(10112); // round(1/sqrt(42)*2^16) - THREE_OVER_SQRT_42 = _mm_set1_epi16(30337); // round(3/sqrt(42)*2^16) - FIVE_OVER_SQRT_42 = _mm_set1_epi16(25281); // round(5/sqrt(42)*2^15) - SEVEN_OVER_SQRT_42 = _mm_set1_epi16(17697); // round(5/sqrt(42)*2^15) - ONE_OVER_SQRT_2 = _mm_set1_epi16(23170); // round(1/sqrt(2)*2^15) - ONE_OVER_SQRT_2_42 = _mm_set1_epi16(3575); // round(1/sqrt(2*42)*2^15) - THREE_OVER_SQRT_2_42 = _mm_set1_epi16(10726); // round(3/sqrt(2*42)*2^15) - FIVE_OVER_SQRT_2_42 = _mm_set1_epi16(17876); // round(5/sqrt(2*42)*2^15) - SEVEN_OVER_SQRT_2_42 = _mm_set1_epi16(25027); // round(7/sqrt(2*42)*2^15) - FORTYNINE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(30969); // round(49/(4*sqrt(42))*2^14), Q2.14 - THIRTYSEVEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(23385); // round(37/(4*sqrt(42))*2^14), Q2.14 - TWENTYFIVE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(31601); // round(25/(4*sqrt(42))*2^15) - TWENTYNINE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(18329); // round(29/(4*sqrt(42))*2^15), Q2.14 - SEVENTEEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(21489); // round(17/(4*sqrt(42))*2^15) - NINE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(11376); // round(9/(4*sqrt(42))*2^15) - THIRTEEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(16433); // round(13/(4*sqrt(42))*2^15) - FIVE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(6320); // round(5/(4*sqrt(42))*2^15) - ONE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(1264); // round(1/(4*sqrt(42))*2^15) - SQRT_42_OVER_FOUR = _mm_set1_epi16(13272); // round(sqrt(42)/4*2^13), Q3.12 + __m128i ONE_OVER_SQRT_42 = _mm_set1_epi16(10112); // round(1/sqrt(42)*2^16) + __m128i THREE_OVER_SQRT_42 = _mm_set1_epi16(30337); // round(3/sqrt(42)*2^16) + __m128i FIVE_OVER_SQRT_42 = _mm_set1_epi16(25281); // round(5/sqrt(42)*2^15) + __m128i SEVEN_OVER_SQRT_42 = _mm_set1_epi16(17697); // round(7/sqrt(42)*2^14) Q2.14 + __m128i ONE_OVER_SQRT_2 = _mm_set1_epi16(23170); // round(1/sqrt(2)*2^15) + __m128i FORTYNINE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(30969); // round(49/(4*sqrt(42))*2^14), Q2.14 + __m128i THIRTYSEVEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(23385); // round(37/(4*sqrt(42))*2^14), Q2.14 + __m128i TWENTYFIVE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(31601); // round(25/(4*sqrt(42))*2^15) + __m128i TWENTYNINE_OVER_FOUR_SQRT_42 =
_mm_set1_epi16(18329); // round(29/(4*sqrt(42))*2^14), Q2.14 + __m128i SEVENTEEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(21489); // round(17/(4*sqrt(42))*2^15) + __m128i NINE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(11376); // round(9/(4*sqrt(42))*2^15) + __m128i THIRTEEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(16433); // round(13/(4*sqrt(42))*2^15) + __m128i FIVE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(6320); // round(5/(4*sqrt(42))*2^15) + __m128i ONE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(1264); // round(1/(4*sqrt(42))*2^15) + + + __m128i ch_mag_des; + __m128i ch_mag_98_over_42_with_sigma2; + __m128i ch_mag_74_over_42_with_sigma2; + __m128i ch_mag_58_over_42_with_sigma2; + __m128i ch_mag_50_over_42_with_sigma2; + __m128i ch_mag_34_over_42_with_sigma2; + __m128i ch_mag_18_over_42_with_sigma2; + __m128i ch_mag_26_over_42_with_sigma2; + __m128i ch_mag_10_over_42_with_sigma2; + __m128i ch_mag_2_over_42_with_sigma2; + __m128i y0r_one_over_sqrt_21; + __m128i y0r_three_over_sqrt_21; + __m128i y0r_five_over_sqrt_21; + __m128i y0r_seven_over_sqrt_21; + __m128i y0i_one_over_sqrt_21; + __m128i y0i_three_over_sqrt_21; + __m128i y0i_five_over_sqrt_21; + __m128i y0i_seven_over_sqrt_21; +#elif defined(__arm__) + +#endif + + int i,j; for (i=0; i<length>>2; i+=2) { +#if defined(__x86_64__) || defined(__i386__) // Get rho xmm0 = rho01_128i[i]; xmm1 = rho01_128i[i+1]; @@ -4734,6 +4970,7 @@ void qam64_qpsk(short *stream0_in, y2i = _mm_subs_epi16(logmax_num_re0, logmax_den_re0); + // map to output stream, difficult to do in SIMD since we have 6 16bit LLRs // RE 1 j = 24*i; @@ -4792,25 +5029,29 @@ void qam64_qpsk(short *stream0_in, stream0_out[j + 45] = ((short *)&y0i)[7]; stream0_out[j + 46] = ((short *)&y1i)[7]; stream0_out[j + 47] = ((short *)&y2i)[7]; +#elif defined(__arm__) + +#endif } +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); - +#endif } int dlsch_64qam_qpsk_llr(LTE_DL_FRAME_PARMS *frame_parms, - int **rxdataF_comp, - int **rxdataF_comp_i, - int **dl_ch_mag, - int **rho_i, - short *dlsch_llr, - unsigned char symbol, - unsigned char first_symbol_flag, - unsigned short nb_rb, + int32_t **rxdataF_comp, + int32_t **rxdataF_comp_i, + int32_t **dl_ch_mag, + int32_t **rho_i, + int16_t *dlsch_llr, + uint8_t symbol, + uint8_t first_symbol_flag, + uint16_t nb_rb, uint16_t pbch_pss_sss_adjust, - short **llr16p) + int16_t **llr16p) { int16_t *rxF = (int16_t*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; @@ -4867,61 +5108,80 @@ void qam64_qam16(short *stream0_in, short *stream0_out, short *rho01, int length - ) + ) { /* - Author: S. Wagner - Date: 31-07-12 - - Input: - stream0_in: MF filter for 1st stream, i.e., y0=h0'*y - stream1_in: MF filter for 2nd stream, i.e., y1=h1'*y - ch_mag: 4*h0/sqrt(42), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc - ch_mag_i: 4*h1/sqrt(42), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc - rho01: Channel cross correlation, i.e., h1'*h0 - - Output: - stream0_out: output LLRs for 1st stream + Author: S. Wagner + Date: 31-07-12 + + Input: + stream0_in: MF filter for 1st stream, i.e., y0=h0'*y + stream1_in: MF filter for 2nd stream, i.e., y1=h1'*y + ch_mag: 4*h0/sqrt(42), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc + ch_mag_i: 4*h1/sqrt(42), [Re0 Im0 Re1 Im1] s.t.
Im0=Re0, Im1=Re1, etc + rho01: Channel cross correlation, i.e., h1'*h0 + + Output: + stream0_out: output LLRs for 1st stream */ +#if defined(__x86_64__) || defined(__i386__) + __m128i *rho01_128i = (__m128i *)rho01; __m128i *stream0_128i_in = (__m128i *)stream0_in; __m128i *stream1_128i_in = (__m128i *)stream1_in; __m128i *ch_mag_128i = (__m128i *)ch_mag; __m128i *ch_mag_128i_i = (__m128i *)ch_mag_i; + __m128i ONE_OVER_SQRT_42 = _mm_set1_epi16(10112); // round(1/sqrt(42)*2^16) + __m128i THREE_OVER_SQRT_42 = _mm_set1_epi16(30337); // round(3/sqrt(42)*2^16) + __m128i FIVE_OVER_SQRT_42 = _mm_set1_epi16(25281); // round(5/sqrt(42)*2^15) + __m128i SEVEN_OVER_SQRT_42 = _mm_set1_epi16(17697); // round(7/sqrt(42)*2^14) Q2.14 + __m128i FORTYNINE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(30969); // round(49/(4*sqrt(42))*2^14), Q2.14 + __m128i THIRTYSEVEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(23385); // round(37/(4*sqrt(42))*2^14), Q2.14 + __m128i TWENTYFIVE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(31601); // round(25/(4*sqrt(42))*2^15) + __m128i TWENTYNINE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(18329); // round(29/(4*sqrt(42))*2^14), Q2.14 + __m128i SEVENTEEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(21489); // round(17/(4*sqrt(42))*2^15) + __m128i NINE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(11376); // round(9/(4*sqrt(42))*2^15) + __m128i THIRTEEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(16433); // round(13/(4*sqrt(42))*2^15) + __m128i FIVE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(6320); // round(5/(4*sqrt(42))*2^15) + __m128i ONE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(1264); // round(1/(4*sqrt(42))*2^15) + __m128i ONE_OVER_SQRT_10_Q15 = _mm_set1_epi16(10362); // round(1/sqrt(10)*2^15) + __m128i THREE_OVER_SQRT_10 = _mm_set1_epi16(31086); // round(3/sqrt(10)*2^15) + __m128i SQRT_10_OVER_FOUR = _mm_set1_epi16(25905); // round(sqrt(10)/4*2^15) + + + __m128i ch_mag_int; + __m128i ch_mag_des; + __m128i ch_mag_98_over_42_with_sigma2; + __m128i ch_mag_74_over_42_with_sigma2; + __m128i ch_mag_58_over_42_with_sigma2; + __m128i ch_mag_50_over_42_with_sigma2; + __m128i ch_mag_34_over_42_with_sigma2; + __m128i ch_mag_18_over_42_with_sigma2; + __m128i ch_mag_26_over_42_with_sigma2; + __m128i ch_mag_10_over_42_with_sigma2; + __m128i ch_mag_2_over_42_with_sigma2; + __m128i y0r_one_over_sqrt_21; + __m128i y0r_three_over_sqrt_21; + __m128i y0r_five_over_sqrt_21; + __m128i y0r_seven_over_sqrt_21; + __m128i y0i_one_over_sqrt_21; + __m128i y0i_three_over_sqrt_21; + __m128i y0i_five_over_sqrt_21; + __m128i y0i_seven_over_sqrt_21; + +#elif defined(__arm__) + +#endif int i,j; - ONE_OVER_SQRT_42 = _mm_set1_epi16(10112); // round(1/sqrt(42)*2^16) - THREE_OVER_SQRT_42 = _mm_set1_epi16(30337); // round(3/sqrt(42)*2^16) - FIVE_OVER_SQRT_42 = _mm_set1_epi16(25281); // round(5/sqrt(42)*2^15) - SEVEN_OVER_SQRT_42 = _mm_set1_epi16(17697); // round(5/sqrt(42)*2^15) - ONE_OVER_SQRT_2 = _mm_set1_epi16(23170); // round(1/sqrt(2)*2^15) - ONE_OVER_SQRT_2_42 = _mm_set1_epi16(3575); // round(1/sqrt(2*42)*2^15) - THREE_OVER_SQRT_2_42 = _mm_set1_epi16(10726); // round(3/sqrt(2*42)*2^15) - FIVE_OVER_SQRT_2_42 = _mm_set1_epi16(17876); // round(5/sqrt(2*42)*2^15) - SEVEN_OVER_SQRT_2_42 = _mm_set1_epi16(25027); // round(7/sqrt(2*42)*2^15) - FORTYNINE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(30969); // round(49/(4*sqrt(42))*2^14), Q2.14 - THIRTYSEVEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(23385); // round(37/(4*sqrt(42))*2^14), Q2.14 - TWENTYFIVE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(31601); // round(25/(4*sqrt(42))*2^15) - TWENTYNINE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(18329); // round(29/(4*sqrt(42))*2^15),
Q2.14 - SEVENTEEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(21489); // round(17/(4*sqrt(42))*2^15) - NINE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(11376); // round(9/(4*sqrt(42))*2^15) - THIRTEEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(16433); // round(13/(4*sqrt(42))*2^15) - FIVE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(6320); // round(5/(4*sqrt(42))*2^15) - ONE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(1264); // round(1/(4*sqrt(42))*2^15) - SQRT_42_OVER_FOUR = _mm_set1_epi16(13272); // round(sqrt(42)/4*2^13), Q3.12 - ONE_OVER_SQRT_10_Q15 = _mm_set1_epi16(10362); // round(1/sqrt(10)*2^15) - THREE_OVER_SQRT_10 = _mm_set1_epi16(31086); // round(3/sqrt(10)*2^15) - SQRT_10_OVER_FOUR = _mm_set1_epi16(25905); // round(sqrt(10)/4*2^15) - // ONE_OVER_SQRT_10_Q15 = _mm_set1_epi16(7327); // round(1/sqrt(10)*2^15) - // THREE_OVER_SQRT_10 = _mm_set1_epi16(21981); // round(3/sqrt(10)*2^15) - // SQRT_10_OVER_FOUR = _mm_set1_epi16(25905); // round(sqrt(10)/4*2^15) Q3.13 for (i=0; i<length>>2; i+=2) { +#if defined(__x86_64__) || defined(__i386__) // Get rho xmm0 = rho01_128i[i]; xmm1 = rho01_128i[i+1]; @@ -6252,6 +6512,7 @@ void qam64_qam16(short *stream0_in, y2i = _mm_subs_epi16(logmax_num_re0, logmax_den_re0); + // map to output stream, difficult to do in SIMD since we have 6 16bit LLRs // RE 1 j = 24*i; @@ -6310,26 +6571,32 @@ void qam64_qam16(short *stream0_in, stream0_out[j + 45] = ((short *)&y0i)[7]; stream0_out[j + 46] = ((short *)&y1i)[7]; stream0_out[j + 47] = ((short *)&y2i)[7]; + +#elif defined(__arm__) + +#endif } +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); +#endif } int dlsch_64qam_16qam_llr(LTE_DL_FRAME_PARMS *frame_parms, - int **rxdataF_comp, - int **rxdataF_comp_i, - int **dl_ch_mag, - int **dl_ch_mag_i, - int **rho_i, - short *dlsch_llr, - unsigned char symbol, - unsigned char first_symbol_flag, - unsigned short nb_rb, + int32_t **rxdataF_comp, + int32_t **rxdataF_comp_i, + int32_t **dl_ch_mag, + int32_t **dl_ch_mag_i, + int32_t **rho_i, + int16_t *dlsch_llr, + uint8_t symbol, + uint8_t first_symbol_flag, + uint16_t nb_rb, uint16_t pbch_pss_sss_adjust, - short **llr16p) + int16_t **llr16p) { int16_t *rxF = (int16_t*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; @@ -6386,54 +6653,85 @@ void qam64_qam64(short *stream0_in, short *stream0_out, short *rho01, int length - ) + ) { /* - Author: S. Wagner - Date: 31-07-12 - - Input: - stream0_in: MF filter for 1st stream, i.e., y0=h0'*y - stream1_in: MF filter for 2nd stream, i.e., y1=h1'*y - ch_mag: 4*h0/sqrt(42), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc - ch_mag_i: 4*h1/sqrt(42), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc - rho01: Channel cross correlation, i.e., h1'*h0 - - Output: - stream0_out: output LLRs for 1st stream + Author: S. Wagner + Date: 31-07-12 + + Input: + stream0_in: MF filter for 1st stream, i.e., y0=h0'*y + stream1_in: MF filter for 2nd stream, i.e., y1=h1'*y + ch_mag: 4*h0/sqrt(42), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc + ch_mag_i: 4*h1/sqrt(42), [Re0 Im0 Re1 Im1] s.t. 
Im0=Re0, Im1=Re1, etc + rho01: Channel cross correlation, i.e., h1'*h0 + + Output: + stream0_out: output LLRs for 1st stream */ +#if defined(__x86_64__) || defined(__i386__) + __m128i *rho01_128i = (__m128i *)rho01; __m128i *stream0_128i_in = (__m128i *)stream0_in; __m128i *stream1_128i_in = (__m128i *)stream1_in; __m128i *ch_mag_128i = (__m128i *)ch_mag; __m128i *ch_mag_128i_i = (__m128i *)ch_mag_i; + __m128i ONE_OVER_SQRT_42 = _mm_set1_epi16(10112); // round(1/sqrt(42)*2^16) + __m128i THREE_OVER_SQRT_42 = _mm_set1_epi16(30337); // round(3/sqrt(42)*2^16) + __m128i FIVE_OVER_SQRT_42 = _mm_set1_epi16(25281); // round(5/sqrt(42)*2^15) + __m128i SEVEN_OVER_SQRT_42 = _mm_set1_epi16(17697); // round(7/sqrt(42)*2^14) Q2.14 + __m128i ONE_OVER_SQRT_2 = _mm_set1_epi16(23170); // round(1/sqrt(2)*2^15) + __m128i ONE_OVER_SQRT_2_42 = _mm_set1_epi16(3575); // round(1/sqrt(2*42)*2^15) + __m128i THREE_OVER_SQRT_2_42 = _mm_set1_epi16(10726); // round(3/sqrt(2*42)*2^15) + __m128i FIVE_OVER_SQRT_2_42 = _mm_set1_epi16(17876); // round(5/sqrt(2*42)*2^15) + __m128i SEVEN_OVER_SQRT_2_42 = _mm_set1_epi16(25027); // round(7/sqrt(2*42)*2^15) + __m128i FORTYNINE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(30969); // round(49/(4*sqrt(42))*2^14), Q2.14 + __m128i THIRTYSEVEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(23385); // round(37/(4*sqrt(42))*2^14), Q2.14 + __m128i TWENTYFIVE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(31601); // round(25/(4*sqrt(42))*2^15) + __m128i TWENTYNINE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(18329); // round(29/(4*sqrt(42))*2^14), Q2.14 + __m128i SEVENTEEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(21489); // round(17/(4*sqrt(42))*2^15) + __m128i NINE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(11376); // round(9/(4*sqrt(42))*2^15) + __m128i THIRTEEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(16433); // round(13/(4*sqrt(42))*2^15) + __m128i FIVE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(6320); // round(5/(4*sqrt(42))*2^15) + __m128i ONE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(1264); // round(1/(4*sqrt(42))*2^15) + __m128i SQRT_42_OVER_FOUR = _mm_set1_epi16(13272); // round(sqrt(42)/4*2^13), Q3.12 + + __m128i ch_mag_des; + __m128i ch_mag_int; + __m128i ch_mag_98_over_42_with_sigma2; + __m128i ch_mag_74_over_42_with_sigma2; + __m128i ch_mag_58_over_42_with_sigma2; + __m128i ch_mag_50_over_42_with_sigma2; + __m128i ch_mag_34_over_42_with_sigma2; + __m128i ch_mag_18_over_42_with_sigma2; + __m128i ch_mag_26_over_42_with_sigma2; + __m128i ch_mag_10_over_42_with_sigma2; + __m128i ch_mag_2_over_42_with_sigma2; + __m128i y0r_one_over_sqrt_21; + __m128i y0r_three_over_sqrt_21; + __m128i y0r_five_over_sqrt_21; + __m128i y0r_seven_over_sqrt_21; + __m128i y0i_one_over_sqrt_21; + __m128i y0i_three_over_sqrt_21; + __m128i y0i_five_over_sqrt_21; + __m128i y0i_seven_over_sqrt_21; + __m128i ch_mag_int_with_sigma2; + __m128i two_ch_mag_int_with_sigma2; + __m128i three_ch_mag_int_with_sigma2; +#elif defined(__arm__) + +#endif + int i,j; - ONE_OVER_SQRT_42 = _mm_set1_epi16(10112); // round(1/sqrt(42)*2^16) - THREE_OVER_SQRT_42 = _mm_set1_epi16(30337); // round(3/sqrt(42)*2^16) - FIVE_OVER_SQRT_42 = _mm_set1_epi16(25281); // round(5/sqrt(42)*2^15) - SEVEN_OVER_SQRT_42 = _mm_set1_epi16(17697); // round(7/sqrt(42)*2^14) Q2.14 - ONE_OVER_SQRT_2 = _mm_set1_epi16(23170); // round(1/sqrt(2)*2^15) - ONE_OVER_SQRT_2_42 = _mm_set1_epi16(3575); // round(1/sqrt(2*42)*2^15) - THREE_OVER_SQRT_2_42 = _mm_set1_epi16(10726); // round(3/sqrt(2*42)*2^15) - FIVE_OVER_SQRT_2_42 = _mm_set1_epi16(17876); // round(5/sqrt(2*42)*2^15) - SEVEN_OVER_SQRT_2_42 = _mm_set1_epi16(25027); //
round(7/sqrt(2*42)*2^15) - FORTYNINE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(30969); // round(49/(4*sqrt(42))*2^14), Q2.14 - THIRTYSEVEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(23385); // round(37/(4*sqrt(42))*2^14), Q2.14 - TWENTYFIVE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(31601); // round(25/(4*sqrt(42))*2^15) - TWENTYNINE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(18329); // round(29/(4*sqrt(42))*2^15), Q2.14 - SEVENTEEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(21489); // round(17/(4*sqrt(42))*2^15) - NINE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(11376); // round(9/(4*sqrt(42))*2^15) - THIRTEEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(16433); // round(13/(4*sqrt(42))*2^15) - FIVE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(6320); // round(5/(4*sqrt(42))*2^15) - ONE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(1264); // round(1/(4*sqrt(42))*2^15) - SQRT_42_OVER_FOUR = _mm_set1_epi16(13272); // round(sqrt(42)/4*2^13), Q3.12 for (i=0; i<length>>2; i+=2) { +#if defined(__x86_64__) || defined(__i386__) + // Get rho xmm0 = rho01_128i[i]; xmm1 = rho01_128i[i+1]; @@ -8027,6 +8325,7 @@ void qam64_qam64(short *stream0_in, y2i = _mm_subs_epi16(logmax_num_re0, logmax_den_re0); + // map to output stream, difficult to do in SIMD since we have 6 16bit LLRs // RE 1 j = 24*i; @@ -8085,26 +8384,32 @@ void qam64_qam64(short *stream0_in, stream0_out[j + 45] = ((short *)&y0i)[7]; stream0_out[j + 46] = ((short *)&y1i)[7]; stream0_out[j + 47] = ((short *)&y2i)[7]; + +#elif defined(__arm__) + +#endif + } +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); - +#endif } int dlsch_64qam_64qam_llr(LTE_DL_FRAME_PARMS *frame_parms, - int **rxdataF_comp, - int **rxdataF_comp_i, - int **dl_ch_mag, - int **dl_ch_mag_i, - int **rho_i, - short *dlsch_llr, - unsigned char symbol, - unsigned char first_symbol_flag, - unsigned short nb_rb, + int32_t **rxdataF_comp, + int32_t **rxdataF_comp_i, + int32_t **dl_ch_mag, + int32_t **dl_ch_mag_i, + int32_t **rho_i, + int16_t *dlsch_llr, + uint8_t symbol, + uint8_t first_symbol_flag, + uint16_t nb_rb, uint16_t pbch_pss_sss_adjust, - short **llr16p) + int16_t **llr16p) { int16_t *rxF = (int16_t*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; diff --git a/openair1/PHY/LTE_TRANSPORT/dlsch_modulation.c b/openair1/PHY/LTE_TRANSPORT/dlsch_modulation.c index c4b0247d75..5012d0ac3f 100644 --- a/openair1/PHY/LTE_TRANSPORT/dlsch_modulation.c +++ b/openair1/PHY/LTE_TRANSPORT/dlsch_modulation.c @@ -262,7 +262,7 @@ int allocate_REs_in_RB(LTE_DL_FRAME_PARMS *frame_parms, switch (mod_order0) { case 2: //QPSK - //printf("%d(%d) : %d,%d => ",tti_offset,*jj,((int16_t*)&txdataF[0][tti_offset])[0],((int16_t*)&txdataF[0][tti_offset])[1]); +// printf("%d(%d) : %d,%d => ",tti_offset,*jj,((int16_t*)&txdataF[0][tti_offset])[0],((int16_t*)&txdataF[0][tti_offset])[1]); for (aa=0; aa<frame_parms->nb_antennas_tx; aa++) { ((int16_t*)&txdataF[aa][tti_offset])[0] += (x0[*jj]==1) ? 
(-gain_lin_QPSK) : gain_lin_QPSK; //I //b_i } @@ -275,7 +275,7 @@ int allocate_REs_in_RB(LTE_DL_FRAME_PARMS *frame_parms, *jj = *jj + 1; - // printf("%d,%d\n",((int16_t*)&txdataF[0][tti_offset])[0],((int16_t*)&txdataF[0][tti_offset])[1]); + // printf("%d,%d\n",((int16_t*)&txdataF[0][tti_offset])[0],((int16_t*)&txdataF[0][tti_offset])[1]); break; case 4: //16QAM diff --git a/openair1/PHY/LTE_TRANSPORT/pbch.c b/openair1/PHY/LTE_TRANSPORT/pbch.c index c4b18337a5..0582a63956 100755 --- a/openair1/PHY/LTE_TRANSPORT/pbch.c +++ b/openair1/PHY/LTE_TRANSPORT/pbch.c @@ -531,7 +531,7 @@ uint16_t pbch_extract(int **rxdataF, return(0); } -__m128i avg128; +//__m128i avg128; //compute average channel_level on each (TX,RX) antenna pair int pbch_channel_level(int **dl_ch_estimates_ext, @@ -541,7 +541,14 @@ int pbch_channel_level(int **dl_ch_estimates_ext, int16_t rb, nb_rb=6; uint8_t aatx,aarx; + +#if defined(__x86_64__) || defined(__i386__) + __m128i avg128; __m128i *dl_ch128; +#elif defined(__arm__) + int32x4_t avg128; + int16x8_t *dl_ch128; +#endif int avg1=0,avg2=0; uint32_t nsymb = (frame_parms->Ncp==0) ? 7:6; @@ -550,15 +557,23 @@ int pbch_channel_level(int **dl_ch_estimates_ext, for (aatx=0; aatx<4; aatx++) //frame_parms->nb_antennas_tx_eNB;aatx++) for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) { //clear average level + +#if defined(__x86_64__) || defined(__i386__) avg128 = _mm_setzero_si128(); dl_ch128=(__m128i *)&dl_ch_estimates_ext[(aatx<<1)+aarx][symbol_mod*6*12]; +#elif defined(__arm__) + avg128 = vdupq_n_s32(0); + dl_ch128=(int16x8_t *)&dl_ch_estimates_ext[(aatx<<1)+aarx][symbol_mod*6*12]; +#endif for (rb=0; rb<nb_rb; rb++) { - +#if defined(__x86_64__) || defined(__i386__) avg128 = _mm_add_epi32(avg128,_mm_madd_epi16(dl_ch128[0],dl_ch128[0])); avg128 = _mm_add_epi32(avg128,_mm_madd_epi16(dl_ch128[1],dl_ch128[1])); avg128 = _mm_add_epi32(avg128,_mm_madd_epi16(dl_ch128[2],dl_ch128[2])); - +#elif defined(__arm__) +// to be filled in +#endif dl_ch128+=3; /* if (rb==0) { @@ -579,16 +594,19 @@ int pbch_channel_level(int **dl_ch_estimates_ext, //msg("Channel level : %d, %d\n",avg1, avg2); } - +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); - +#endif return(avg2); } +#if defined(__x86_64__) || defined(__i386__) __m128i mmtmpP0,mmtmpP1,mmtmpP2,mmtmpP3; - +#elif defined(__arm__) +int16x8_t mmtmpP0,mmtmpP1,mmtmpP2,mmtmpP3; +#endif void pbch_channel_compensation(int **rxdataF_ext, int **dl_ch_estimates_ext, int **rxdataF_comp, @@ -599,21 +617,28 @@ void pbch_channel_compensation(int **rxdataF_ext, uint16_t rb,nb_rb=6; uint8_t aatx,aarx,symbol_mod; +#if defined(__x86_64__) || defined(__i386__) __m128i *dl_ch128,*rxdataF128,*rxdataF_comp128; +#elif defined(__arm__) +#endif symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? 
symbol-(7-frame_parms->Ncp) : symbol; for (aatx=0; aatx<4; aatx++) //frame_parms->nb_antennas_tx_eNB;aatx++) for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) { +#if defined(__x86_64__) || defined(__i386__) dl_ch128 = (__m128i *)&dl_ch_estimates_ext[(aatx<<1)+aarx][symbol_mod*6*12]; rxdataF128 = (__m128i *)&rxdataF_ext[aarx][symbol_mod*6*12]; rxdataF_comp128 = (__m128i *)&rxdataF_comp[(aatx<<1)+aarx][symbol_mod*6*12]; +#elif defined(__arm__) +// to be filled in +#endif for (rb=0; rb<nb_rb; rb++) { //printf("rb %d\n",rb); - +#if defined(__x86_64__) || defined(__i386__) // multiply by conjugated channel mmtmpP0 = _mm_madd_epi16(dl_ch128[0],rxdataF128[0]); // print_ints("re",&mmtmpP0); @@ -680,11 +705,15 @@ void pbch_channel_compensation(int **rxdataF_ext, rxdataF128+=2; rxdataF_comp128+=2; } +#elif defined(__arm__) +// to be filled in +#endif } } - +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); +#endif } void pbch_detection_mrc(LTE_DL_FRAME_PARMS *frame_parms, @@ -694,24 +723,38 @@ void pbch_detection_mrc(LTE_DL_FRAME_PARMS *frame_parms, uint8_t aatx, symbol_mod; int i, nb_rb=6; +#if defined(__x86_64__) || defined(__i386__) __m128i *rxdataF_comp128_0,*rxdataF_comp128_1; - +#elif defined(__arm__) + int16x8_t *rxdataF_comp128_0,*rxdataF_comp128_1; +#endif symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? symbol-(7-frame_parms->Ncp) : symbol; if (frame_parms->nb_antennas_rx>1) { for (aatx=0; aatx<4; aatx++) { //frame_parms->nb_antennas_tx_eNB;aatx++) { +#if defined(__x86_64__) || defined(__i386__) rxdataF_comp128_0 = (__m128i *)&rxdataF_comp[(aatx<<1)][symbol_mod*6*12]; rxdataF_comp128_1 = (__m128i *)&rxdataF_comp[(aatx<<1)+1][symbol_mod*6*12]; +#elif defined(__arm__) + rxdataF_comp128_0 = (int16x8_t *)&rxdataF_comp[(aatx<<1)][symbol_mod*6*12]; + rxdataF_comp128_1 = (int16x8_t *)&rxdataF_comp[(aatx<<1)+1][symbol_mod*6*12]; +#endif // MRC on each re of rb, both on MF output and magnitude (for 16QAM/64QAM llr computation) for (i=0; i<nb_rb*3; i++) { +#if defined(__x86_64__) || defined(__i386__) rxdataF_comp128_0[i] = _mm_adds_epi16(_mm_srai_epi16(rxdataF_comp128_0[i],1),_mm_srai_epi16(rxdataF_comp128_1[i],1)); +#elif defined(__arm__) + rxdataF_comp128_0[i] = vhaddq_s16(rxdataF_comp128_0[i],rxdataF_comp128_1[i]); + +#endif } } } - +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); +#endif } void pbch_scrambling(LTE_DL_FRAME_PARMS *frame_parms, @@ -806,9 +849,6 @@ void pbch_alamouti(LTE_DL_FRAME_PARMS *frame_parms, } - _mm_empty(); - _m_empty(); - } void pbch_quantize(int8_t *pbch_llr8, diff --git a/openair1/PHY/LTE_TRANSPORT/pmch.c b/openair1/PHY/LTE_TRANSPORT/pmch.c index 3082bff917..dfa38fca59 100644 --- a/openair1/PHY/LTE_TRANSPORT/pmch.c +++ b/openair1/PHY/LTE_TRANSPORT/pmch.c @@ -396,22 +396,33 @@ void mch_channel_level(int **dl_ch_estimates_ext, { int i,aarx,nre; +#if defined(__x86_64__) || defined(__i386__) __m128i *dl_ch128,avg128; - +#elif defined(__arm__) + int32x4_t avg128; +#endif for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) { - //clear average level +#if defined(__x86_64__) || defined(__i386__) + //clear average level avg128 = _mm_setzero_si128(); // 5 is always a symbol with no pilots for both normal and extended prefix dl_ch128=(__m128i *)&dl_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*12]; +#elif defined(__arm__) + +#endif if ((symbol == 2) || (symbol == 6) || (symbol == 10)) nre = (frame_parms->N_RB_DL*6); else nre = (frame_parms->N_RB_DL*12); for (i=0; i<(nre>>2); i++) { +#if defined(__x86_64__) || defined(__i386__) 
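+ // Editor's note: _mm_madd_epi16(h,h) below forms Re^2+Im^2 for four
+ // complex int16 samples at once; the __arm__ branch of this loop is still
+ // empty. One possible NEON equivalent (untested sketch; it assumes the
+ // ARM branch also declares an int16x8_t *dl_ch128 alongside the
+ // int32x4_t avg128 declared for ARM above):
+ //
+ //   int16x8_t h     = dl_ch128[0];
+ //   int32x4_t sq_lo = vmull_s16(vget_low_s16(h), vget_low_s16(h));
+ //   int32x4_t sq_hi = vmull_s16(vget_high_s16(h), vget_high_s16(h));
+ //   int32x2_t p_lo  = vpadd_s32(vget_low_s32(sq_lo), vget_high_s32(sq_lo));
+ //   int32x2_t p_hi  = vpadd_s32(vget_low_s32(sq_hi), vget_high_s32(sq_hi));
+ //   avg128 = vaddq_s32(avg128, vcombine_s32(p_lo, p_hi));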
avg128 = _mm_add_epi32(avg128,_mm_madd_epi16(dl_ch128[0],dl_ch128[0])); +#elif defined(__arm__) + +#endif } avg[aarx] = (((int*)&avg128)[0] + @@ -422,9 +433,10 @@ void mch_channel_level(int **dl_ch_estimates_ext, // printf("Channel level : %d\n",avg[(aatx<<1)+aarx]); } +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); - +#endif } void mch_channel_compensation(int **rxdataF_ext, @@ -439,14 +451,18 @@ void mch_channel_compensation(int **rxdataF_ext, { int aarx,nre,i; +#if defined(__x86_64__) || defined(__i386__) __m128i *dl_ch128,*dl_ch_mag128,*dl_ch_mag128b,*rxdataF128,*rxdataF_comp128; __m128i mmtmpD0,mmtmpD1,mmtmpD2,mmtmpD3,QAM_amp128,QAM_amp128b; +#elif defined(__arm__) +#endif if ((symbol == 2) || (symbol == 6) || (symbol == 10)) nre = frame_parms->N_RB_DL*6; else nre = frame_parms->N_RB_DL*12; +#if defined(__x86_64__) || defined(__i386__) if (mod_order == 4) { QAM_amp128 = _mm_set1_epi16(QAM16_n1); // 2/sqrt(10) QAM_amp128b = _mm_setzero_si128(); @@ -454,21 +470,27 @@ void mch_channel_compensation(int **rxdataF_ext, QAM_amp128 = _mm_set1_epi16(QAM64_n1); // QAM_amp128b = _mm_set1_epi16(QAM64_n2); } +#elif defined(__arm__) - +#endif for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) { +#if defined(__x86_64__) || defined(__i386__) + dl_ch128 = (__m128i *)&dl_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*12]; dl_ch_mag128 = (__m128i *)&dl_ch_mag[aarx][symbol*frame_parms->N_RB_DL*12]; dl_ch_mag128b = (__m128i *)&dl_ch_magb[aarx][symbol*frame_parms->N_RB_DL*12]; rxdataF128 = (__m128i *)&rxdataF_ext[aarx][symbol*frame_parms->N_RB_DL*12]; rxdataF_comp128 = (__m128i *)&rxdataF_comp[aarx][symbol*frame_parms->N_RB_DL*12]; +#elif defined(__arm__) +#endif for (i=0; i<(nre>>2); i+=2) { if (mod_order>2) { // get channel amplitude if not QPSK +#if defined(__x86_64__) || defined(__i386__) mmtmpD0 = _mm_madd_epi16(dl_ch128[0],dl_ch128[0]); mmtmpD0 = _mm_srai_epi32(mmtmpD0,output_shift); @@ -498,8 +520,13 @@ void mch_channel_compensation(int **rxdataF_ext, dl_ch_mag128b[1] = _mm_mulhi_epi16(dl_ch_mag128b[1],QAM_amp128b); dl_ch_mag128b[1] = _mm_slli_epi16(dl_ch_mag128b[1],1); +#elif defined(__arm__) + +#endif } +#if defined(__x86_64__) || defined(__i386__) + // multiply by conjugated channel mmtmpD0 = _mm_madd_epi16(dl_ch128[0],rxdataF128[0]); // print_ints("re",&mmtmpD0); @@ -548,12 +575,17 @@ void mch_channel_compensation(int **rxdataF_ext, rxdataF128+=2; rxdataF_comp128+=2; +#elif defined(__arm__) +#endif } } +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); +#endif + } void mch_detection_mrc(LTE_DL_FRAME_PARMS *frame_parms, @@ -565,10 +597,15 @@ void mch_detection_mrc(LTE_DL_FRAME_PARMS *frame_parms, int i; +#if defined(__x86_64__) || defined(__i386__) __m128i *rxdataF_comp128_0,*rxdataF_comp128_1,*dl_ch_mag128_0,*dl_ch_mag128_1,*dl_ch_mag128_0b,*dl_ch_mag128_1b; - +#elif defined(__arm__) + int16x8_t *rxdataF_comp128_0,*rxdataF_comp128_1,*dl_ch_mag128_0,*dl_ch_mag128_1,*dl_ch_mag128_0b,*dl_ch_mag128_1b; +#endif if (frame_parms->nb_antennas_rx>1) { +#if defined(__x86_64__) || defined(__i386__) + rxdataF_comp128_0 = (__m128i *)&rxdataF_comp[0][symbol*frame_parms->N_RB_DL*12]; rxdataF_comp128_1 = (__m128i *)&rxdataF_comp[1][symbol*frame_parms->N_RB_DL*12]; dl_ch_mag128_0 = (__m128i *)&dl_ch_mag[0][symbol*frame_parms->N_RB_DL*12]; @@ -576,16 +613,32 @@ void mch_detection_mrc(LTE_DL_FRAME_PARMS *frame_parms, dl_ch_mag128_0b = (__m128i *)&dl_ch_magb[0][symbol*frame_parms->N_RB_DL*12]; dl_ch_mag128_1b = (__m128i 
*)&dl_ch_magb[1][symbol*frame_parms->N_RB_DL*12]; +#elif defined(__arm__) + rxdataF_comp128_0 = (int16x8_t *)&rxdataF_comp[0][symbol*frame_parms->N_RB_DL*12]; + rxdataF_comp128_1 = (int16x8_t *)&rxdataF_comp[1][symbol*frame_parms->N_RB_DL*12]; + dl_ch_mag128_0 = (int16x8_t *)&dl_ch_mag[0][symbol*frame_parms->N_RB_DL*12]; + dl_ch_mag128_1 = (int16x8_t *)&dl_ch_mag[1][symbol*frame_parms->N_RB_DL*12]; + dl_ch_mag128_0b = (int16x8_t *)&dl_ch_magb[0][symbol*frame_parms->N_RB_DL*12]; + dl_ch_mag128_1b = (int16x8_t *)&dl_ch_magb[1][symbol*frame_parms->N_RB_DL*12]; + +#endif // MRC on each re of rb, both on MF output and magnitude (for 16QAM/64QAM llr computation) for (i=0; i<frame_parms->N_RB_DL*3; i++) { +#if defined(__x86_64__) || defined(__i386__) rxdataF_comp128_0[i] = _mm_adds_epi16(_mm_srai_epi16(rxdataF_comp128_0[i],1),_mm_srai_epi16(rxdataF_comp128_1[i],1)); dl_ch_mag128_0[i] = _mm_adds_epi16(_mm_srai_epi16(dl_ch_mag128_0[i],1),_mm_srai_epi16(dl_ch_mag128_1[i],1)); dl_ch_mag128_0b[i] = _mm_adds_epi16(_mm_srai_epi16(dl_ch_mag128_0b[i],1),_mm_srai_epi16(dl_ch_mag128_1b[i],1)); +#elif defined(__arm__) + rxdataF_comp128_0[i] = vhaddq_s16(rxdataF_comp128_0[i],rxdataF_comp128_1[i]); + dl_ch_mag128_0[i] = vhaddq_s16(dl_ch_mag128_0[i],dl_ch_mag128_1[i]); + dl_ch_mag128_0b[i] = vhaddq_s16(dl_ch_mag128_0b[i],dl_ch_mag128_1b[i]); +#endif } } - +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); +#endif } int mch_qpsk_llr(LTE_DL_FRAME_PARMS *frame_parms, @@ -626,8 +679,10 @@ int mch_qpsk_llr(LTE_DL_FRAME_PARMS *frame_parms, *llr32p = (short *)llr32; +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); +#endif return(0); } @@ -644,22 +699,38 @@ void mch_16qam_llr(LTE_DL_FRAME_PARMS *frame_parms, int16_t **llr32p) { +#if defined(__x86_64__) || defined(__i386__) __m128i *rxF = (__m128i*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; __m128i *ch_mag; __m128i llr128[2],xmm0; + uint32_t *llr32; +#elif defined(__arm__) + int16x8_t *rxF = (int16x8_t*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; + int16x8_t *ch_mag; + int16x8_t llr128[2],xmm0; + int16_t *llr16; +#endif int i,len; unsigned char len_mod4=0; - uint32_t *llr32; +#if defined(__x86_64__) || defined(__i386__) if (symbol==2) { llr32 = (uint32_t*)dlsch_llr; } else { llr32 = (uint32_t*)*llr32p; } - - +#elif defined(__arm__) + if (symbol==2) { + llr16 = (int16_t*)dlsch_llr; + } else { + llr16 = (int16_t*)*llr32p; + } +#endif +#if defined(__x86_64__) || defined(__i386__) ch_mag = (__m128i*)&dl_ch_mag[0][(symbol*frame_parms->N_RB_DL*12)]; - +#elif defined(__arm__) + ch_mag = (int16x8_t*)&dl_ch_mag[0][(symbol*frame_parms->N_RB_DL*12)]; +#endif if ((symbol==2) || (symbol==6) || (symbol==10)) { len = frame_parms->N_RB_DL*6; } else { @@ -680,6 +751,7 @@ void mch_16qam_llr(LTE_DL_FRAME_PARMS *frame_parms, for (i=0; i<len; i++) { +#if defined(__x86_64__) || defined(__i386__) xmm0 = _mm_abs_epi16(rxF[i]); xmm0 = _mm_subs_epi16(ch_mag[i],xmm0); @@ -695,10 +767,38 @@ void mch_16qam_llr(LTE_DL_FRAME_PARMS *frame_parms, llr32[6] = ((uint32_t *)&llr128[1])[2]; llr32[7] = ((uint32_t *)&llr128[1])[3]; llr32+=8; + +#elif defined(__arm__) + xmm0 = vabsq_s16(rxF[i]); + xmm0 = vsubq_s16(ch_mag[i],xmm0); + + // lambda_1=y_R, lambda_2=|y_R|-|h|^2, lambda_3=y_I, lambda_4=|y_I|-|h|^2 + + llr16[0] = vgetq_lane_s16(rxF[i],0); + llr16[1] = vgetq_lane_s16(xmm0,0); + llr16[2] = vgetq_lane_s16(rxF[i],1); + llr16[3] = vgetq_lane_s16(xmm0,1); + llr16[4] = vgetq_lane_s16(rxF[i],2); + llr16[5] = vgetq_lane_s16(xmm0,2); + llr16[6] =
vgetq_lane_s16(rxF[i],3); + llr16[7] = vgetq_lane_s16(xmm0,3); + llr16[8] = vgetq_lane_s16(rxF[i],4); + llr16[9] = vgetq_lane_s16(xmm0,4); + llr16[10] = vgetq_lane_s16(rxF[i],5); + llr16[11] = vgetq_lane_s16(xmm0,5); + llr16[12] = vgetq_lane_s16(rxF[i],6); + llr16[13] = vgetq_lane_s16(xmm0,6); + llr16[14] = vgetq_lane_s16(rxF[i],7); + llr16[15] = vgetq_lane_s16(xmm0,7); + llr16+=16; +#endif + } +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); +#endif } //---------------------------------------------------------------------------------------------- @@ -714,8 +814,13 @@ void mch_64qam_llr(LTE_DL_FRAME_PARMS *frame_parms, short **llr_save) { +#if defined(__x86_64__) || defined(__i386__) __m128i xmm1,xmm2,*ch_mag,*ch_magb; __m128i *rxF = (__m128i*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; +#elif defined(__arm__) + int16x8_t xmm1,xmm2,*ch_mag,*ch_magb; + int16x8_t *rxF = (int16x8_t*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; +#endif int i,len,len2; // int j=0; @@ -728,9 +833,13 @@ void mch_64qam_llr(LTE_DL_FRAME_PARMS *frame_parms, else llr = *llr_save; +#if defined(__x86_64__) || defined(__i386__) ch_mag = (__m128i*)&dl_ch_mag[0][(symbol*frame_parms->N_RB_DL*12)]; ch_magb = (__m128i*)&dl_ch_magb[0][(symbol*frame_parms->N_RB_DL*12)]; - +#elif defined(__arm__) + ch_mag = (int16x8_t*)&dl_ch_mag[0][(symbol*frame_parms->N_RB_DL*12)]; + ch_magb = (int16x8_t*)&dl_ch_magb[0][(symbol*frame_parms->N_RB_DL*12)]; +#endif if ((symbol==2) || (symbol==6) || (symbol==10)) { len = frame_parms->N_RB_DL*6; } else { @@ -747,11 +856,18 @@ void mch_64qam_llr(LTE_DL_FRAME_PARMS *frame_parms, for (i=0; i<len2; i++) { - +#if defined(__x86_64__) || defined(__i386__) xmm1 = _mm_abs_epi16(rxF[i]); xmm1 = _mm_subs_epi16(ch_mag[i],xmm1); xmm2 = _mm_abs_epi16(xmm1); xmm2 = _mm_subs_epi16(ch_magb[i],xmm2); +#elif defined(__arm__) + xmm1 = vabsq_s16(rxF[i]); + xmm1 = vsubq_s16(ch_mag[i],xmm1); + xmm2 = vabsq_s16(xmm1); + xmm2 = vsubq_s16(ch_magb[i],xmm2); +#endif + /* printf("pmch i: %d => mag (%d,%d) (%d,%d)\n",i,((short *)&ch_mag[i])[0],((short *)&ch_magb[i])[0], ((short *)&rxF[i])[0],((short *)&rxF[i])[1]); @@ -771,41 +887,68 @@ */ llr2[0] = ((short *)&rxF[i])[0]; llr2[1] = ((short *)&rxF[i])[1]; +#if defined(__x86_64__) || defined(__i386__) llr2[2] = _mm_extract_epi16(xmm1,0); llr2[3] = _mm_extract_epi16(xmm1,1);//((short *)&xmm1)[j+1]; llr2[4] = _mm_extract_epi16(xmm2,0);//((short *)&xmm2)[j]; llr2[5] = _mm_extract_epi16(xmm2,1);//((short *)&xmm2)[j+1]; +#elif defined(__arm__) + llr2[2] = vgetq_lane_s16(xmm1,0); + llr2[3] = vgetq_lane_s16(xmm1,1);//((short *)&xmm1)[j+1]; + llr2[4] = vgetq_lane_s16(xmm2,0);//((short *)&xmm2)[j]; + llr2[5] = vgetq_lane_s16(xmm2,1);//((short *)&xmm2)[j+1]; +#endif llr2+=6; llr2[0] = ((short *)&rxF[i])[2]; llr2[1] = ((short *)&rxF[i])[3]; +#if defined(__x86_64__) || defined(__i386__) llr2[2] = _mm_extract_epi16(xmm1,2); llr2[3] = _mm_extract_epi16(xmm1,3);//((short *)&xmm1)[j+1]; llr2[4] = _mm_extract_epi16(xmm2,2);//((short *)&xmm2)[j]; llr2[5] = _mm_extract_epi16(xmm2,3);//((short *)&xmm2)[j+1]; - +#elif defined(__arm__) + llr2[2] = vgetq_lane_s16(xmm1,2); + llr2[3] = vgetq_lane_s16(xmm1,3);//((short *)&xmm1)[j+1]; + llr2[4] = vgetq_lane_s16(xmm2,2);//((short *)&xmm2)[j]; + llr2[5] = vgetq_lane_s16(xmm2,3);//((short *)&xmm2)[j+1]; +#endif llr2+=6; llr2[0] = ((short *)&rxF[i])[4]; llr2[1] = ((short *)&rxF[i])[5]; +#if defined(__x86_64__) || defined(__i386__) llr2[2] = _mm_extract_epi16(xmm1,4);
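+ // Editor's note: per resource element the six 64-QAM LLRs packed into
+ // llr2[] here reduce, in scalar form, to
+ //   llr2[0] = y_R;                 llr2[1] = y_I;
+ //   llr2[2] = mag  - abs(y_R);     llr2[3] = mag  - abs(y_I);     // xmm1
+ //   llr2[4] = magb - abs(llr2[2]); llr2[5] = magb - abs(llr2[3]); // xmm2
+ // where mag/magb are the two |h|^2-derived thresholds from dl_ch_mag and
+ // dl_ch_magb; the SSE and NEON lane extractions implement exactly this.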
llr2[3] = _mm_extract_epi16(xmm1,5);//((short *)&xmm1)[j+1]; llr2[4] = _mm_extract_epi16(xmm2,4);//((short *)&xmm2)[j]; llr2[5] = _mm_extract_epi16(xmm2,5);//((short *)&xmm2)[j+1]; - +#elif defined(__arm__) + llr2[2] = vgetq_lane_s16(xmm1,4); + llr2[3] = vgetq_lane_s16(xmm1,5);//((short *)&xmm1)[j+1]; + llr2[4] = vgetq_lane_s16(xmm2,4);//((short *)&xmm2)[j]; + llr2[5] = vgetq_lane_s16(xmm2,5);//((short *)&xmm2)[j+1]; +#endif llr2+=6; llr2[0] = ((short *)&rxF[i])[6]; llr2[1] = ((short *)&rxF[i])[7]; +#if defined(__x86_64__) || defined(__i386__) llr2[2] = _mm_extract_epi16(xmm1,6); llr2[3] = _mm_extract_epi16(xmm1,7);//((short *)&xmm1)[j+1]; llr2[4] = _mm_extract_epi16(xmm2,6);//((short *)&xmm2)[j]; llr2[5] = _mm_extract_epi16(xmm2,7);//((short *)&xmm2)[j+1]; - +#elif defined(__arm__) + llr2[2] = vgetq_lane_s16(xmm1,6); + llr2[3] = vgetq_lane_s16(xmm1,7);//((short *)&xmm1)[j+1]; + llr2[4] = vgetq_lane_s16(xmm2,6);//((short *)&xmm2)[j]; + llr2[5] = vgetq_lane_s16(xmm2,7);//((short *)&xmm2)[j+1]; +#endif llr2+=6; } *llr_save = llr; +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); +#endif } int avg_pmch[4]; diff --git a/openair1/PHY/LTE_TRANSPORT/prach.c b/openair1/PHY/LTE_TRANSPORT/prach.c index 4749598616..73c3957b6d 100644 --- a/openair1/PHY/LTE_TRANSPORT/prach.c +++ b/openair1/PHY/LTE_TRANSPORT/prach.c @@ -998,10 +998,7 @@ int32_t generate_prach( PHY_VARS_UE *phy_vars_ue, uint8_t eNB_id, uint8_t subfra return signal_energy( (int*)prach, 256 ); } - - - -__m128i mmtmpX0,mmtmpX1,mmtmpX2,mmtmpX3; +//__m128i mmtmpX0,mmtmpX1,mmtmpX2,mmtmpX3; void rx_prach(PHY_VARS_eNB *phy_vars_eNB,uint8_t subframe,uint16_t *preamble_energy_list, uint16_t *preamble_delay_list, uint16_t Nf, uint8_t tdd_mapindex) { diff --git a/openair1/PHY/LTE_TRANSPORT/proto.h b/openair1/PHY/LTE_TRANSPORT/proto.h index 5fe7518004..0f671695d9 100644 --- a/openair1/PHY/LTE_TRANSPORT/proto.h +++ b/openair1/PHY/LTE_TRANSPORT/proto.h @@ -48,7 +48,7 @@ * @{ */ -/** \fn free_eNB_dlsch(LTE_eNB_DLSCH_t *dlsch) +/** \fn free_eNB_dlsch(LTE_eNB_DLSCH_t *dlsch,unsigned char N_RB_DL) \brief This function frees memory allocated for a particular DLSCH at eNB @param dlsch Pointer to DLSCH to be removed */ @@ -74,9 +74,7 @@ void free_ue_dlsch(LTE_UE_DLSCH_t *dlsch); LTE_UE_DLSCH_t *new_ue_dlsch(uint8_t Kmimo,uint8_t Mdlharq,uint8_t max_turbo_iterations,uint8_t N_RB_DL, uint8_t abstraction_flag); -void free_eNB_dlsch(LTE_eNB_DLSCH_t *dlsch); -LTE_eNB_ULSCH_t *new_eNB_ulsch(uint8_t Mdlharq,uint8_t max_turbo_iterations,uint8_t N_RB_UL, uint8_t abstraction_flag); void clean_eNb_ulsch(LTE_eNB_ULSCH_t *ulsch, uint8_t abstraction_flag); diff --git a/openair1/PHY/LTE_TRANSPORT/ulsch_demodulation.c b/openair1/PHY/LTE_TRANSPORT/ulsch_demodulation.c index c6c7876c09..3ec50fda82 100644 --- a/openair1/PHY/LTE_TRANSPORT/ulsch_demodulation.c +++ b/openair1/PHY/LTE_TRANSPORT/ulsch_demodulation.c @@ -52,15 +52,19 @@ //extern int **ulchmag_eren; //eren - static short jitter[8] __attribute__ ((aligned(16))) = {1,0,0,1,0,1,1,0}; static short jitterc[8] __attribute__ ((aligned(16))) = {0,1,1,0,1,0,0,1}; #ifndef OFDMA_ULSCH void lte_idft(LTE_DL_FRAME_PARMS *frame_parms,uint32_t *z, uint16_t Msc_PUSCH) { - +#if defined(__x86_64__) || defined(__i386__) __m128i idft_in128[3][1200],idft_out128[3][1200]; + __m128i norm128; +#elif defined(__arm__) + int16x8_t idft_in128[3][1200],idft_out128[3][1200]; + int16x8_t norm128; +#endif int16_t *idft_in0=(int16_t*)idft_in128[0],*idft_out0=(int16_t*)idft_out128[0]; int16_t 
*idft_in1=(int16_t*)idft_in128[1],*idft_out1=(int16_t*)idft_out128[1]; int16_t *idft_in2=(int16_t*)idft_in128[2],*idft_out2=(int16_t*)idft_out128[2]; @@ -68,7 +72,7 @@ void lte_idft(LTE_DL_FRAME_PARMS *frame_parms,uint32_t *z, uint16_t Msc_PUSCH) uint32_t *z0,*z1,*z2,*z3,*z4,*z5,*z6,*z7,*z8,*z9,*z10=NULL,*z11=NULL; int i,ip; - __m128i norm128; + // printf("Doing lte_idft for Msc_PUSCH %d\n",Msc_PUSCH); @@ -108,6 +112,7 @@ void lte_idft(LTE_DL_FRAME_PARMS *frame_parms,uint32_t *z, uint16_t Msc_PUSCH) // conjugate input for (i=0; i<(Msc_PUSCH>>2); i++) { +#if defined(__x86_64__)||defined(__i386__) *&(((__m128i*)z0)[i])=_mm_sign_epi16(*&(((__m128i*)z0)[i]),*(__m128i*)&conjugate2[0]); *&(((__m128i*)z1)[i])=_mm_sign_epi16(*&(((__m128i*)z1)[i]),*(__m128i*)&conjugate2[0]); *&(((__m128i*)z2)[i])=_mm_sign_epi16(*&(((__m128i*)z2)[i]),*(__m128i*)&conjugate2[0]); @@ -119,10 +124,29 @@ void lte_idft(LTE_DL_FRAME_PARMS *frame_parms,uint32_t *z, uint16_t Msc_PUSCH) *&(((__m128i*)z8)[i])=_mm_sign_epi16(*&(((__m128i*)z8)[i]),*(__m128i*)&conjugate2[0]); *&(((__m128i*)z9)[i])=_mm_sign_epi16(*&(((__m128i*)z9)[i]),*(__m128i*)&conjugate2[0]); - if (frame_parms->Ncp==0) { + if (frame_parms->Ncp==NORMAL) { *&(((__m128i*)z10)[i])=_mm_sign_epi16(*&(((__m128i*)z10)[i]),*(__m128i*)&conjugate2[0]); *&(((__m128i*)z11)[i])=_mm_sign_epi16(*&(((__m128i*)z11)[i]),*(__m128i*)&conjugate2[0]); } +#elif defined(__arm__) + *&(((int16x8_t*)z0)[i])=vmulq_s16(*&(((int16x8_t*)z0)[i]),*(int16x8_t*)&conjugate2[0]); + *&(((int16x8_t*)z1)[i])=vmulq_s16(*&(((int16x8_t*)z1)[i]),*(int16x8_t*)&conjugate2[0]); + *&(((int16x8_t*)z2)[i])=vmulq_s16(*&(((int16x8_t*)z2)[i]),*(int16x8_t*)&conjugate2[0]); + *&(((int16x8_t*)z3)[i])=vmulq_s16(*&(((int16x8_t*)z3)[i]),*(int16x8_t*)&conjugate2[0]); + *&(((int16x8_t*)z4)[i])=vmulq_s16(*&(((int16x8_t*)z4)[i]),*(int16x8_t*)&conjugate2[0]); + *&(((int16x8_t*)z5)[i])=vmulq_s16(*&(((int16x8_t*)z5)[i]),*(int16x8_t*)&conjugate2[0]); + *&(((int16x8_t*)z6)[i])=vmulq_s16(*&(((int16x8_t*)z6)[i]),*(int16x8_t*)&conjugate2[0]); + *&(((int16x8_t*)z7)[i])=vmulq_s16(*&(((int16x8_t*)z7)[i]),*(int16x8_t*)&conjugate2[0]); + *&(((int16x8_t*)z8)[i])=vmulq_s16(*&(((int16x8_t*)z8)[i]),*(int16x8_t*)&conjugate2[0]); + *&(((int16x8_t*)z9)[i])=vmulq_s16(*&(((int16x8_t*)z9)[i]),*(int16x8_t*)&conjugate2[0]); + + + if (frame_parms->Ncp==NORMAL) { + *&(((int16x8_t*)z10)[i])=vmulq_s16(*&(((int16x8_t*)z10)[i]),*(int16x8_t*)&conjugate2[0]); + *&(((int16x8_t*)z11)[i])=vmulq_s16(*&(((int16x8_t*)z11)[i]),*(int16x8_t*)&conjugate2[0]); + } + +#endif } for (i=0,ip=0; i<Msc_PUSCH; i++,ip+=4) { @@ -150,23 +174,21 @@ void lte_idft(LTE_DL_FRAME_PARMS *frame_parms,uint32_t *z, uint16_t Msc_PUSCH) dft12((int16_t *)idft_in1,(int16_t *)idft_out1); dft12((int16_t *)idft_in2,(int16_t *)idft_out2); - /* - dft12f(&((__m128i *)idft_in0)[0],&((__m128i *)idft_in0)[1],&((__m128i *)idft_in0)[2],&((__m128i *)idft_in0)[3],&((__m128i *)idft_in0)[4],&((__m128i *)idft_in0)[5],&((__m128i *)idft_in0)[6],&((__m128i *)idft_in0)[7],&((__m128i *)idft_in0)[8],&((__m128i *)idft_in0)[9],&((__m128i *)idft_in0)[10],&((__m128i *)idft_in0)[11], - &((__m128i *)idft_out0)[0],&((__m128i *)idft_out0)[1],&((__m128i *)idft_out0)[2],&((__m128i *)idft_out0)[3],&((__m128i *)idft_out0)[4],&((__m128i *)idft_out0)[5],&((__m128i *)idft_out0)[6],&((__m128i *)idft_out0)[7],&((__m128i *)idft_out0)[8],&((__m128i *)idft_out0)[9],&((__m128i *)idft_out0)[10],&((__m128i *)idft_out0)[11]); - - dft12f(&((__m128i *)idft_in1)[0],&((__m128i *)idft_in1)[1],&((__m128i *)idft_in1)[2],&((__m128i 
*)idft_in1)[3],&((__m128i *)idft_in1)[4],&((__m128i *)idft_in1)[5],&((__m128i *)idft_in1)[6],&((__m128i *)idft_in1)[7],&((__m128i *)idft_in1)[8],&((__m128i *)idft_in1)[9],&((__m128i *)idft_in1)[10],&((__m128i *)idft_in1)[11], - &((__m128i *)idft_out1)[0],&((__m128i *)idft_out1)[1],&((__m128i *)idft_out1)[2],&((__m128i *)idft_out1)[3],&((__m128i *)idft_out1)[4],&((__m128i *)idft_out1)[5],&((__m128i *)idft_out1)[6],&((__m128i *)idft_out1)[7],&((__m128i *)idft_out1)[8],&((__m128i *)idft_out1)[9],&((__m128i *)idft_out1)[10],&((__m128i *)idft_out1)[11]); - - dft12f(&((__m128i *)idft_in2)[0],&((__m128i *)idft_in2)[1],&((__m128i *)idft_in2)[2],&((__m128i *)idft_in2)[3],&((__m128i *)idft_in2)[4],&((__m128i *)idft_in2)[5],&((__m128i *)idft_in2)[6],&((__m128i *)idft_in2)[7],&((__m128i *)idft_in2)[8],&((__m128i *)idft_in2)[9],&((__m128i *)idft_in2)[10],&((__m128i *)idft_in2)[11], - &((__m128i *)idft_out2)[0],&((__m128i *)idft_out2)[1],&((__m128i *)idft_out2)[2],&((__m128i *)idft_out2)[3],&((__m128i *)idft_out2)[4],&((__m128i *)idft_out2)[5],&((__m128i *)idft_out2)[6],&((__m128i *)idft_out2)[7],&((__m128i *)idft_out2)[8],&((__m128i *)idft_out2)[9],&((__m128i *)idft_out2)[10],&((__m128i *)idft_out2)[11]); - */ - +#if defined(__x86_64__)||defined(__i386__) norm128 = _mm_set1_epi16(9459); - +#elif defined(__arm__) + norm128 = vdupq_n_s16(9459); +#endif for (i=0; i<12; i++) { +#if defined(__x86_64__)||defined(__i386__) ((__m128i*)idft_out0)[i] = _mm_slli_epi16(_mm_mulhi_epi16(((__m128i*)idft_out0)[i],norm128),1); ((__m128i*)idft_out1)[i] = _mm_slli_epi16(_mm_mulhi_epi16(((__m128i*)idft_out1)[i],norm128),1); ((__m128i*)idft_out2)[i] = _mm_slli_epi16(_mm_mulhi_epi16(((__m128i*)idft_out2)[i],norm128),1); +#elif defined(__arm__) + ((int16x8_t*)idft_out0)[i] = vqdmulhq_s16(((int16x8_t*)idft_out0)[i],norm128); + ((int16x8_t*)idft_out1)[i] = vqdmulhq_s16(((int16x8_t*)idft_out1)[i],norm128); + ((int16x8_t*)idft_out2)[i] = vqdmulhq_s16(((int16x8_t*)idft_out2)[i],norm128); +#endif } break; @@ -398,6 +420,7 @@ void lte_idft(LTE_DL_FRAME_PARMS *frame_parms,uint32_t *z, uint16_t Msc_PUSCH) // conjugate output for (i=0; i<(Msc_PUSCH>>2); i++) { +#if defined(__x86_64__) || defined(__i386__) ((__m128i*)z0)[i]=_mm_sign_epi16(((__m128i*)z0)[i],*(__m128i*)&conjugate2[0]); ((__m128i*)z1)[i]=_mm_sign_epi16(((__m128i*)z1)[i],*(__m128i*)&conjugate2[0]); ((__m128i*)z2)[i]=_mm_sign_epi16(((__m128i*)z2)[i],*(__m128i*)&conjugate2[0]); @@ -409,12 +432,36 @@ void lte_idft(LTE_DL_FRAME_PARMS *frame_parms,uint32_t *z, uint16_t Msc_PUSCH) ((__m128i*)z8)[i]=_mm_sign_epi16(((__m128i*)z8)[i],*(__m128i*)&conjugate2[0]); ((__m128i*)z9)[i]=_mm_sign_epi16(((__m128i*)z9)[i],*(__m128i*)&conjugate2[0]); - if (frame_parms->Ncp==0) { + if (frame_parms->Ncp==NORMAL) { ((__m128i*)z10)[i]=_mm_sign_epi16(((__m128i*)z10)[i],*(__m128i*)&conjugate2[0]); ((__m128i*)z11)[i]=_mm_sign_epi16(((__m128i*)z11)[i],*(__m128i*)&conjugate2[0]); } +#elif defined(__arm__) + *&(((int16x8_t*)z0)[i])=vmulq_s16(*&(((int16x8_t*)z0)[i]),*(int16x8_t*)&conjugate2[0]); + *&(((int16x8_t*)z1)[i])=vmulq_s16(*&(((int16x8_t*)z1)[i]),*(int16x8_t*)&conjugate2[0]); + *&(((int16x8_t*)z2)[i])=vmulq_s16(*&(((int16x8_t*)z2)[i]),*(int16x8_t*)&conjugate2[0]); + *&(((int16x8_t*)z3)[i])=vmulq_s16(*&(((int16x8_t*)z3)[i]),*(int16x8_t*)&conjugate2[0]); + *&(((int16x8_t*)z4)[i])=vmulq_s16(*&(((int16x8_t*)z4)[i]),*(int16x8_t*)&conjugate2[0]); + *&(((int16x8_t*)z5)[i])=vmulq_s16(*&(((int16x8_t*)z5)[i]),*(int16x8_t*)&conjugate2[0]); + 
*&(((int16x8_t*)z6)[i])=vmulq_s16(*&(((int16x8_t*)z6)[i]),*(int16x8_t*)&conjugate2[0]); + *&(((int16x8_t*)z7)[i])=vmulq_s16(*&(((int16x8_t*)z7)[i]),*(int16x8_t*)&conjugate2[0]); + *&(((int16x8_t*)z8)[i])=vmulq_s16(*&(((int16x8_t*)z8)[i]),*(int16x8_t*)&conjugate2[0]); + *&(((int16x8_t*)z9)[i])=vmulq_s16(*&(((int16x8_t*)z9)[i]),*(int16x8_t*)&conjugate2[0]); + + + if (frame_parms->Ncp==NORMAL) { + *&(((int16x8_t*)z10)[i])=vmulq_s16(*&(((int16x8_t*)z10)[i]),*(int16x8_t*)&conjugate2[0]); + *&(((int16x8_t*)z11)[i])=vmulq_s16(*&(((int16x8_t*)z11)[i]),*(int16x8_t*)&conjugate2[0]); + } + +#endif } +#if defined(__x86_64__) || defined(__i386__) + _mm_empty(); + _m_empty(); +#endif + } #endif @@ -429,10 +476,15 @@ int32_t ulsch_qpsk_llr(LTE_DL_FRAME_PARMS *frame_parms, uint16_t nb_rb, int16_t **llrp) { - +#if defined(__x86_64__) || defined(__i386__) __m128i *rxF=(__m128i*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; - int32_t i; __m128i **llrp128 = (__m128i **)llrp; +#elif defined(__arm__) + int16x8_t *rxF= (int16x8_t*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; + int16x8_t **llrp128 = (int16x8_t **)llrp; +#endif + + int i; // printf("qpsk llr for symbol %d (pos %d), llr offset %d\n",symbol,(symbol*frame_parms->N_RB_DL*12),llr128U-(__m128i*)ulsch_llr); @@ -443,8 +495,10 @@ int32_t ulsch_qpsk_llr(LTE_DL_FRAME_PARMS *frame_parms, (*llrp128)++; } +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); +#endif return(0); @@ -458,41 +512,64 @@ void ulsch_16qam_llr(LTE_DL_FRAME_PARMS *frame_parms, uint16_t nb_rb, int16_t **llrp) { +int i; +#if defined(__x86_64__) || defined(__i386__) __m128i *rxF=(__m128i*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; __m128i *ch_mag; __m128i mmtmpU0; __m128i **llrp128=(__m128i **)llrp; - - int32_t i; - // uint8_t symbol_mod; - - // printf("ulsch_rx.c: ulsch_16qam_llr: symbol %d\n",symbol); - - // symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? 
symbol-(7-frame_parms->Ncp) : symbol;
-
   ch_mag =(__m128i*)&ul_ch_mag[0][(symbol*frame_parms->N_RB_DL*12)];
-
+#elif defined(__arm__)
+  int16x8_t *rxF=(int16x8_t*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)];
+  int16x8_t *ch_mag;
+  int16x8_t xmm0;
+  int16_t **llrp16=(int16_t **)llrp;
+  ch_mag =(int16x8_t*)&ul_ch_mag[0][(symbol*frame_parms->N_RB_DL*12)];
+#endif

   for (i=0; i<(nb_rb*3); i++) {
-
+#if defined(__x86_64__) || defined(__i386__)
     mmtmpU0 = _mm_abs_epi16(rxF[i]); // print_shorts("tmp0",&tmp0);
     mmtmpU0 = _mm_subs_epi16(ch_mag[i],mmtmpU0);
-
     (*llrp128)[0] = _mm_unpacklo_epi32(rxF[i],mmtmpU0);
     (*llrp128)[1] = _mm_unpackhi_epi32(rxF[i],mmtmpU0);
     (*llrp128)+=2;
+#elif defined(__arm__)
+    xmm0 = vabsq_s16(rxF[i]);
+    xmm0 = vqsubq_s16(ch_mag[i],xmm0);
+    (*llrp16)[0]  = vgetq_lane_s16(rxF[i],0);
+    (*llrp16)[1]  = vgetq_lane_s16(rxF[i],1);
+    (*llrp16)[2]  = vgetq_lane_s16(xmm0,0);
+    (*llrp16)[3]  = vgetq_lane_s16(xmm0,1);
+    (*llrp16)[4]  = vgetq_lane_s16(rxF[i],2);
+    (*llrp16)[5]  = vgetq_lane_s16(rxF[i],3);
+    (*llrp16)[6]  = vgetq_lane_s16(xmm0,2);
+    (*llrp16)[7]  = vgetq_lane_s16(xmm0,3);
+    (*llrp16)[8]  = vgetq_lane_s16(rxF[i],4);
+    (*llrp16)[9]  = vgetq_lane_s16(rxF[i],5);
+    (*llrp16)[10] = vgetq_lane_s16(xmm0,4);
+    (*llrp16)[11] = vgetq_lane_s16(xmm0,5);
+    (*llrp16)[12] = vgetq_lane_s16(rxF[i],6);
+    (*llrp16)[13] = vgetq_lane_s16(rxF[i],7);
+    (*llrp16)[14] = vgetq_lane_s16(xmm0,6);
+    (*llrp16)[15] = vgetq_lane_s16(xmm0,7);
+    (*llrp16)+=16;
+#endif
+
     // print_bytes("rxF[i]",&rxF[i]);
     // print_bytes("rxF[i+1]",&rxF[i+1]);
   }

+#if defined(__x86_64__) || defined(__i386__)
   _mm_empty();
   _m_empty();
+#endif

}

@@ -505,27 +582,29 @@ void ulsch_64qam_llr(LTE_DL_FRAME_PARMS *frame_parms,
                      uint16_t nb_rb,
                      int16_t **llrp)
{
+  int i;
+  int32_t **llrp32=(int32_t **)llrp;
+#if defined(__x86_64__) || defined(__i386__)
   __m128i *rxF=(__m128i*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)];
   __m128i *ch_mag,*ch_magb;
-  int32_t i;
   __m128i mmtmpU1,mmtmpU2;
-  int32_t **llrp32=(int32_t **)llrp;
-
-  // uint8_t symbol_mod;
-
-
-
-  // symbol_mod = (symbol>=(7-frame_parms->Ncp)) ?
symbol-(7-frame_parms->Ncp) : symbol; ch_mag =(__m128i*)&ul_ch_mag[0][(symbol*frame_parms->N_RB_DL*12)]; ch_magb =(__m128i*)&ul_ch_magb[0][(symbol*frame_parms->N_RB_DL*12)]; +#elif defined(__arm__) + int16x8_t *rxF=(int16x8_t*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; + int16x8_t *ch_mag,*ch_magb; + int16x8_t mmtmpU1,mmtmpU2; + ch_mag =(int16x8_t*)&ul_ch_mag[0][(symbol*frame_parms->N_RB_DL*12)]; + ch_magb =(int16x8_t*)&ul_ch_magb[0][(symbol*frame_parms->N_RB_DL*12)]; +#endif // printf("symbol %d: mag %d, magb %d\n",symbol,_mm_extract_epi16(ch_mag[0],0),_mm_extract_epi16(ch_magb[0],0)); for (i=0; i<(nb_rb*3); i++) { - +#if defined(__x86_64__) || defined(__i386__) mmtmpU1 = _mm_abs_epi16(rxF[i]); mmtmpU1 = _mm_subs_epi16(ch_mag[i],mmtmpU1); @@ -545,12 +624,34 @@ void ulsch_64qam_llr(LTE_DL_FRAME_PARMS *frame_parms, (*llrp32)[9] = _mm_extract_epi32(rxF[i],3); (*llrp32)[10] = _mm_extract_epi32(mmtmpU1,3); (*llrp32)[11] = _mm_extract_epi32(mmtmpU2,3); +#elif defined(__arm__) + mmtmpU1 = vabsq_s16(rxF[i]); + + mmtmpU1 = vqsubq_s16(ch_mag[i],mmtmpU1); + + mmtmpU2 = vabsq_s16(mmtmpU1); + mmtmpU2 = vqsubq_s16(ch_magb[i],mmtmpU2); + + (*llrp32)[0] = vgetq_lane_s32((int32x4_t)rxF[i],0); + (*llrp32)[1] = vgetq_lane_s32((int32x4_t)mmtmpU1,0); + (*llrp32)[2] = vgetq_lane_s32((int32x4_t)mmtmpU2,0); + (*llrp32)[3] = vgetq_lane_s32((int32x4_t)rxF[i],1); + (*llrp32)[4] = vgetq_lane_s32((int32x4_t)mmtmpU1,1); + (*llrp32)[5] = vgetq_lane_s32((int32x4_t)mmtmpU2,1); + (*llrp32)[6] = vgetq_lane_s32((int32x4_t)rxF[i],2); + (*llrp32)[7] = vgetq_lane_s32((int32x4_t)mmtmpU1,2); + (*llrp32)[8] = vgetq_lane_s32((int32x4_t)mmtmpU2,2); + (*llrp32)[9] = vgetq_lane_s32((int32x4_t)rxF[i],3); + (*llrp32)[10] = vgetq_lane_s32((int32x4_t)mmtmpU1,3); + (*llrp32)[11] = vgetq_lane_s32((int32x4_t)mmtmpU2,3); + +#endif (*llrp32)+=12; } - +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); - +#endif } void ulsch_detection_mrc(LTE_DL_FRAME_PARMS *frame_parms, @@ -562,13 +663,20 @@ void ulsch_detection_mrc(LTE_DL_FRAME_PARMS *frame_parms, { +#if defined(__x86_64__) || defined(__i386__) __m128i *rxdataF_comp128_0,*ul_ch_mag128_0,*ul_ch_mag128_0b; __m128i *rxdataF_comp128_1,*ul_ch_mag128_1,*ul_ch_mag128_1b; +#elif defined(__arm__) + int16x8_t *rxdataF_comp128_0,*ul_ch_mag128_0,*ul_ch_mag128_0b; + int16x8_t *rxdataF_comp128_1,*ul_ch_mag128_1,*ul_ch_mag128_1b; + +#endif int32_t i; if (frame_parms->nb_antennas_rx>1) { +#if defined(__x86_64__) || defined(__i386__) rxdataF_comp128_0 = (__m128i *)&rxdataF_comp[0][symbol*frame_parms->N_RB_DL*12]; rxdataF_comp128_1 = (__m128i *)&rxdataF_comp[1][symbol*frame_parms->N_RB_DL*12]; ul_ch_mag128_0 = (__m128i *)&ul_ch_mag[0][symbol*frame_parms->N_RB_DL*12]; @@ -582,15 +690,31 @@ void ulsch_detection_mrc(LTE_DL_FRAME_PARMS *frame_parms, ul_ch_mag128_0[i] = _mm_adds_epi16(_mm_srai_epi16(ul_ch_mag128_0[i],1),_mm_srai_epi16(ul_ch_mag128_1[i],1)); ul_ch_mag128_0b[i] = _mm_adds_epi16(_mm_srai_epi16(ul_ch_mag128_0b[i],1),_mm_srai_epi16(ul_ch_mag128_1b[i],1)); rxdataF_comp128_0[i] = _mm_add_epi16(rxdataF_comp128_0[i],(*(__m128i*)&jitterc[0])); - } - // remove any bias (DC component after IDFT) - // ((uint32_t*)rxdataF_comp128_0)[0]=0; +#elif defined(__arm__) + rxdataF_comp128_0 = (int16x8_t *)&rxdataF_comp[0][symbol*frame_parms->N_RB_DL*12]; + rxdataF_comp128_1 = (int16x8_t *)&rxdataF_comp[1][symbol*frame_parms->N_RB_DL*12]; + ul_ch_mag128_0 = (int16x8_t *)&ul_ch_mag[0][symbol*frame_parms->N_RB_DL*12]; + ul_ch_mag128_1 = (int16x8_t *)&ul_ch_mag[1][symbol*frame_parms->N_RB_DL*12]; 
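+    // the second magnitude stream (ul_ch_magb), used for the outer 64QAM LLR
+    // thresholds, is combined below with the same halving adds as the MF output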
+    ul_ch_mag128_0b = (int16x8_t *)&ul_ch_magb[0][symbol*frame_parms->N_RB_DL*12];
+    ul_ch_mag128_1b = (int16x8_t *)&ul_ch_magb[1][symbol*frame_parms->N_RB_DL*12];
+
+    // MRC on each re of rb, both on MF output and magnitude (for 16QAM/64QAM llr computation)
+    for (i=0; i<nb_rb*3; i++) {
+      rxdataF_comp128_0[i] = vhaddq_s16(rxdataF_comp128_0[i],rxdataF_comp128_1[i]);
+      ul_ch_mag128_0[i]    = vhaddq_s16(ul_ch_mag128_0[i],ul_ch_mag128_1[i]);
+      ul_ch_mag128_0b[i]   = vhaddq_s16(ul_ch_mag128_0b[i],ul_ch_mag128_1b[i]);
+      rxdataF_comp128_0[i] = vqaddq_s16(rxdataF_comp128_0[i],(*(int16x8_t*)&jitterc[0]));
+
+#endif
+    }
  }

+#if defined(__x86_64__) || defined(__i386__)
  _mm_empty();
  _m_empty();
-
+#endif
 }

 void ulsch_extract_rbs_single(int32_t **rxdataF,
@@ -647,9 +771,6 @@ void ulsch_extract_rbs_single(int32_t **rxdataF,
     }
   }

-  _mm_empty();
-  _m_empty();
-
 }

 void ulsch_correct_ext(int32_t **rxdataF_ext,
@@ -687,42 +808,81 @@ void ulsch_channel_compensation(int32_t **rxdataF_ext,
{
  uint16_t rb;
+
+#if defined(__x86_64__) || defined(__i386__)
+
  __m128i *ul_ch128,*ul_ch_mag128,*ul_ch_mag128b,*rxdataF128,*rxdataF_comp128;
  uint8_t aarx;//,symbol_mod;
  __m128i mmtmpU0,mmtmpU1,mmtmpU2,mmtmpU3;
#ifdef OFDMA_ULSCH
  __m128i QAM_amp128U,QAM_amp128bU;
#endif
-  // symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? symbol-(7-frame_parms->Ncp) : symbol;
-  // printf("comp: symbol %d\n",symbol);
+#elif defined(__arm__)
+
+  int16x4_t *ul_ch128,*rxdataF128;
+  int16x8_t *ul_ch_mag128,*ul_ch_mag128b,*rxdataF_comp128;
+
+  uint8_t aarx;//,symbol_mod;
+  int32x4_t mmtmpU0,mmtmpU1,mmtmpU0b,mmtmpU1b;
+  int32x4x2_t xtmp;
+#ifdef OFDMA_ULSCH
+  int16x8_t mmtmpU2,mmtmpU3,mmtmpU4;
+  int16x8_t QAM_amp128U,QAM_amp128bU;
+#endif
+  int16_t conj[4]__attribute__((aligned(16))) = {1,-1,1,-1};
+  int32x4_t output_shift128 = vmovq_n_s32(-(int32_t)output_shift);
+
+
-#ifdef ULSCH_OFDMA
+#endif
+
+#ifdef OFDMA_ULSCH
+#if defined(__x86_64__) || defined(__i386__)
  if (Qm == 4)
    QAM_amp128U = _mm_set1_epi16(QAM16_n1);
  else if (Qm == 6) {
    QAM_amp128U = _mm_set1_epi16(QAM64_n1);
    QAM_amp128bU = _mm_set1_epi16(QAM64_n2);
  }
+#elif defined(__arm__)
+  if (Qm == 4)
+    QAM_amp128U = vdupq_n_s16(QAM16_n1);
+  else if (Qm == 6) {
+    QAM_amp128U = vdupq_n_s16(QAM64_n1);
+    QAM_amp128bU = vdupq_n_s16(QAM64_n2);
+  }
+#endif
#endif

  for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) {

+#if defined(__x86_64__) || defined(__i386__)
+
    ul_ch128 = (__m128i *)&ul_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*12];
    ul_ch_mag128 = (__m128i *)&ul_ch_mag[aarx][symbol*frame_parms->N_RB_DL*12];
    ul_ch_mag128b = (__m128i *)&ul_ch_magb[aarx][symbol*frame_parms->N_RB_DL*12];
    rxdataF128 = (__m128i *)&rxdataF_ext[aarx][symbol*frame_parms->N_RB_DL*12];
    rxdataF_comp128 = (__m128i *)&rxdataF_comp[aarx][symbol*frame_parms->N_RB_DL*12];
+#elif defined(__arm__)
+
+    ul_ch128 = (int16x4_t *)&ul_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*12];
+    ul_ch_mag128 = (int16x8_t *)&ul_ch_mag[aarx][symbol*frame_parms->N_RB_DL*12];
+    ul_ch_mag128b = (int16x8_t *)&ul_ch_magb[aarx][symbol*frame_parms->N_RB_DL*12];
+    rxdataF128 = (int16x4_t *)&rxdataF_ext[aarx][symbol*frame_parms->N_RB_DL*12];
+    rxdataF_comp128 = (int16x8_t *)&rxdataF_comp[aarx][symbol*frame_parms->N_RB_DL*12];
+
+#endif

    for (rb=0; rb<nb_rb; rb++) {
      // printf("comp: symbol %d rb %d\n",symbol,rb);
#ifdef OFDMA_ULSCH
      if (Qm>2) {
        // get channel amplitude if not QPSK
+#if defined(__x86_64__) || defined(__i386__)
        mmtmpU0 = _mm_madd_epi16(ul_ch128[0],ul_ch128[0]);
        mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift);
@@ -761,10 +921,36 @@ void ulsch_channel_compensation(int32_t **rxdataF_ext,
        ul_ch_mag128b[2] = _mm_mulhi_epi16(ul_ch_mag128b[2],QAM_amp128bU);
        ul_ch_mag128b[2] = _mm_slli_epi16(ul_ch_mag128b[2],2);// 2 to compensate the scale channel estimate
+
+#elif defined(__arm__)
+        mmtmpU0 = vmull_s16(ul_ch128[0], ul_ch128[0]);
+        mmtmpU0 = vqshlq_s32(vqaddq_s32(mmtmpU0,vrev64q_s32(mmtmpU0)),output_shift128);
+        mmtmpU1 = vmull_s16(ul_ch128[1], ul_ch128[1]);
+        mmtmpU1 = vqshlq_s32(vqaddq_s32(mmtmpU1,vrev64q_s32(mmtmpU1)),output_shift128);
+        mmtmpU2 = vcombine_s16(vmovn_s32(mmtmpU0),vmovn_s32(mmtmpU1));
+        mmtmpU0 = vmull_s16(ul_ch128[2], ul_ch128[2]);
+        mmtmpU0 = vqshlq_s32(vqaddq_s32(mmtmpU0,vrev64q_s32(mmtmpU0)),output_shift128);
+        mmtmpU1 = vmull_s16(ul_ch128[3], ul_ch128[3]);
+        mmtmpU1 = vqshlq_s32(vqaddq_s32(mmtmpU1,vrev64q_s32(mmtmpU1)),output_shift128);
+        mmtmpU3 = vcombine_s16(vmovn_s32(mmtmpU0),vmovn_s32(mmtmpU1));
+        mmtmpU0 = vmull_s16(ul_ch128[4], ul_ch128[4]);
+        mmtmpU0 = vqshlq_s32(vqaddq_s32(mmtmpU0,vrev64q_s32(mmtmpU0)),output_shift128);
+        mmtmpU1 = vmull_s16(ul_ch128[5], ul_ch128[5]);
+        mmtmpU1 = vqshlq_s32(vqaddq_s32(mmtmpU1,vrev64q_s32(mmtmpU1)),output_shift128);
+        mmtmpU4 = vcombine_s16(vmovn_s32(mmtmpU0),vmovn_s32(mmtmpU1));
+
+        ul_ch_mag128b[0] = vqdmulhq_s16(mmtmpU2,QAM_amp128bU);
+        ul_ch_mag128b[1] = vqdmulhq_s16(mmtmpU3,QAM_amp128bU);
+        ul_ch_mag128[0]  = vqdmulhq_s16(mmtmpU2,QAM_amp128U);
+        ul_ch_mag128[1]  = vqdmulhq_s16(mmtmpU3,QAM_amp128U);
+        ul_ch_mag128b[2] = vqdmulhq_s16(mmtmpU4,QAM_amp128bU);
+        ul_ch_mag128[2]  = vqdmulhq_s16(mmtmpU4,QAM_amp128U);
+#endif
      }

-#else
+#else // SC-FDMA
+// just compute the channel magnitude without scaling; for SC-FDMA the scaling is applied after equalization
+#if defined(__x86_64__) || defined(__i386__)
      mmtmpU0 = _mm_madd_epi16(ul_ch128[0],ul_ch128[0]);
      mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift);
@@ -784,8 +970,29 @@ void ulsch_channel_compensation(int32_t **rxdataF_ext,
      ul_ch_mag128[2] = _mm_unpacklo_epi16(mmtmpU1,mmtmpU1);
      // printf("comp: symbol %d rb %d => %d,%d,%d (output_shift %d)\n",symbol,rb,*((int16_t*)&ul_ch_mag128[0]),*((int16_t*)&ul_ch_mag128[1]),*((int16_t*)&ul_ch_mag128[2]),output_shift);
+
+
+#elif defined(__arm__)
+      mmtmpU0 = vmull_s16(ul_ch128[0], ul_ch128[0]);
+      mmtmpU0 = vqshlq_s32(vqaddq_s32(mmtmpU0,vrev64q_s32(mmtmpU0)),output_shift128);
+      mmtmpU1 = vmull_s16(ul_ch128[1], ul_ch128[1]);
+      mmtmpU1 = vqshlq_s32(vqaddq_s32(mmtmpU1,vrev64q_s32(mmtmpU1)),output_shift128);
+      ul_ch_mag128[0] = vcombine_s16(vmovn_s32(mmtmpU0),vmovn_s32(mmtmpU1));
+      mmtmpU0 = vmull_s16(ul_ch128[2], ul_ch128[2]);
+      mmtmpU0 = vqshlq_s32(vqaddq_s32(mmtmpU0,vrev64q_s32(mmtmpU0)),output_shift128);
+      mmtmpU1 = vmull_s16(ul_ch128[3], ul_ch128[3]);
+      mmtmpU1 = vqshlq_s32(vqaddq_s32(mmtmpU1,vrev64q_s32(mmtmpU1)),output_shift128);
+      ul_ch_mag128[1] = vcombine_s16(vmovn_s32(mmtmpU0),vmovn_s32(mmtmpU1));
+      mmtmpU0 = vmull_s16(ul_ch128[4], ul_ch128[4]);
+      mmtmpU0 = vqshlq_s32(vqaddq_s32(mmtmpU0,vrev64q_s32(mmtmpU0)),output_shift128);
+      mmtmpU1 = vmull_s16(ul_ch128[5], ul_ch128[5]);
+      mmtmpU1 = vqshlq_s32(vqaddq_s32(mmtmpU1,vrev64q_s32(mmtmpU1)),output_shift128);
+      ul_ch_mag128[2] = vcombine_s16(vmovn_s32(mmtmpU0),vmovn_s32(mmtmpU1));
+
+#endif
#endif

+#if defined(__x86_64__) || defined(__i386__)
      // multiply by conjugated channel
      mmtmpU0 = _mm_madd_epi16(ul_ch128[0],rxdataF128[0]);
      // print_ints("re",&mmtmpU0);
@@ -857,21 +1064,81 @@ void ulsch_channel_compensation(int32_t **rxdataF_ext,
      ul_ch_mag128b+=3;
      rxdataF128+=3;
      rxdataF_comp128+=3;
-
+#elif defined(__arm__)
+      mmtmpU0 = vmull_s16(ul_ch128[0], rxdataF128[0]);
+      //mmtmpU0 = [Re(ch[0])Re(rx[0]) Im(ch[0])Im(rx[0]) Re(ch[1])Re(rx[1]) Im(ch[1])Im(rx[1])]
+      mmtmpU1 = vmull_s16(ul_ch128[1], rxdataF128[1]);
+      //mmtmpU1 = [Re(ch[2])Re(rx[2]) Im(ch[2])Im(rx[2]) Re(ch[3])Re(rx[3]) Im(ch[3])Im(rx[3])]
+      mmtmpU0 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpU0),vget_high_s32(mmtmpU0)),
+                             vpadd_s32(vget_low_s32(mmtmpU1),vget_high_s32(mmtmpU1)));
+      //mmtmpU0 = [Re(ch[0])Re(rx[0])+Im(ch[0])Im(rx[0]) ... Re(ch[3])Re(rx[3])+Im(ch[3])Im(rx[3])]
+
+      mmtmpU0b = vmull_s16(vrev32_s16(vmul_s16(ul_ch128[0],*(int16x4_t*)conj)), rxdataF128[0]);
+      //mmtmpU0b = [-Im(ch[0])Re(rx[0]) Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1]) Re(ch[1])Im(rx[1])]
+      mmtmpU1b = vmull_s16(vrev32_s16(vmul_s16(ul_ch128[1],*(int16x4_t*)conj)), rxdataF128[1]);
+      //mmtmpU1b = [-Im(ch[2])Re(rx[2]) Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3]) Re(ch[3])Im(rx[3])]
+      mmtmpU1 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpU0b),vget_high_s32(mmtmpU0b)),
+                             vpadd_s32(vget_low_s32(mmtmpU1b),vget_high_s32(mmtmpU1b)));
+      //mmtmpU1 = [-Im(ch[0])Re(rx[0])+Re(ch[0])Im(rx[0]) ... -Im(ch[3])Re(rx[3])+Re(ch[3])Im(rx[3])]
+
+      mmtmpU0 = vqshlq_s32(mmtmpU0,output_shift128);
+      mmtmpU1 = vqshlq_s32(mmtmpU1,output_shift128);
+      xtmp = vzipq_s32(mmtmpU0,mmtmpU1);
+      rxdataF_comp128[0] = vcombine_s16(vmovn_s32(xtmp.val[0]),vmovn_s32(xtmp.val[1]));
+
+      mmtmpU0 = vmull_s16(ul_ch128[2], rxdataF128[2]);
+      mmtmpU1 = vmull_s16(ul_ch128[3], rxdataF128[3]);
+      mmtmpU0 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpU0),vget_high_s32(mmtmpU0)),
+                             vpadd_s32(vget_low_s32(mmtmpU1),vget_high_s32(mmtmpU1)));
+      mmtmpU0b = vmull_s16(vrev32_s16(vmul_s16(ul_ch128[2],*(int16x4_t*)conj)), rxdataF128[2]);
+      mmtmpU1b = vmull_s16(vrev32_s16(vmul_s16(ul_ch128[3],*(int16x4_t*)conj)), rxdataF128[3]);
+      mmtmpU1 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpU0b),vget_high_s32(mmtmpU0b)),
+                             vpadd_s32(vget_low_s32(mmtmpU1b),vget_high_s32(mmtmpU1b)));
+      mmtmpU0 = vqshlq_s32(mmtmpU0,output_shift128);
+      mmtmpU1 = vqshlq_s32(mmtmpU1,output_shift128);
+      xtmp = vzipq_s32(mmtmpU0,mmtmpU1);
+      rxdataF_comp128[1] = vcombine_s16(vmovn_s32(xtmp.val[0]),vmovn_s32(xtmp.val[1]));
+
+      mmtmpU0 = vmull_s16(ul_ch128[4], rxdataF128[4]);
+      mmtmpU1 = vmull_s16(ul_ch128[5], rxdataF128[5]);
+      mmtmpU0 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpU0),vget_high_s32(mmtmpU0)),
+                             vpadd_s32(vget_low_s32(mmtmpU1),vget_high_s32(mmtmpU1)));
+
+      mmtmpU0b = vmull_s16(vrev32_s16(vmul_s16(ul_ch128[4],*(int16x4_t*)conj)), rxdataF128[4]);
+      mmtmpU1b = vmull_s16(vrev32_s16(vmul_s16(ul_ch128[5],*(int16x4_t*)conj)), rxdataF128[5]);
+      mmtmpU1 = vcombine_s32(vpadd_s32(vget_low_s32(mmtmpU0b),vget_high_s32(mmtmpU0b)),
+                             vpadd_s32(vget_low_s32(mmtmpU1b),vget_high_s32(mmtmpU1b)));
+
+      mmtmpU0 = vqshlq_s32(mmtmpU0,output_shift128);
+      mmtmpU1 = vqshlq_s32(mmtmpU1,output_shift128);
+      xtmp = vzipq_s32(mmtmpU0,mmtmpU1);
+      rxdataF_comp128[2] = vcombine_s16(vmovn_s32(xtmp.val[0]),vmovn_s32(xtmp.val[1]));
+
+      // Add a jitter to compensate for the saturation in "packs" resulting in a bias on the DC after IDFT
+      rxdataF_comp128[0] = vqaddq_s16(rxdataF_comp128[0],(*(int16x8_t*)&jitter[0]));
+      rxdataF_comp128[1] = vqaddq_s16(rxdataF_comp128[1],(*(int16x8_t*)&jitter[0]));
+      rxdataF_comp128[2] = vqaddq_s16(rxdataF_comp128[2],(*(int16x8_t*)&jitter[0]));
+
+
+      ul_ch128+=6;
+      ul_ch_mag128+=3;
+      ul_ch_mag128b+=3;
+      rxdataF128+=6;
+      rxdataF_comp128+=3;
+
+#endif
    }
  }

+#if defined(__x86_64__) || defined(__i386__)
  _mm_empty();
  _m_empty();
-
+#endif
}

-
+#if defined(__x86_64__) || defined(__i386__)
__m128i
QAM_amp128U_0,QAM_amp128bU_0,QAM_amp128U_1,QAM_amp128bU_1; +#endif void ulsch_channel_compensation_alamouti(int32_t **rxdataF_ext, // For Distributed Alamouti Combining int32_t **ul_ch_estimates_ext_0, @@ -888,7 +1155,7 @@ void ulsch_channel_compensation_alamouti(int32_t **rxdataF_ext, uint16_t nb_rb, uint8_t output_shift) { - +#if defined(__x86_64__) || defined(__i386__) uint16_t rb; __m128i *ul_ch128_0,*ul_ch128_1,*ul_ch_mag128_0,*ul_ch_mag128_1,*ul_ch_mag128b_0,*ul_ch_mag128b_1,*rxdataF128,*rxdataF_comp128_0,*rxdataF_comp128_1; uint8_t aarx;//,symbol_mod; @@ -1156,7 +1423,7 @@ void ulsch_channel_compensation_alamouti(int32_t **rxdataF_ext, _mm_empty(); _m_empty(); - +#endif } @@ -1176,6 +1443,7 @@ void ulsch_alamouti(LTE_DL_FRAME_PARMS *frame_parms,// For Distributed Alamouti uint16_t nb_rb) { +#if defined(__x86_64__) || defined(__i386__) int16_t *rxF,*rxF0,*rxF1; __m128i *ch_mag,*ch_magb,*ch_mag0,*ch_mag1,*ch_mag0b,*ch_mag1b; uint8_t rb,re,aarx; @@ -1231,13 +1499,18 @@ void ulsch_alamouti(LTE_DL_FRAME_PARMS *frame_parms,// For Distributed Alamouti _mm_empty(); _m_empty(); +#endif } +#if defined(__x86_64__) || defined(__i386__) __m128i avg128U; +#elif defined(__arm__) +int32x4_t avg128U; +#endif void ulsch_channel_level(int32_t **drs_ch_estimates_ext, LTE_DL_FRAME_PARMS *frame_parms, @@ -1247,11 +1520,14 @@ void ulsch_channel_level(int32_t **drs_ch_estimates_ext, int16_t rb; uint8_t aarx; +#if defined(__x86_64__) || defined(__i386__) __m128i *ul_ch128; - - +#elif defined(__arm__) + int16x4_t *ul_ch128; +#endif for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) { //clear average level +#if defined(__x86_64__) || defined(__i386__) avg128U = _mm_setzero_si128(); ul_ch128=(__m128i *)drs_ch_estimates_ext[aarx]; @@ -1263,34 +1539,44 @@ void ulsch_channel_level(int32_t **drs_ch_estimates_ext, ul_ch128+=3; - if (rb==0) { - // print_shorts("ul_ch128",&ul_ch128[0]); - // print_shorts("ul_ch128",&ul_ch128[1]); - // print_shorts("ul_ch128",&ul_ch128[2]); - } } +#elif defined(__arm__) + avg128U = vdupq_n_s32(0); + ul_ch128=(int16x4_t *)drs_ch_estimates_ext[aarx]; + + for (rb=0; rb<nb_rb; rb++) { + + avg128U = vqaddq_s32(avg128U,vmull_s16(ul_ch128[0],ul_ch128[0])); + avg128U = vqaddq_s32(avg128U,vmull_s16(ul_ch128[1],ul_ch128[1])); + avg128U = vqaddq_s32(avg128U,vmull_s16(ul_ch128[2],ul_ch128[2])); + avg128U = vqaddq_s32(avg128U,vmull_s16(ul_ch128[3],ul_ch128[3])); + avg128U = vqaddq_s32(avg128U,vmull_s16(ul_ch128[4],ul_ch128[4])); + avg128U = vqaddq_s32(avg128U,vmull_s16(ul_ch128[5],ul_ch128[5])); + ul_ch128+=6; + + + } + +#endif + DevAssert( nb_rb ); avg[aarx] = (((int*)&avg128U)[0] + ((int*)&avg128U)[1] + ((int*)&avg128U)[2] + ((int*)&avg128U)[3])/(nb_rb*12); - // printf("Channel level : %d\n",avg[aarx]); } +#if defined(__x86_64__) || defined(__i386__) _mm_empty(); _m_empty(); - +#endif } int32_t avgU[2]; int32_t avgU_0[2],avgU_1[2]; // For the Distributed Alamouti Scheme -/* --> moved to LTE_eNB_PUSCH structure -int32_t ulsch_power[2]; -int32_t ulsch_power_0[2],ulsch_power_1[2];// For the distributed Alamouti Scheme -*/ void rx_ulsch(PHY_VARS_eNB *phy_vars_eNB, uint32_t sched_subframe, diff --git a/openair1/PHY/LTE_TRANSPORT/ulsch_modulation.c b/openair1/PHY/LTE_TRANSPORT/ulsch_modulation.c index 6ecca59ae4..46a49fb793 100644 --- a/openair1/PHY/LTE_TRANSPORT/ulsch_modulation.c +++ b/openair1/PHY/LTE_TRANSPORT/ulsch_modulation.c @@ -49,12 +49,15 @@ //#define DEBUG_ULSCH_MODULATION -__m128i dft_in128[4][1200],dft_in128[4][1200],dft_out128[4][1200],dft_out128[4][1200]; - #ifndef OFDMA_ULSCH void 
dft_lte(mod_sym_t *z,mod_sym_t *d, int32_t Msc_PUSCH, uint8_t Nsymb) { +#if defined(__x86_64__) || defined(__i386__) + __m128i dft_in128[4][1200],dft_out128[4][1200]; +#elif defined(__arm__) + int16x8_t dft_in128[4][1200],dft_out128[4][1200]; +#endif uint32_t *dft_in0=(uint32_t*)dft_in128[0],*dft_out0=(uint32_t*)dft_out128[0]; uint32_t *dft_in1=(uint32_t*)dft_in128[1],*dft_out1=(uint32_t*)dft_out128[1]; uint32_t *dft_in2=(uint32_t*)dft_in128[2],*dft_out2=(uint32_t*)dft_out128[2]; @@ -64,8 +67,11 @@ void dft_lte(mod_sym_t *z,mod_sym_t *d, int32_t Msc_PUSCH, uint8_t Nsymb) uint32_t *z0,*z1,*z2,*z3,*z4,*z5,*z6,*z7,*z8,*z9,*z10,*z11; uint32_t i,ip; +#if defined(__x86_64__) || defined(__i386__) __m128i norm128; - +#elif defined(__arm__) + int16x8_t norm128; +#endif // msg("Doing lte_dft for Msc_PUSCH %d\n",Msc_PUSCH); d0 = (uint32_t *)d; @@ -119,12 +125,21 @@ void dft_lte(mod_sym_t *z,mod_sym_t *d, int32_t Msc_PUSCH, uint8_t Nsymb) dft12f(&((__m128i *)dft_in2)[0],&((__m128i *)dft_in2)[1],&((__m128i *)dft_in2)[2],&((__m128i *)dft_in2)[3],&((__m128i *)dft_in2)[4],&((__m128i *)dft_in2)[5],&((__m128i *)dft_in2)[6],&((__m128i *)dft_in2)[7],&((__m128i *)dft_in2)[8],&((__m128i *)dft_in2)[9],&((__m128i *)dft_in2)[10],&((__m128i *)dft_in2)[11], &((__m128i *)dft_out2)[0],&((__m128i *)dft_out2)[1],&((__m128i *)dft_out2)[2],&((__m128i *)dft_out2)[3],&((__m128i *)dft_out2)[4],&((__m128i *)dft_out2)[5],&((__m128i *)dft_out2)[6],&((__m128i *)dft_out2)[7],&((__m128i *)dft_out2)[8],&((__m128i *)dft_out2)[9],&((__m128i *)dft_out2)[10],&((__m128i *)dft_out2)[11]); */ +#if defined(__x86_64__) || defined(__i386__) norm128 = _mm_set1_epi16(9459); - +#elif defined(__arm__) + norm128 = vdupq_n_s16(9459); +#endif for (i=0; i<12; i++) { +#if defined(__x86_64__) || defined(__i386__) ((__m128i*)dft_out0)[i] = _mm_slli_epi16(_mm_mulhi_epi16(((__m128i*)dft_out0)[i],norm128),1); ((__m128i*)dft_out1)[i] = _mm_slli_epi16(_mm_mulhi_epi16(((__m128i*)dft_out1)[i],norm128),1); ((__m128i*)dft_out2)[i] = _mm_slli_epi16(_mm_mulhi_epi16(((__m128i*)dft_out2)[i],norm128),1); +#elif defined(__arm__) + ((int16x8_t*)dft_out0)[i] = vqdmulhq_s16(((int16x8_t*)dft_out0)[i],norm128); + ((int16x8_t*)dft_out1)[i] = vqdmulhq_s16(((int16x8_t*)dft_out1)[i],norm128); + ((int16x8_t*)dft_out2)[i] = vqdmulhq_s16(((int16x8_t*)dft_out2)[i],norm128); +#endif } break; diff --git a/openair1/PHY/MODULATION/slot_fep.c b/openair1/PHY/MODULATION/slot_fep.c index da313c9b32..c591c7b528 100644 --- a/openair1/PHY/MODULATION/slot_fep.c +++ b/openair1/PHY/MODULATION/slot_fep.c @@ -35,12 +35,19 @@ void rescale(int16_t *input,int length) { - +#if defined(__x86_64__) || defined(__i386__) __m128i *input128 = (__m128i *)input; +#elif defined(__arm__) + int16x8_t *input128 = (int16x8_t *)input; +#endif int i; for (i=0; i<length>>2; i++) { +#if defined(__x86_64__) || defined(__i386__) input128[i] = _mm_srai_epi16(input128[i],4); +#elif defined(__arm__) + input128[i] = vshrq_n_s16(input128[i],4); +#endif } } diff --git a/openair1/PHY/MODULATION/ul_7_5_kHz.c b/openair1/PHY/MODULATION/ul_7_5_kHz.c index 34e62e0f98..0e7c1785fa 100755 --- a/openair1/PHY/MODULATION/ul_7_5_kHz.c +++ b/openair1/PHY/MODULATION/ul_7_5_kHz.c @@ -48,7 +48,13 @@ void apply_7_5_kHz(PHY_VARS_UE *phy_vars_ue,int32_t*txdata,uint8_t slot) uint16_t len; uint32_t *kHz7_5ptr; +#if defined(__x86_64__) || defined(__i386__) __m128i *txptr128,*kHz7_5ptr128,mmtmp_re,mmtmp_im,mmtmp_re2,mmtmp_im2; +#elif defined(__arm__) + int16x8_t *txptr128,*kHz7_5ptr128; + int32x4_t mmtmp_re,mmtmp_im; + int32x4_t mmtmp0,mmtmp1; 
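+  // NEON note: the 7.5 kHz offset is applied as a complex multiply split into two
+  // vmull_s16 passes, one producing the real parts and one (operating on a
+  // sign-flipped, element-swapped copy of the input) producing the imaginary parts;
+  // the per-lane layouts are spelled out in the comments inside the loop below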
+#endif uint32_t slot_offset; // uint8_t aa; uint32_t i; @@ -90,13 +96,17 @@ void apply_7_5_kHz(PHY_VARS_UE *phy_vars_ue,int32_t*txdata,uint8_t slot) // slot_offset += (len/4); len = phy_vars_ue->lte_frame_parms.samples_per_tti/2; - //for (aa=0;aa<phy_vars_ue->lte_frame_parms.nb_antennas_tx;aa++) { +#if defined(__x86_64__) || defined(__i386__) txptr128 = (__m128i *)&txdata[slot_offset]; kHz7_5ptr128 = (__m128i *)kHz7_5ptr; +#elif defined(__arm__) + txptr128 = (int16x8_t*)&txdata[slot_offset]; + kHz7_5ptr128 = (int16x8_t*)kHz7_5ptr; +#endif // apply 7.5 kHz - // if (((slot>>1)&1) == 0) { // apply the sinusoid from the table directly for (i=0; i<(len>>2); i++) { +#if defined(__x86_64__) || defined(__i386__) mmtmp_re = _mm_madd_epi16(*txptr128,*kHz7_5ptr128); // Real part of complex multiplication (note: 7_5kHz signal is conjugated for this to work) mmtmp_im = _mm_shufflelo_epi16(*kHz7_5ptr128,_MM_SHUFFLE(2,3,0,1)); @@ -107,39 +117,32 @@ void apply_7_5_kHz(PHY_VARS_UE *phy_vars_ue,int32_t*txdata,uint8_t slot) mmtmp_im = _mm_srai_epi32(mmtmp_im,15); mmtmp_re2 = _mm_unpacklo_epi32(mmtmp_re,mmtmp_im); mmtmp_im2 = _mm_unpackhi_epi32(mmtmp_re,mmtmp_im); - /* - printf("%d: (%d,%d) (%d,%d) (%d,%d) (%d,%d) x (%d,%d) (%d,%d) (%d,%d) (%d,%d) => ", - i, - ((short*)txptr128)[0], - ((short*)txptr128)[1], - ((short*)txptr128)[2], - ((short*)txptr128)[3], - ((short*)txptr128)[4], - ((short*)txptr128)[5], - ((short*)txptr128)[6], - ((short*)txptr128)[7], - ((short*)kHz7_5ptr128)[0], - ((short*)kHz7_5ptr128)[1], - ((short*)kHz7_5ptr128)[2], - ((short*)kHz7_5ptr128)[3], - ((short*)kHz7_5ptr128)[4], - ((short*)kHz7_5ptr128)[5], - ((short*)kHz7_5ptr128)[6], - ((short*)kHz7_5ptr128)[7]);*/ txptr128[0] = _mm_packs_epi32(mmtmp_re2,mmtmp_im2); - /* printf("%(%d,%d) (%d,%d) (%d,%d) (%d,%d)\n", - ((short*)txptr128)[0], - ((short*)txptr128)[1], - ((short*)txptr128)[2], - ((short*)txptr128)[3], - ((short*)txptr128)[4], - ((short*)txptr128)[5], - ((short*)txptr128)[6], - ((short*)txptr128)[7]);*/ - + txptr128++; + kHz7_5ptr128++; +#elif defined(__arm__) + + mmtmp0 = vmull_s16(((int16x4_t*)txptr128)[0],((int16x4_t*)kHz7_5ptr128)[0]); + //mmtmp0 = [Re(ch[0])Re(rx[0]) Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1]) Im(ch[1])Im(ch[1])] + mmtmp1 = vmull_s16(((int16x4_t*)txptr128)[1],((int16x4_t*)kHz7_5ptr128)[1]); + //mmtmp1 = [Re(ch[2])Re(rx[2]) Im(ch[2])Im(ch[2]) Re(ch[3])Re(rx[3]) Im(ch[3])Im(ch[3])] + mmtmp_re = vcombine_s32(vpadd_s32(vget_low_s32(mmtmp0),vget_high_s32(mmtmp0)), + vpadd_s32(vget_low_s32(mmtmp1),vget_high_s32(mmtmp1))); + //mmtmp_re = [Re(ch[0])Re(rx[0])+Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1])+Im(ch[1])Im(ch[1]) Re(ch[2])Re(rx[2])+Im(ch[2])Im(ch[2]) Re(ch[3])Re(rx[3])+Im(ch[3])Im(ch[3])] + + mmtmp0 = vmull_s16(vrev32_s16(vmul_s16(((int16x4_t*)txptr128)[0],*(int16x4_t*)conjugate75_2)),((int16x4_t*)kHz7_5ptr128)[0]); + //mmtmp0 = [-Im(ch[0])Re(rx[0]) Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1]) Re(ch[1])Im(rx[1])] + mmtmp1 = vmull_s16(vrev32_s16(vmul_s16(((int16x4_t*)txptr128)[1],*(int16x4_t*)conjugate75_2)), ((int16x4_t*)kHz7_5ptr128)[1]); + //mmtmp0 = [-Im(ch[2])Re(rx[2]) Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3]) Re(ch[3])Im(rx[3])] + mmtmp_im = vcombine_s32(vpadd_s32(vget_low_s32(mmtmp0),vget_high_s32(mmtmp0)), + vpadd_s32(vget_low_s32(mmtmp1),vget_high_s32(mmtmp1))); + //mmtmp_im = [-Im(ch[0])Re(rx[0])+Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1])+Re(ch[1])Im(rx[1]) -Im(ch[2])Re(rx[2])+Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3])+Re(ch[3])Im(rx[3])] + + txptr128[0] = vcombine_s16(vmovn_s32(mmtmp_re),vmovn_s32(mmtmp_im)); txptr128++; 
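+      // advance the 7.5 kHz reference table pointer in step with the tx pointer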
kHz7_5ptr128++; +#endif } //} @@ -154,7 +157,14 @@ void remove_7_5_kHz(PHY_VARS_eNB *phy_vars_eNB,uint8_t slot) int32_t **rxdata_7_5kHz=phy_vars_eNB->lte_eNB_common_vars.rxdata_7_5kHz[0]; uint16_t len; uint32_t *kHz7_5ptr; +#if defined(__x86_64__) || defined(__i386__) __m128i *rxptr128,*rxptr128_7_5kHz,*kHz7_5ptr128,kHz7_5_2,mmtmp_re,mmtmp_im,mmtmp_re2,mmtmp_im2; +#elif defined(__arm__) + int16x8_t *rxptr128,*kHz7_5ptr128,*rxptr128_7_5kHz; + int32x4_t mmtmp_re,mmtmp_im; + int32x4_t mmtmp0,mmtmp1; + +#endif uint32_t slot_offset,slot_offset2; uint8_t aa; uint32_t i; @@ -199,14 +209,21 @@ void remove_7_5_kHz(PHY_VARS_eNB *phy_vars_eNB,uint8_t slot) for (aa=0; aa<phy_vars_eNB->lte_frame_parms.nb_antennas_rx; aa++) { +#if defined(__x86_64__) || defined(__i386__) rxptr128 = (__m128i *)&rxdata[aa][slot_offset]; rxptr128_7_5kHz = (__m128i *)&rxdata_7_5kHz[aa][slot_offset2]; kHz7_5ptr128 = (__m128i *)kHz7_5ptr; - +#elif defined(__arm__) + rxptr128 = (int16x8_t *)&rxdata[aa][slot_offset]; + rxptr128_7_5kHz = (int16x8_t *)&rxdata_7_5kHz[aa][slot_offset2]; + kHz7_5ptr128 = (int16x8_t *)kHz7_5ptr; +#endif // apply 7.5 kHz // if (((slot>>1)&1) == 0) { // apply the sinusoid from the table directly for (i=0; i<(len>>2); i++) { + +#if defined(__x86_64__) || defined(__i386__) kHz7_5_2 = _mm_sign_epi16(*kHz7_5ptr128,*(__m128i*)&conjugate75_2[0]); mmtmp_re = _mm_madd_epi16(*rxptr128,kHz7_5_2); // Real part of complex multiplication (note: 7_5kHz signal is conjugated for this to work) @@ -223,350 +240,33 @@ void remove_7_5_kHz(PHY_VARS_eNB *phy_vars_eNB,uint8_t slot) rxptr128++; rxptr128_7_5kHz++; kHz7_5ptr128++; - } - } -} - - - -void apply_625_Hz(PHY_VARS_UE *phy_vars_ue,int16_t *prach) -{ - - uint32_t *Hz625ptr; - __m128i *txptr128,*Hz625ptr128,mmtmp_re,mmtmp_im,mmtmp_re2,mmtmp_im2; - uint8_t aa; - uint32_t Ncp,len; - uint32_t i; - LTE_DL_FRAME_PARMS *frame_parms=&phy_vars_ue->lte_frame_parms; - uint8_t frame_type = phy_vars_ue->lte_frame_parms.frame_type; - uint8_t prach_ConfigIndex = phy_vars_ue->lte_frame_parms.prach_config_common.prach_ConfigInfo.prach_ConfigIndex; - uint8_t prach_fmt = get_prach_fmt(prach_ConfigIndex,frame_type); - - switch (prach_fmt) { - case 0: - Ncp = 3168; - break; - - case 1: - case 3: - Ncp = 21024; - break; - - case 2: - Ncp = 6240; - break; - - case 4: - Ncp = 448; - break; - - default: - Ncp = 3168; - break; - } - - switch (frame_parms->N_RB_UL) { - - case 6: - Hz625ptr = (uint32_t*)sig625_1_25MHz; - len = 1536 + (Ncp>>4); - break; - - case 15: - Hz625ptr = (uint32_t*)sig625_2_5MHz; - len = 3072 + (Ncp>>3); - break; - - case 25: - Hz625ptr = (uint32_t*)sig625_5MHz; - len = 6144+(Ncp>>2); - break; - - case 50: - Hz625ptr = (uint32_t*)sig625_10MHz; - len = 12288+(Ncp>>1); - break; - - case 75: - Hz625ptr = (uint32_t*)sig625_15MHz; - len = 18432+((2*Ncp)/3); - break; - - case 100: - Hz625ptr = (uint32_t*)sig625_20MHz; - len = 24576+Ncp; - break; - - default: - Hz625ptr = (uint32_t*)sig625_5MHz; - len = 6144+(Ncp>>2); - break; - } - - for (aa=0; aa<phy_vars_ue->lte_frame_parms.nb_antennas_tx; aa++) { - txptr128 = (__m128i *)prach; - Hz625ptr128 = (__m128i *)Hz625ptr; - // apply 7.5 kHz - - // if (((slot>>1)&1) == 0) { // apply the sinusoid from the table directly - for (i=0; i<(len>>2); i++) { - mmtmp_re = _mm_madd_epi16(*txptr128,*Hz625ptr128); - // Real part of complex multiplication (note: 7_5kHz signal is conjugated for this to work) - mmtmp_im = _mm_shufflelo_epi16(*Hz625ptr128,_MM_SHUFFLE(2,3,0,1)); - mmtmp_im = _mm_shufflehi_epi16(mmtmp_im,_MM_SHUFFLE(2,3,0,1)); - mmtmp_im 
= _mm_sign_epi16(mmtmp_im,*(__m128i*)&conjugate75[0]); - mmtmp_im = _mm_madd_epi16(mmtmp_im,txptr128[0]); - mmtmp_re = _mm_srai_epi32(mmtmp_re,15); - mmtmp_im = _mm_srai_epi32(mmtmp_im,15); - mmtmp_re2 = _mm_unpacklo_epi32(mmtmp_re,mmtmp_im); - mmtmp_im2 = _mm_unpackhi_epi32(mmtmp_re,mmtmp_im); - /* - printf("%d: (%d,%d) (%d,%d) (%d,%d) (%d,%d) x (%d,%d) (%d,%d) (%d,%d) (%d,%d) => ", - i, - ((short*)txptr128)[0], - ((short*)txptr128)[1], - ((short*)txptr128)[2], - ((short*)txptr128)[3], - ((short*)txptr128)[4], - ((short*)txptr128)[5], - ((short*)txptr128)[6], - ((short*)txptr128)[7], - ((short*)Hz625ptr128)[0], - ((short*)Hz625ptr128)[1], - ((short*)Hz625ptr128)[2], - ((short*)Hz625ptr128)[3], - ((short*)Hz625ptr128)[4], - ((short*)Hz625ptr128)[5], - ((short*)Hz625ptr128)[6], - ((short*)Hz625ptr128)[7]);*/ - - txptr128[0] = _mm_packs_epi32(mmtmp_re2,mmtmp_im2); - /* printf("%(%d,%d) (%d,%d) (%d,%d) (%d,%d)\n", - ((short*)txptr128)[0], - ((short*)txptr128)[1], - ((short*)txptr128)[2], - ((short*)txptr128)[3], - ((short*)txptr128)[4], - ((short*)txptr128)[5], - ((short*)txptr128)[6], - ((short*)txptr128)[7]);*/ - - txptr128++; - Hz625ptr128++; - } - } -} - -void remove_625_Hz(PHY_VARS_eNB *phy_vars_eNB,int16_t *prach) -{ - - uint32_t *Hz625ptr; - __m128i *txptr128,*Hz625ptr128,Hz625_2,mmtmp_re,mmtmp_im,mmtmp_re2,mmtmp_im2; - uint8_t aa; - uint32_t i,Ncp,len; - LTE_DL_FRAME_PARMS *frame_parms=&phy_vars_eNB->lte_frame_parms; - uint8_t frame_type = frame_parms->frame_type; - uint8_t prach_ConfigIndex = frame_parms->prach_config_common.prach_ConfigInfo.prach_ConfigIndex; - uint8_t prach_fmt = get_prach_fmt(prach_ConfigIndex,frame_type); - - switch (prach_fmt) { - case 0: - Ncp = 3168; - break; - - case 1: - case 3: - Ncp = 21024; - break; - - case 2: - Ncp = 6240; - break; - - case 4: - Ncp = 448; - break; - default: - Ncp = 3168; - break; - } - - switch (frame_parms->N_RB_UL) { - - case 6: - Hz625ptr = (uint32_t*)sig625_1_25MHz; - len = 1536 + (Ncp>>4); - break; - - case 15: - Hz625ptr = (uint32_t*)sig625_2_5MHz; - len = 3072 + (Ncp>>3) ; - break; - - case 25: - Hz625ptr = (uint32_t*)sig625_5MHz; - len = 6144+(Ncp>>2); - break; - - case 50: - Hz625ptr = (uint32_t*)sig625_10MHz; - len = 12288+(Ncp>>1); - break; - - case 75: - Hz625ptr = (uint32_t*)sig625_15MHz; - len = 18432+((2*Ncp)/3); - break; - - case 100: - Hz625ptr = (uint32_t*)sig625_20MHz; - len = 24576+Ncp; - break; - - default: - Hz625ptr = (uint32_t*)sig625_5MHz; - len = 11400; - break; - } +#elif defined(__arm__) + + kHz7_5ptr128[0] = vmulq_s16(kHz7_5ptr128[0],((int16x8_t*)conjugate75_2)[0]); + mmtmp0 = vmull_s16(((int16x4_t*)rxptr128)[0],((int16x4_t*)kHz7_5ptr128)[0]); + //mmtmp0 = [Re(ch[0])Re(rx[0]) Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1]) Im(ch[1])Im(ch[1])] + mmtmp1 = vmull_s16(((int16x4_t*)rxptr128)[1],((int16x4_t*)kHz7_5ptr128)[1]); + //mmtmp1 = [Re(ch[2])Re(rx[2]) Im(ch[2])Im(ch[2]) Re(ch[3])Re(rx[3]) Im(ch[3])Im(ch[3])] + mmtmp_re = vcombine_s32(vpadd_s32(vget_low_s32(mmtmp0),vget_high_s32(mmtmp0)), + vpadd_s32(vget_low_s32(mmtmp1),vget_high_s32(mmtmp1))); + //mmtmp_re = [Re(ch[0])Re(rx[0])+Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1])+Im(ch[1])Im(ch[1]) Re(ch[2])Re(rx[2])+Im(ch[2])Im(ch[2]) Re(ch[3])Re(rx[3])+Im(ch[3])Im(ch[3])] + + mmtmp0 = vmull_s16(vrev32_s16(vmul_s16(((int16x4_t*)rxptr128)[0],*(int16x4_t*)conjugate75_2)), ((int16x4_t*)kHz7_5ptr128)[0]); + //mmtmp0 = [-Im(ch[0])Re(rx[0]) Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1]) Re(ch[1])Im(rx[1])] + mmtmp1 = 
vmull_s16(vrev32_s16(vmul_s16(((int16x4_t*)rxptr128)[1],*(int16x4_t*)conjugate75_2)), ((int16x4_t*)kHz7_5ptr128)[1]); + //mmtmp1 = [-Im(ch[2])Re(rx[2]) Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3]) Re(ch[3])Im(rx[3])] + mmtmp_im = vcombine_s32(vpadd_s32(vget_low_s32(mmtmp0),vget_high_s32(mmtmp0)), + vpadd_s32(vget_low_s32(mmtmp1),vget_high_s32(mmtmp1))); + //mmtmp_im = [-Im(ch[0])Re(rx[0])+Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1])+Re(ch[1])Im(rx[1]) -Im(ch[2])Re(rx[2])+Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3])+Re(ch[3])Im(rx[3])] + + rxptr128_7_5kHz[0] = vcombine_s16(vmovn_s32(mmtmp_re),vmovn_s32(mmtmp_im)); + rxptr128_7_5kHz++; + rxptr128++; + kHz7_5ptr128++; - for (aa=0; aa<phy_vars_eNB->lte_frame_parms.nb_antennas_tx; aa++) { - txptr128 = (__m128i *)prach; - Hz625ptr128 = (__m128i *)Hz625ptr; - // apply 7.5 kHz - // if (((slot>>1)&1) == 0) { // apply the sinusoid from the table directly - for (i=0; i<(len>>2); i++) { - Hz625_2 = _mm_sign_epi16(*Hz625ptr128,*(__m128i*)&conjugate75_2[0]); - mmtmp_re = _mm_madd_epi16(*txptr128,Hz625_2); - // Real part of complex multiplication (note: 7_5kHz signal is conjugated for this to work) - mmtmp_im = _mm_shufflelo_epi16(Hz625_2,_MM_SHUFFLE(2,3,0,1)); - mmtmp_im = _mm_shufflehi_epi16(mmtmp_im,_MM_SHUFFLE(2,3,0,1)); - mmtmp_im = _mm_sign_epi16(mmtmp_im,*(__m128i*)&conjugate75[0]); - mmtmp_im = _mm_madd_epi16(mmtmp_im,txptr128[0]); - mmtmp_re = _mm_srai_epi32(mmtmp_re,15); - mmtmp_im = _mm_srai_epi32(mmtmp_im,15); - mmtmp_re2 = _mm_unpacklo_epi32(mmtmp_re,mmtmp_im); - mmtmp_im2 = _mm_unpackhi_epi32(mmtmp_re,mmtmp_im); - /* - printf("%d: (%d,%d) (%d,%d) (%d,%d) (%d,%d) x (%d,%d) (%d,%d) (%d,%d) (%d,%d) => ", - i, - ((short*)txptr128)[0], - ((short*)txptr128)[1], - ((short*)txptr128)[2], - ((short*)txptr128)[3], - ((short*)txptr128)[4], - ((short*)txptr128)[5], - ((short*)txptr128)[6], - ((short*)txptr128)[7], - ((short*)Hz625ptr128)[0], - ((short*)Hz625ptr128)[1], - ((short*)Hz625ptr128)[2], - ((short*)Hz625ptr128)[3], - ((short*)Hz625ptr128)[4], - ((short*)Hz625ptr128)[5], - ((short*)Hz625ptr128)[6], - ((short*)Hz625ptr128)[7]);*/ - - txptr128[0] = _mm_packs_epi32(mmtmp_re2,mmtmp_im2); - /* printf("%(%d,%d) (%d,%d) (%d,%d) (%d,%d)\n", - ((short*)txptr128)[0], - ((short*)txptr128)[1], - ((short*)txptr128)[2], - ((short*)txptr128)[3], - ((short*)txptr128)[4], - ((short*)txptr128)[5], - ((short*)txptr128)[6], - ((short*)txptr128)[7]);*/ - - txptr128++; - Hz625ptr128++; +#endif } } } - - -void init_prach625(LTE_DL_FRAME_PARMS *frame_parms) -{ - - uint32_t len,i,Ncp; - double fs; - int16_t *Hz625ptr; - uint8_t frame_type = frame_parms->frame_type; - uint8_t prach_ConfigIndex = frame_parms->prach_config_common.prach_ConfigInfo.prach_ConfigIndex; - uint8_t prach_fmt = get_prach_fmt(prach_ConfigIndex,frame_type); - - switch (prach_fmt) { - case 0: - Ncp = 3168; - break; - - case 1: - case 3: - Ncp = 21024; - break; - - case 2: - Ncp = 6240; - break; - - case 4: - Ncp = 448; - break; - - default: - Ncp = 3168; - break; - } - - switch (frame_parms->N_RB_UL) { - case 6: - len = 1536 + (Ncp>>4); - fs = 1920000.0; - Hz625ptr = sig625_1_25MHz; - break; - - case 15: - len = 3072 + (Ncp>>3) ; - fs = 3840000.0; - Hz625ptr = sig625_2_5MHz; - break; - - case 25: - len = 6144+(Ncp>>2); - fs = 7680000.0; - Hz625ptr = sig625_5MHz; - break; - - case 50: - len = 12288+(Ncp>>1); - fs = 15360000.0; - Hz625ptr = sig625_10MHz; - break; - - case 75: - len = 18432+((2*Ncp)/3); - fs = 23040000.0; - Hz625ptr = sig625_15MHz; - break; - - case 100: - len = 24576+Ncp; - fs = 30720000.0; - Hz625ptr 
= sig625_20MHz;
-    break;
-
-  default:
-    len = 6144+(Ncp>>2);
-    fs = 7680000.0;
-    Hz625ptr = sig625_5MHz;
-    break;
-  }
-
-  for (i=0; i<len; i++) {
-    Hz625ptr[i<<1] = (int16_t)floor(32767.0*cos(2*M_PI*625*i/fs));
-    Hz625ptr[1+(i<<1)] = (int16_t)floor(32767.0*sin(2*M_PI*625*i/fs));
-    // printf("prach625 %d: (%d,%d)\n",i,Hz625ptr[i<<1],Hz625ptr[1+(i<<1)]);
-  }
-
-}
diff --git a/openair1/PHY/TOOLS/cdot_prod.c b/openair1/PHY/TOOLS/cdot_prod.c
index b6a4095ffe..ee8425fd1a 100644
--- a/openair1/PHY/TOOLS/cdot_prod.c
+++ b/openair1/PHY/TOOLS/cdot_prod.c
@@ -43,10 +43,12 @@ int32_t dot_product(int16_t *x,
                     uint8_t output_shift)
{
+  uint32_t n;
+
+#if defined(__x86_64__) || defined(__i386__)
  __m128i *x128,*y128,mmtmp1,mmtmp2,mmtmp3,mmcumul,mmcumul_re,mmcumul_im;
  __m64 mmtmp7;
  __m128i minus_i = _mm_set_epi16(-1,1,-1,1,-1,1,-1,1);
-  uint32_t n;
  int32_t result;

  x128 = (__m128i*) x;
@@ -113,11 +115,54 @@ int32_t dot_product(int16_t *x,
  // convert back to integer
  result = _mm_cvtsi64_si32(mmtmp7);

  _mm_empty();
  _m_empty();

+  return(result);
+
+#elif defined(__arm__)
+  int16x4_t *x_128=(int16x4_t*)x;
+  int16x4_t *y_128=(int16x4_t*)y;
+  int32x4_t tmp_re,tmp_im;
+  int32x4_t tmp_re1,tmp_im1;
+  int32x4_t re_cumul,im_cumul;
+  int32x2_t re_cumul2,im_cumul2;
+  int32x4_t shift = vdupq_n_s32(-output_shift);
+  int32x2x2_t result2;
+  int16_t conjug[4]__attribute__((aligned(16))) = {-1,1,-1,1};
-  return(result);
+
+  re_cumul = vdupq_n_s32(0);
+  im_cumul = vdupq_n_s32(0);
+
+  for (n=0; n<(N>>2); n++) {
+
+    tmp_re = vmull_s16(x_128[0], y_128[0]);
+    //tmp_re = [Re(x[0])Re(y[0]) Im(x[0])Im(y[0]) Re(x[1])Re(y[1]) Im(x[1])Im(y[1])]
+    tmp_re1 = vmull_s16(x_128[1], y_128[1]);
+    //tmp_re1 = [Re(x[2])Re(y[2]) Im(x[2])Im(y[2]) Re(x[3])Re(y[3]) Im(x[3])Im(y[3])]
+    tmp_re = vcombine_s32(vpadd_s32(vget_low_s32(tmp_re),vget_high_s32(tmp_re)),
+                          vpadd_s32(vget_low_s32(tmp_re1),vget_high_s32(tmp_re1)));
+    //tmp_re = [Re(x[0])Re(y[0])+Im(x[0])Im(y[0]) ... Re(x[3])Re(y[3])+Im(x[3])Im(y[3])]
+
+    tmp_im = vmull_s16(vrev32_s16(vmul_s16(x_128[0],*(int16x4_t*)conjug)),y_128[0]);
+    //tmp_im = [-Im(x[0])Re(y[0]) Re(x[0])Im(y[0]) -Im(x[1])Re(y[1]) Re(x[1])Im(y[1])]
+    tmp_im1 = vmull_s16(vrev32_s16(vmul_s16(x_128[1],*(int16x4_t*)conjug)),y_128[1]);
+    //tmp_im1 = [-Im(x[2])Re(y[2]) Re(x[2])Im(y[2]) -Im(x[3])Re(y[3]) Re(x[3])Im(y[3])]
+    tmp_im = vcombine_s32(vpadd_s32(vget_low_s32(tmp_im),vget_high_s32(tmp_im)),
+                          vpadd_s32(vget_low_s32(tmp_im1),vget_high_s32(tmp_im1)));
+    //tmp_im = [-Im(x[0])Re(y[0])+Re(x[0])Im(y[0]) ... -Im(x[3])Re(y[3])+Re(x[3])Im(y[3])]
+
+    x_128+=2;
+    y_128+=2;
+    re_cumul = vqaddq_s32(re_cumul,vqshlq_s32(tmp_re,shift));
+    im_cumul = vqaddq_s32(im_cumul,vqshlq_s32(tmp_im,shift));
+  }
+
+  re_cumul2 = vpadd_s32(vget_low_s32(re_cumul),vget_high_s32(re_cumul));
+  im_cumul2 = vpadd_s32(vget_low_s32(im_cumul),vget_high_s32(im_cumul));
+  re_cumul2 = vpadd_s32(re_cumul2,re_cumul2);
+  im_cumul2 = vpadd_s32(im_cumul2,im_cumul2);
+  result2 = vzip_s32(re_cumul2,im_cumul2);
+  return(vget_lane_s32(result2.val[0],0));
+#endif
}
diff --git a/openair1/PHY/TOOLS/cmult_sv.c b/openair1/PHY/TOOLS/cmult_sv.c
index 964ae43fa9..c9d3a8a50e 100644
--- a/openair1/PHY/TOOLS/cmult_sv.c
+++ b/openair1/PHY/TOOLS/cmult_sv.c
@@ -29,9 +29,27 @@
 #include "PHY/sse_intrin.h"
 #include "defs.h"

-#ifndef EXPRESSMIMO_TARGET
-static __m128i alpha_128 __attribute__ ((aligned(16)));
-static __m128i shift
__attribute__ ((aligned(16))); +#if defined(__x86_64__) || defined(__i386__) +#define simd_q15_t __m128i +#define simdshort_q15_t __m64 +#define shiftright_int16(a,shift) _mm_srai_epi16(a,shift) +#define set1_int16(a) _mm_set1_epi16(a) +#define mulhi_int16(a,b) _mm_slli_epi16(_mm_mulhi_epi16(a,b),1) +#define mulhi_s1_int16(a,b) _mm_slli_epi16(_mm_mulhi_epi16(a,b),2) +#define adds_int16(a,b) _mm_adds_epi16(a,b) +#define mullo_int16(a,b) _mm_mullo_epi16(a,b) +#elif defined(__arm__) +#define simd_q15_t int16x8_t +#define simdshort_q15_t int16x4_t +#define shiftright_int16(a,shift) vshrq_n_s16(a,shift) +#define set1_int16(a) vdupq_n_s16(a) +#define mulhi_int16(a,b) vqdmulhq_s16(a,b) +#define mulhi_s1_int16(a,b) vshlq_n_s16(vqdmulhq_s16(a,b),1) +#define adds_int16(a,b) vqaddq_s16(a,b) +#define mullo_int16(a,b) vmulq_s16(a,b) +#define _mm_empty() +#define _m_empty() +#endif void multadd_complex_vector_real_scalar(int16_t *x, @@ -41,19 +59,19 @@ void multadd_complex_vector_real_scalar(int16_t *x, uint32_t N) { - __m128i alpha_128,*x_128=(__m128i*)x,*y_128=(__m128i*)y; + simd_q15_t alpha_128,*x_128=(simd_q15_t *)x,*y_128=(simd_q15_t*)y; int n; - alpha_128 = _mm_set1_epi16(alpha); + alpha_128 = set1_int16(alpha); if (zero_flag == 1) for (n=0; n<N>>2; n++) { - y_128[n] = _mm_slli_epi16(_mm_mulhi_epi16(x_128[n],alpha_128),1); + y_128[n] = mulhi_int16(x_128[n],alpha_128); } else for (n=0; n<N>>2; n++) { - y_128[n] = _mm_adds_epi16(y_128[n],_mm_slli_epi16(_mm_mulhi_epi16(x_128[n],alpha_128),1)); + y_128[n] = adds_int16(y_128[n],mulhi_int16(x_128[n],alpha_128)); } _mm_empty(); @@ -69,32 +87,33 @@ void multadd_real_vector_complex_scalar(int16_t *x, uint32_t i; // do 8 multiplications at a time - __m128i alpha_r_128,alpha_i_128,yr,yi,*x_128=(__m128i*)x,*y_128=(__m128i*)y; + simd_q15_t alpha_r_128,alpha_i_128,yr,yi,*x_128=(simd_q15_t*)x,*y_128=(simd_q15_t*)y; int j; - // printf("alpha = %d,%d\n",alpha[0],alpha[1]); - alpha_r_128 = _mm_set_epi16(alpha[0],alpha[0],alpha[0],alpha[0],alpha[0],alpha[0],alpha[0],alpha[0]); - alpha_i_128 = _mm_set_epi16(alpha[1],alpha[1],alpha[1],alpha[1],alpha[1],alpha[1],alpha[1],alpha[1]); - + alpha_r_128 = set1_int16(alpha[0]); + alpha_i_128 = set1_int16(alpha[1]); j=0; for (i=0; i<N>>3; i++) { - yr = _mm_slli_epi16(_mm_mulhi_epi16(alpha_r_128,x_128[i]),2); - yi = _mm_slli_epi16(_mm_mulhi_epi16(alpha_i_128,x_128[i]),2); - - // print_shorts("yr",&yr); - // print_shorts("yi",&yi); - + yr = mulhi_s1_int16(alpha_r_128,x_128[i]); + yi = mulhi_s1_int16(alpha_i_128,x_128[i]); +#if defined(__x86_64__) || defined(__i386__) y_128[j] = _mm_adds_epi16(y_128[j],_mm_unpacklo_epi16(yr,yi)); - // print_shorts("y",&y_128[j]); j++; y_128[j] = _mm_adds_epi16(y_128[j],_mm_unpackhi_epi16(yr,yi)); - // print_shorts("y",&y_128[j]); j++; - +#elif defined(__arm__) + int16x8x2_t yint; + yint = vzipq_s16(yr,yi); + y_128[j] = adds_int16(y_128[j],yint.val[0]); + j++; + y_128[j] = adds_int16(y_128[j],yint.val[1]); + + j++; +#endif } _mm_empty(); @@ -102,6 +121,7 @@ void multadd_real_vector_complex_scalar(int16_t *x, } +/* int rotate_cpx_vector(int16_t *x, int16_t *alpha, int16_t *y, @@ -127,23 +147,10 @@ int rotate_cpx_vector(int16_t *x, register __m128i m0,m1; - // short *temps; - // int *tempd; - __m128i *x_128; __m128i *y_128; - // __m128i temp; - - /* - msg("rotate_cpx_vector: %x,%x,%x,%d,%d\n", - x, - alpha, - y, - N, - output_shift); - */ shift = _mm_cvtsi32_si128(output_shift); @@ -177,111 +184,44 @@ int rotate_cpx_vector(int16_t *x, // we compute 4 cpx multiply for each loop for(i=0; i<(N>>3); 
i++) { - // printf("i=%d\n",i); - /* - temps = (short *)x_128; - printf("x1 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - temps = (int16_t *)&alpha_128; - printf("alpha : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - */ - m0 = _mm_madd_epi16(x_128[0],alpha_128); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - - /* - temp = m0; - - tempd = &temp; - printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - - */ m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - /* - temp = m0; - - tempd = (int *)&temp; - printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - */ - m1=m0; m0 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im] y_128[0] = _mm_unpacklo_epi32(m0,m0); // 1- pack in a 128 bit register [re im re im] - - // temps = (int16_t *)&y_128[0]; - // printf("y0 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - - - m0 = _mm_madd_epi16(x_128[1],alpha_128); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - - - m1 = m0; m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im] - y_128[1] = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im] - - - m0 = _mm_madd_epi16(x_128[2],alpha_128); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - - - m1 = m0; m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im] - y_128[2] = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im] - - - m0 = _mm_madd_epi16(x_128[3],alpha_128); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - - - m1 = m0; m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im] - y_128[3] = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im] - - if (format==1) { // Put output in proper format (Re,-Im,Im,Re), shuffle = (0,1,3,2) = 0x1e - // print_shorts(y_128[0],"y_128[0]="); y_128[0] = _mm_shufflelo_epi16(y_128[0],0x1e); y_128[0] = _mm_shufflehi_epi16(y_128[0],0x1e); ((int16_t*)&y_128[0])[1] = -((int16_t*)&y_128[0])[1]; ((int16_t*)&y_128[0])[5] = -((int16_t*)&y_128[0])[5]; - // print_shorts(y_128[0],"y_128[0]="); - - // print_shorts(y_128[1],"y_128[1]="); y_128[1] = _mm_shufflelo_epi16(y_128[1],0x1e); y_128[1] = _mm_shufflehi_epi16(y_128[1],0x1e); ((int16_t*)&y_128[1])[1] = -((int16_t*)&y_128[1])[1]; ((int16_t*)&y_128[1])[5] = -((int16_t*)&y_128[1])[5]; - // print_shorts(y_128[1],"y_128[1]="); - - // print_shorts(y_128[2],"y_128[2]="); y_128[2] = _mm_shufflelo_epi16(y_128[2],0x1e); y_128[2] = _mm_shufflehi_epi16(y_128[2],0x1e); ((int16_t*)&y_128[2])[1] = -((int16_t*)&y_128[2])[1]; ((int16_t*)&y_128[2])[5] = -((int16_t*)&y_128[2])[5]; - // print_shorts(y_128[2],"y_128[2]="); - - // print_shorts(y_128[3],"y_128[3]="); y_128[3] = _mm_shufflelo_epi16(y_128[3],0x1e); y_128[3] = _mm_shufflehi_epi16(y_128[3],0x1e); ((int16_t*)&y_128[3])[1] = -((int16_t*)&y_128[3])[1]; ((int16_t*)&y_128[3])[5] = -((int16_t*)&y_128[3])[5]; - // print_shorts(y_128[3],"y_128[3]="); - } @@ -326,16 +266,6 @@ int rotate_cpx_vector2(int16_t *x, __m128i *y_128; - 
/* - printf("rotate_cpx_vector2: %x,%x,%x,%d,%d\n", - x, - alpha, - y, - N, - output_shift); - */ - - shift = _mm_cvtsi32_si128(output_shift); x_128 = (__m128i *)&x[0]; @@ -361,51 +291,22 @@ int rotate_cpx_vector2(int16_t *x, y_128 = (__m128i *)&y[0]; - // _mm_empty(); - // return(0); - // we compute 4 cpx multiply for each loop for(i=0; i<(N>>1); i++) { - // temps = (short *)&x_128[i]; - // printf("x1 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - // temps = (short *)&alpha_128; - // printf("x2 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - m0 = _mm_madd_epi16(x_128[i],alpha_128); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - - // temp = m0; - - // tempd = &temp; - // printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - // temp = m0; - - // tempd = (int *)&temp; - // printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - m1=m0; m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im] - - - y_128[i] = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im] - - if (format==1) { // Put output in proper format (Re,-Im,Im,Re), shuffle = (0,1,3,2) = 0x1e - // print_shorts(y_128[0],"y_128[0]="); y_128[i] = _mm_shufflelo_epi16(y_128[i],0x1e); y_128[i] = _mm_shufflehi_epi16(y_128[i],0x1e); ((int16_t*)&y_128[i])[1] = -((int16_t*)&y_128[i])[1]; ((int16_t*)&y_128[i])[5] = -((int16_t*)&y_128[i])[5]; - // print_shorts(y_128[0],"y_128[0]="); - } - } @@ -415,12 +316,13 @@ int rotate_cpx_vector2(int16_t *x, return(0); } +*/ -int rotate_cpx_vector_norep(int16_t *x, - int16_t *alpha, - int16_t *y, - uint32_t N, - uint16_t output_shift) +int rotate_cpx_vector(int16_t *x, + int16_t *alpha, + int16_t *y, + uint32_t N, + uint16_t output_shift) { // Multiply elementwise two complex vectors of N elements // x - input 1 in the format |Re0 Im0 |,......,|Re(N-1) Im(N-1)| @@ -438,20 +340,13 @@ int rotate_cpx_vector_norep(int16_t *x, uint32_t i; // loop counter - register __m128i m0,m1,m2,m3; - - // int16_t *temps; - // int *tempd; - int *xd; - //__m128i *x_128; - __m128i *y_128; - // __m128i temp; + simd_q15_t *y_128,alpha_128; + int32_t *xd=(int32_t *)x; - - shift = _mm_cvtsi32_si128(output_shift); - xd = (int *) x; - y_128 = (__m128i *) y; +#if defined(__x86_64__) || defined(__i386__) + __m128i shift = _mm_cvtsi32_si128(output_shift); + register simd_q15_t m0,m1,m2,m3; ((int16_t *)&alpha_128)[0] = alpha[0]; ((int16_t *)&alpha_128)[1] = -alpha[1]; @@ -461,43 +356,55 @@ int rotate_cpx_vector_norep(int16_t *x, ((int16_t *)&alpha_128)[5] = -alpha[1]; ((int16_t *)&alpha_128)[6] = alpha[1]; ((int16_t *)&alpha_128)[7] = alpha[0]; +#elif defined(__arm__) + int32x4_t shift; + int32x4_t ab_re0,ab_re1,ab_im0,ab_im1,re32,im32; + int16_t reflip[8] __attribute__((aligned(16))) = {1,-1,1,-1,1,-1,1,-1}; + int32x4x2_t xtmp; - // _mm_empty(); - // return(0); + ((int16_t *)&alpha_128)[0] = alpha[0]; + ((int16_t *)&alpha_128)[1] = alpha[1]; + ((int16_t *)&alpha_128)[2] = alpha[0]; + ((int16_t *)&alpha_128)[3] = alpha[1]; + ((int16_t *)&alpha_128)[4] = alpha[0]; + ((int16_t *)&alpha_128)[5] = alpha[1]; + ((int16_t *)&alpha_128)[6] = alpha[0]; + ((int16_t *)&alpha_128)[7] = alpha[1]; + int16x8_t bflip = vrev32q_s16(alpha_128); + int16x8_t bconj = vmulq_s16(alpha_128,*(int16x8_t *)reflip); + shift = vdupq_n_s32(-output_shift); +#endif + y_128 = 
(simd_q15_t *) y; - for(i=0; i<N>>2; i++) { + for(i=0; i<N>>2; i++) { +#if defined(__x86_64__) || defined(__i386__) m0 = _mm_setr_epi32(xd[0],xd[0],xd[1],xd[1]); m1 = _mm_setr_epi32(xd[2],xd[2],xd[3],xd[3]); - - // printf("i=%d\n",i); - // temps = (short *)x1_128; - // printf("x1 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - // temps = (short *)x2_128; - // printf("x2 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - m2 = _mm_madd_epi16(m0,alpha_128); //complex multiply. result is 32bit [Re Im Re Im] m3 = _mm_madd_epi16(m1,alpha_128); //complex multiply. result is 32bit [Re Im Re Im] - - // temp = m0; - - // tempd = &temp; - // printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - m2 = _mm_sra_epi32(m2,shift); // shift right by shift in order to compensate for the input amplitude m3 = _mm_sra_epi32(m3,shift); // shift right by shift in order to compensate for the input amplitude - // temp = m0; - - // tempd = (int *)&temp; - // printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - - y_128[0] = _mm_packs_epi32(m2,m3); // pack in 16bit integers with saturation [re im re im re im re im] +#elif defined(__arm__) + + ab_re0 = vmull_s16(((int16x4_t*)xd)[0],((int16x4_t*)&bconj)[0]); + ab_re1 = vmull_s16(((int16x4_t*)xd)[1],((int16x4_t*)&bconj)[1]); + ab_im0 = vmull_s16(((int16x4_t*)xd)[0],((int16x4_t*)&bflip)[0]); + ab_im1 = vmull_s16(((int16x4_t*)xd)[1],((int16x4_t*)&bflip)[1]); + re32 = vshlq_s32(vcombine_s32(vpadd_s32(((int32x2_t*)&ab_re0)[0],((int32x2_t*)&ab_re0)[1]), + vpadd_s32(((int32x2_t*)&ab_re1)[0],((int32x2_t*)&ab_re1)[1])), + shift); + im32 = vshlq_s32(vcombine_s32(vpadd_s32(((int32x2_t*)&ab_im0)[0],((int32x2_t*)&ab_im0)[1]), + vpadd_s32(((int32x2_t*)&ab_im1)[0],((int32x2_t*)&ab_im1)[1])), + shift); - // temps = (short *)&y_128[0]; - // printf("y0 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); + xtmp = vzipq_s32(re32,im32); + + y_128[0] = vcombine_s16(vmovn_s32(xtmp.val[0]),vmovn_s32(xtmp.val[1])); +#endif xd+=4; y_128+=1; @@ -510,7 +417,7 @@ int rotate_cpx_vector_norep(int16_t *x, return(0); } - +/* int mult_vector32_scalar(int16_t *x1, int x2, int16_t *y, @@ -530,16 +437,6 @@ int mult_vector32_scalar(int16_t *x1, uint32_t i; // loop counter - /* - #ifdef USER_MODE - char *tempc; - short *temps; - int *tempd; - long long *templ; - __m128i temp; - #endif - */ - __m128i *x1_128; __m128i x2_128; __m128i *y_128; @@ -553,20 +450,6 @@ int mult_vector32_scalar(int16_t *x1, // we compute 4 cpx multiply for each loop for(i=0; i<(N>>3); i++) { y_128[0] = _mm_mul_epu32(x1_128[0],x2_128); - - /* - #ifdef USER_MODE - printf("i=%d\n",i); - tempd = (int *)x1_128; - printf("x1 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - tempd = (int *)&x2_128; - printf("x2 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - // tempd = (int *)y_128; - // printf("y : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - templ = (long long *)y_128; - printf("y : %lld,%lld\n",templ[0],templ[1]); - #endif - */ y_128[1] = _mm_mul_epu32(x1_128[1],x2_128); y_128[2] = _mm_mul_epu32(x1_128[2],x2_128); y_128[3] = _mm_mul_epu32(x1_128[3],x2_128); @@ -582,7 +465,7 @@ int mult_vector32_scalar(int16_t *x1, return(0); } - +*/ int complex_conjugate(int16_t *x1, int16_t *y, @@ -591,46 +474,20 @@ int complex_conjugate(int16_t *x1, { uint32_t i; // loop counter - /* - #ifdef USER_MODE - char *tempc; - short *temps; - int 
*tempd; - long long *templ; - __m128i temp; - #endif - */ - - __m128i *x1_128; - __m128i x2_128; - __m128i *y_128; - - - x1_128 = (__m128i *)&x1[0]; - x2_128 = _mm_set_epi16(-1,1,-1,1,-1,1,-1,1); - y_128 = (__m128i *)&y[0]; + simd_q15_t *x1_128; + simd_q15_t *y_128; + int16_t x2[8] __attribute__((aligned(16))) = {1,-1,1,-1,1,-1,1,-1}; + simd_q15_t *x2_128 = (simd_q15_t*)&x2[0]; + x1_128 = (simd_q15_t *)&x1[0]; + y_128 = (simd_q15_t *)&y[0]; // we compute 4 cpx multiply for each loop for(i=0; i<(N>>3); i++) { - y_128[0] = _mm_mullo_epi16(x1_128[0],x2_128); - - /* - #ifdef USER_MODE - printf("i=%d\n",i); - tempd = (int *)x1_128; - printf("x1 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - tempd = (int *)&x2_128; - printf("x2 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - // tempd = (int *)y_128; - // printf("y : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - templ = (long long *)y_128; - printf("y : %lld,%lld\n",templ[0],templ[1]); - #endif - */ - y_128[1] = _mm_mullo_epi16(x1_128[1],x2_128); - y_128[2] = _mm_mullo_epi16(x1_128[2],x2_128); - y_128[3] = _mm_mullo_epi16(x1_128[3],x2_128); + y_128[0] = mullo_int16(x1_128[0],*x2_128); + y_128[1] = mullo_int16(x1_128[1],*x2_128); + y_128[2] = mullo_int16(x1_128[2],*x2_128); + y_128[3] = mullo_int16(x1_128[3],*x2_128); x1_128+=4; @@ -706,15 +563,3 @@ main () #endif //MAIN -#else //EXPRESSMIMO_TARGET - -int rotate_cpx_vector(int16_t *x, - int16_t *alpha, - int16_t *y, - uint32_t N, - uint16_t output_shift, - uint8_t format) -{ - -} -#endif //EXPRESSMIMO_TARGET diff --git a/openair1/PHY/TOOLS/cmult_vv.c b/openair1/PHY/TOOLS/cmult_vv.c index aa94458943..900d661341 100755 --- a/openair1/PHY/TOOLS/cmult_vv.c +++ b/openair1/PHY/TOOLS/cmult_vv.c @@ -32,485 +32,96 @@ #include <stdio.h> #endif -#ifndef EXPRESSMIMO_TARGET -static __m128i shift __attribute__ ((aligned(16))); -static __m128i m0,m1,m2,m4 __attribute__ ((aligned(16))); - -//#define DEBUG_CMULT - -int mult_cpx_vector(int16_t *x1, - int16_t *x2, - int16_t *y, - uint32_t N, - int output_shift) -{ - // Multiply elementwise two complex vectors of N elements with repeated formatted output - // x1 - input 1 in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)| - // We assume x1 with a dinamic of 15 bit maximum - // - // x2 - input 2 in the format |Re0 -Im0 Im0 Re0|,......,|Re(N-1) -Im(N-1) Im(N-1) Re(N-1)| - // We assume x2 with a dinamic of 14 bit maximum - /// - // y - output in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)| - // - // N - the size f the vectors (this function does N cpx mpy. WARNING: N>=4; - // - // log2_amp - increase the output amplitude by a factor 2^log2_amp (default is 0) - // WARNING: log2_amp>0 can cause overflow!! 
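/* Editor's note -- a hedged, illustrative sketch, not part of the patch: the
 * legacy kernels deleted in this hunk all realize one idea, a fixed-point
 * complex multiply via a paired multiply-add (_mm_madd_epi16 / pmaddwd). With
 * x1 repeated per sample (|Re Im Re Im|) and x2 pre-shuffled to
 * |Re -Im Im Re|, each 32-bit lane of the multiply-add is directly Re or Im
 * of the product. Scalar reference, all names illustrative: */
#include <stdint.h>

static void mult_cpx_scalar_ref(const int16_t *x1, const int16_t *x2,
                                int16_t *y, uint32_t N, int output_shift)
{
  for (uint32_t i = 0; i < N; i++) {
    /* lane 0: Re1*Re2 + Im1*(-Im2) = Re(x1[i]*x2[i]) */
    int32_t re = (x1[4*i]*x2[4*i] + x1[4*i+1]*x2[4*i+1]) >> output_shift;
    /* lane 1: Re1*Im2 + Im1*Re2 = Im(x1[i]*x2[i]) */
    int32_t im = (x1[4*i+2]*x2[4*i+2] + x1[4*i+3]*x2[4*i+3]) >> output_shift;
    y[4*i]   = y[4*i+2] = (int16_t)re;  /* repeated output; saturation omitted */
    y[4*i+1] = y[4*i+3] = (int16_t)im;
  }
}
/* The pre-shuffled x2 layout is what lets one pmaddwd produce both halves of
 * the complex product, which is why the old functions required it. */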
- - uint32_t i; // loop counter - - /* - #ifdef USER_MODE - int16_t *temps; - int *tempd; - #endif - */ - __m128i *x1_128; - __m128i *x2_128; - __m128i *y_128; - - // __m128i temp; - - - shift = _mm_cvtsi32_si128(output_shift); - x1_128 = (__m128i *)&x1[0]; - x2_128 = (__m128i *)&x2[0]; - y_128 = (__m128i *)&y[0]; - - - // we compute 4 cpx multiply for each loop - for(i=0; i<(N>>3); i++) { - - //msg("mult_cpx_vector: iteration %d, x1=%p, x2=%p, y=%p\n",i,x1_128,x2_128,y_128); - /* - #ifdef USER_MODE - printf("i=%d\n",i); - temps = (int16_t *)x1_128; - printf("x1 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - temps = (int16_t *)x2_128; - printf("x2 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - #endif - */ - - m0 = _mm_madd_epi16(x1_128[0],x2_128[0]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - - // temp = m0; - - // tempd = &temp; - // printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - // temp = m0; - - // tempd = (int *)&temp; - // printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - - m1 = m0; - m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im] - y_128[0] = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im] - - /* - #ifdef USER_MODE - temps = (int16_t *)&y_128[0]; - printf("y0 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - #endif USER_MODE - */ - - m0 = _mm_madd_epi16(x1_128[1],x2_128[1]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - - - - m1 = m0; - m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im] - - y_128[1] = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im] - - - - m0 = _mm_madd_epi16(x1_128[2],x2_128[2]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - - - - m1 = m0; - m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im] - - y_128[2] = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im] - - - - m0 = _mm_madd_epi16(x1_128[3],x2_128[3]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - - - - m1 = m0; - m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im] - - y_128[3] = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im] - - - - - - x1_128+=4; - x2_128+=4; - y_128 +=4; - } - - - _mm_empty(); - _m_empty(); - - return(0); -} +#if defined(__x86_64__) || defined(__i386__) +int16_t conjug[8]__attribute__((aligned(16))) = {-1,1,-1,1,-1,1,-1,1} ; +#define simd_q15_t __m128i +#define simdshort_q15_t __m64 +#elif defined(__arm__) +int16_t conjug[4]__attribute__((aligned(16))) = {-1,1,-1,1} ; +#define simd_q15_t int16x8_t +#define simdshort_q15_t int16x4_t +#define _mm_empty() +#define _m_empty() +#endif -int mult_cpx_vector_unprepared(int16_t *x1, - int16_t *x2, - int16_t *y, - uint32_t N, - int output_shift) +int mult_cpx_conj_vector(int16_t *x1, + int16_t *x2, + int16_t *y, + uint32_t N, + int output_shift) { // Multiply elementwise two complex 
vectors of N elements, taking the complex conjugate of x1, with normal formatted output
- // x1 - input 1 in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)|
+ // x1 - input 1 in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)|
 // We assume x1 with a dynamic of 15 bit maximum
 //
- // x2 - input 2 in the format |Re0 Im0 Re0 Re0|,......,|Re(N-1) -Im(N-1) Im(N-1) Re(N-1)|
+ // x2 - input 2 in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)|
 // We assume x2 with a dynamic of 14 bit maximum
 //
- // y - output in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)|
+ // y - output in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)|
 //
 // N - the size of the vectors (this function does N cpx mpy). WARNING: N>=4
 //
- // log2_amp - increase the output amplitude by a factor 2^log2_amp (default is 0)
- // WARNING: log2_amp>0 can cause overflow!!
+ // output_shift - shift to be applied to generate the output

 uint32_t i; // loop counter

-#ifdef DEBUG_CMULT
- int16_t *temps;
- int *tempd;
+ simd_q15_t *x1_128;
+ simd_q15_t *x2_128;
+ simd_q15_t *y_128;
+#if defined(__x86_64__) || defined(__i386__)
+ simd_q15_t tmp_re,tmp_im;
+ simd_q15_t tmpy0,tmpy1;
+#elif defined(__arm__)
+ int32x4_t tmp_re,tmp_im;
+ int32x4_t tmp_re1,tmp_im1;
+ int16x4x2_t tmpy;
+ int32x4_t shift = vdupq_n_s32(-output_shift);
#endif

- __m128i *x1_128;
- __m128i *x2_128;
- __m128i *y_128;
-
- __m128i shuf_x2;
-
-
- shift = _mm_cvtsi32_si128(output_shift);
- x1_128 = (__m128i *)&x1[0];
- x2_128 = (__m128i *)&x2[0];
- y_128 = (__m128i *)&y[0];
-
-
- // we compute 4 cpx multiply for each loop
- for(i=0; i<(N>>3); i++) {
-
- //msg("mult_cpx_vector: iteration %d, x1=%p, x2=%p, y=%p\n",i,x1_128,x2_128,y_128);
- /*
- #ifdef USER_MODE
- printf("i=%d\n",i);
- temps = (int16_t *)x1_128;
- printf("x1 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]);
- temps = (int16_t *)x2_128;
- printf("x2 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]);
- #endif
- */
-
- shuf_x2 = _mm_shufflelo_epi16(x2_128[0],_MM_SHUFFLE(2,3,0,1));
- shuf_x2 = _mm_shufflehi_epi16(shuf_x2,_MM_SHUFFLE(2,3,0,1));
-
-#ifdef DEBUG_CMULT
-
- tempd = &shuf_x2;
- printf("shuf_x2 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]);
-#endif //DEBUG_CMULT
-
- m0 = _mm_madd_epi16(x1_128[0],x2_128[0]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0]
-
- // temp = m0;
-
- // tempd = &temp;
- // printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]);
-
- m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude
-
- // temp = m0;
-
- // tempd = (int *)&temp;
- // printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]);
-
- m1 = m0;
- m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im]
- y_128[0] = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im]
-
- /*
- #ifdef USER_MODE
- temps = (int16_t *)&y_128[0];
- printf("y0 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]);
- #endif USER_MODE
- */
-
- m0 = _mm_madd_epi16(x1_128[1],x2_128[1]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0]
-
- m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude
-
- m1 = m0;
- m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im]
-
- y_128[1] = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register
[re im re im] - - - - m0 = _mm_madd_epi16(x1_128[2],x2_128[2]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - - - - m1 = m0; - m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im] - - y_128[2] = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im] - - - - m0 = _mm_madd_epi16(x1_128[3],x2_128[3]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - - - - m1 = m0; - m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im] - - y_128[3] = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im] - - - - - - x1_128+=4; - x2_128+=4; - y_128 +=4; - } - - - _mm_empty(); - _m_empty(); - - return(0); -} - -//__attribute__ ((force_align_arg_pointer)) -int mult_cpx_vector_norep(int16_t *x1, - int16_t *x2, - int16_t *y, - uint32_t N, - int output_shift) -{ - // Multiply elementwise two complex vectors of N elements with normal formatted output - // x1 - input 1 in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)| - // We assume x1 with a dinamic of 15 bit maximum - // - // x2 - input 2 in the format |Re0 -Im0 Im0 Re0|,......,|Re(N-1) -Im(N-1) Im(N-1) Re(N-1)| - // We assume x2 with a dinamic of 14 bit maximum - /// - // y - output in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)| - // - // N - the size f the vectors (this function does N cpx mpy). WARNING: N>=4; - // - // log2_amp - increase the output amplitude by a factor 2^log2_amp (default is 0) - // WARNING: log2_amp>0 can cause overflow!! - - uint32_t i; // loop counter - - //register __m128i m0,m1,m2,m3; - -#ifdef DEBUG_CMULT - __m128i temp; - int *tempd; - int16_t *temps; -#endif //DEBUG_CMULT + x1_128 = (simd_q15_t *)&x1[0]; + x2_128 = (simd_q15_t *)&x2[0]; + y_128 = (simd_q15_t *)&y[0]; - __m128i *x1_128; - __m128i *x2_128; - __m128i *y_128; - - //__m128i temp; - - - shift = _mm_cvtsi32_si128(output_shift); - x1_128 = (__m128i *)&x1[0]; - x2_128 = (__m128i *)&x2[0]; - y_128 = (__m128i *)&y[0]; - -#ifndef USER_MODE - //debug_msg("mult_cpx_vector_norep: x1 %p, x2 %p, y %p, shift %d\n",x1,x2,y,output_shift); -#endif - // we compute 4 cpx multiply for each loop - for(i=0; i<(N>>3); i++) { - - -#ifdef DEBUG_CMULT - printf("i=%d\n",i); - temps = (int16_t *)x1_128; - printf("x1 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - temps = (int16_t *)x2_128; - printf("x2 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); -#endif - - - m0 = _mm_madd_epi16(x1_128[0],x2_128[0]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - - -#ifdef DEBUG_CMULT - temp = m0; - - tempd = &temp; - printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); -#endif //DEBUG_CMULT - - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - -#ifdef DEBUG_CMULT - temp = m0; - - tempd = (int *)&temp; - printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); -#endif //DEBUG_CMULT - - - m1 = m0; - - - - m0 = _mm_madd_epi16(x1_128[1],x2_128[1]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - -#ifdef DEBUG_CMULT - printf("i=%d\n",i); - temps = (int16_t *)&x1_128[1]; - printf("x1 : 
%d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]);
- temps = (int16_t *)&x2_128[1];
- printf("x2 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]);
+ for(i=0; i<(N>>2); i++) {
+#if defined(__x86_64__) || defined(__i386__)
+ tmp_re = _mm_madd_epi16(*x1_128,*x2_128); // Re(x1)Re(x2)+Im(x1)Im(x2) = Re(conj(x1)*x2)
+ tmp_im = _mm_shufflelo_epi16(*x1_128,_MM_SHUFFLE(2,3,0,1));
+ tmp_im = _mm_shufflehi_epi16(tmp_im,_MM_SHUFFLE(2,3,0,1));
+ tmp_im = _mm_sign_epi16(tmp_im,*(__m128i*)&conjug[0]); // [-Im(x1) Re(x1) ...]
+ tmp_im = _mm_madd_epi16(tmp_im,*x2_128); // Re(x1)Im(x2)-Im(x1)Re(x2) = Im(conj(x1)*x2)
+ tmp_re = _mm_srai_epi32(tmp_re,output_shift);
+ tmp_im = _mm_srai_epi32(tmp_im,output_shift);
+ tmpy0 = _mm_unpacklo_epi32(tmp_re,tmp_im);
+ tmpy1 = _mm_unpackhi_epi32(tmp_re,tmp_im);
+ *y_128 = _mm_packs_epi32(tmpy0,tmpy1);
+#elif defined(__arm__)
+
+ tmp_re = vmull_s16(((simdshort_q15_t *)x1_128)[0], ((simdshort_q15_t*)x2_128)[0]);
+ //tmp_re = [Re(x1[0])Re(x2[0]) Im(x1[0])Im(x2[0]) Re(x1[1])Re(x2[1]) Im(x1[1])Im(x2[1])]
+ tmp_re1 = vmull_s16(((simdshort_q15_t *)x1_128)[1], ((simdshort_q15_t*)x2_128)[1]);
+ //tmp_re1 = [Re(x1[2])Re(x2[2]) Im(x1[2])Im(x2[2]) Re(x1[3])Re(x2[3]) Im(x1[3])Im(x2[3])]
+ tmp_re = vcombine_s32(vpadd_s32(vget_low_s32(tmp_re),vget_high_s32(tmp_re)),
+ vpadd_s32(vget_low_s32(tmp_re1),vget_high_s32(tmp_re1)));
+ //tmp_re = [Re(x1[0])Re(x2[0])+Im(x1[0])Im(x2[0]) ... Re(x1[3])Re(x2[3])+Im(x1[3])Im(x2[3])] = Re(conj(x1)*x2)
+
+ tmp_im = vmull_s16(vrev32_s16(vmul_s16(((simdshort_q15_t*)x2_128)[0],*(simdshort_q15_t*)conjug)), ((simdshort_q15_t*)x1_128)[0]);
+ //tmp_im = [Re(x1[0])Im(x2[0]) -Im(x1[0])Re(x2[0]) Re(x1[1])Im(x2[1]) -Im(x1[1])Re(x2[1])]
+ tmp_im1 = vmull_s16(vrev32_s16(vmul_s16(((simdshort_q15_t*)x2_128)[1],*(simdshort_q15_t*)conjug)), ((simdshort_q15_t*)x1_128)[1]);
+ //tmp_im1 = [Re(x1[2])Im(x2[2]) -Im(x1[2])Re(x2[2]) Re(x1[3])Im(x2[3]) -Im(x1[3])Re(x2[3])]
+ tmp_im = vcombine_s32(vpadd_s32(vget_low_s32(tmp_im),vget_high_s32(tmp_im)),
+ vpadd_s32(vget_low_s32(tmp_im1),vget_high_s32(tmp_im1)));
+ //tmp_im = [Re(x1[0])Im(x2[0])-Im(x1[0])Re(x2[0]) ... Re(x1[3])Im(x2[3])-Im(x1[3])Re(x2[3])] = Im(conj(x1)*x2)
+
+ tmp_re = vqshlq_s32(tmp_re,shift);
+ tmp_im = vqshlq_s32(tmp_im,shift);
+ tmpy = vzip_s16(vmovn_s32(tmp_re),vmovn_s32(tmp_im));
+ *y_128 = vcombine_s16(tmpy.val[0],tmpy.val[1]);
#endif
//DEBUG_CMULT - -#ifdef DEBUG_CMULT - temp = m0; - tempd = (int *)&temp; - printf("m0[2] : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); -#endif //DEBUG_CMULT - - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - -#ifdef DEBUG_CMULT - temp = m0; - tempd = (int *)&temp; - printf("m0[2] : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); -#endif //DEBUG_CMULT - - - m1 = m0; - // m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im] - - m0 = _mm_madd_epi16(x1_128[3],x2_128[3]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - -#ifdef DEBUG_CMULT - printf("i=%d\n",i); - temps = (int16_t *)&x1_128[3]; - printf("x1 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - temps = (int16_t *)&x2_128[3]; - printf("x2 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - - temp = m0; - tempd = (int *)&temp; - printf("m0[3] : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); -#endif //DEBUG_CMULT - - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - -#ifdef DEBUG_CMULT - temp = m0; - tempd = (int *)&temp; - printf("m0[3] : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); -#endif //DEBUG_CMULT - - - m2 = m0; - // m2 = _mm_packs_epi32(m2,m0); // 1- pack in a 128 bit register [re im re im] - - y_128[1] = _mm_packs_epi32(m1,m2); // 1- pack in a 128 bit register [re im re im] - - - - - - x1_128+=4; - x2_128+=4; - y_128 +=2; + x1_128++; + x2_128++; + y_128++; } @@ -519,1259 +130,3 @@ int mult_cpx_vector_norep(int16_t *x1, return(0); } - - -int mult_cpx_vector_norep_unprepared_conjx2(int16_t *x1, - int16_t *x2, - int16_t *y, - uint32_t N, - int output_shift) -{ - // Multiply elementwise two complex vectors of N elements with normal formatted output, conjugate x1 - // x1 - input 1 in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)| - // We assume x1 with a dinamic of 15 bit maximum - // - // x2 - input 2 in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) -Im(N-1) Im(N-1) Re(N-1)| - // We assume x2 with a dinamic of 14 bit maximum - /// - // y - output in the format |Re0 Im0 Re0 Im0|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)| - // - // N - the size f the vectors (this function does N cpx mpy). WARNING: N>=4; - // - // log2_amp - increase the output amplitude by a factor 2^log2_amp (default is 0) - // WARNING: log2_amp>0 can cause overflow!! 
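/* Editor's note -- illustrative sketch, not part of the patch: the conjugate
 * variants dropped in this hunk appear to be subsumed by the single
 * mult_cpx_conj_vector() introduced above, which computes y = conj(x1)*x2 per
 * sample in the plain |Re Im| layout. A scalar model, including the int16
 * saturation performed by _mm_packs_epi32 on the SSE side (names
 * illustrative): */
#include <stdint.h>

static int16_t sat_q15(int32_t v)
{
  /* clamp to the int16 range, as the saturating pack does */
  return (int16_t)(v > 32767 ? 32767 : (v < -32768 ? -32768 : v));
}

static void mult_cpx_conj_scalar_ref(const int16_t *x1, const int16_t *x2,
                                     int16_t *y, uint32_t N, int output_shift)
{
  for (uint32_t i = 0; i < N; i++) {
    int32_t re = x1[2*i]*x2[2*i]   + x1[2*i+1]*x2[2*i+1]; /* Re1Re2+Im1Im2 */
    int32_t im = x1[2*i]*x2[2*i+1] - x1[2*i+1]*x2[2*i];   /* Re1Im2-Im1Re2 */
    y[2*i]   = sat_q15(re >> output_shift);
    y[2*i+1] = sat_q15(im >> output_shift);
  }
}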
- - uint32_t i; // loop counter - - //register __m128i m0,m1,m2,m3; - - short conj_x2s[8] __attribute__((aligned(16))) = {1,1,-1,1,1,1,-1,1}; - __m128i *conj_x2 = (__m128i *)&conj_x2s[0]; - -#ifdef DEBUG_CMULT - __m128i temp; - int *tempd; - int16_t *temps; -#endif //DEBUG_CMULT - - - __m128i *x1_128; - __m128i *x2_128; - __m128i *y_128; - - __m128i shuf_x2; - - - shift = _mm_cvtsi32_si128(output_shift); - x1_128 = (__m128i *)&x1[0]; - x2_128 = (__m128i *)&x2[0]; - y_128 = (__m128i *)&y[0]; - - // we compute 4 cpx multiply for each loop - for(i=0; i<(N>>3); i++) { - - -#ifdef DEBUG_CMULT - printf("**i=%d\n",i); - temps = (int16_t *)x1_128; - printf("x1 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - temps = (int16_t *)x2_128; - printf("x2 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); -#endif - - - shuf_x2 = _mm_shufflelo_epi16(x2_128[0],_MM_SHUFFLE(2,3,1,0)); - shuf_x2 = _mm_shufflehi_epi16(shuf_x2,_MM_SHUFFLE(2,3,1,0)); - shuf_x2 = _mm_sign_epi16(shuf_x2,*conj_x2); -#ifdef DEBUG_CMULT - - temps = &shuf_x2; - printf("shuf_x2 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); -#endif //DEBUG_CMULT - - m0 = _mm_madd_epi16(x1_128[0],shuf_x2); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - - -#ifdef DEBUG_CMULT - temp = m0; - - tempd = &temp; - printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); -#endif //DEBUG_CMULT - - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - -#ifdef DEBUG_CMULT - temp = m0; - - tempd = (int *)&temp; - printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); -#endif //DEBUG_CMULT - - - m1 = m0; - - - - shuf_x2 = _mm_shufflelo_epi16(x2_128[1],_MM_SHUFFLE(2,3,1,0)); - shuf_x2 = _mm_shufflehi_epi16(shuf_x2,_MM_SHUFFLE(2,3,1,0)); - shuf_x2 = _mm_sign_epi16(shuf_x2,*conj_x2); - m0 = _mm_madd_epi16(x1_128[1],shuf_x2); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - -#ifdef DEBUG_CMULT - printf("i=%d\n",i); - temps = (int16_t *)&x1_128[1]; - printf("x1 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - temps = (int16_t *)&x2_128[1]; - printf("x2 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); -#endif - -#ifdef DEBUG_CMULT - temp = m0; - tempd = (int *)&temp; - printf("m0[1] : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); -#endif //DEBUG_CMULT - - - - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - -#ifdef DEBUG_CMULT - temp = m0; - tempd = (int *)&temp; - printf("m0[1] : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); -#endif //DEBUG_CMULT - - - - m2 = m0; - // m2 = _mm_packs_epi32(m2,m0); // 1- pack in a 128 bit register [re im re im] - - // print_shorts(m2,"m2"); - - y_128[0] = _mm_packs_epi32(m1,m2); // 1- pack in a 128 bit register [re im re im] - - - shuf_x2 = _mm_shufflelo_epi16(x2_128[2],_MM_SHUFFLE(2,3,1,0)); - shuf_x2 = _mm_shufflehi_epi16(shuf_x2,_MM_SHUFFLE(2,3,1,0)); - shuf_x2 = _mm_sign_epi16(shuf_x2,*conj_x2); - m0 = _mm_madd_epi16(x1_128[2],shuf_x2); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - -#ifdef DEBUG_CMULT - printf("i=%d\n",i); - temps = (int16_t *)&x1_128[2]; - printf("x1 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - temps = (int16_t *)&x2_128[2]; - 
printf("x2 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); -#endif //DEBUG_CMULT - -#ifdef DEBUG_CMULT - temp = m0; - tempd = (int *)&temp; - printf("m0[2] : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); -#endif //DEBUG_CMULT - - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - -#ifdef DEBUG_CMULT - temp = m0; - tempd = (int *)&temp; - printf("m0[2] : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); -#endif //DEBUG_CMULT - - - m1 = m0; - // m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im] - shuf_x2 = _mm_shufflelo_epi16(x2_128[3],_MM_SHUFFLE(2,3,1,0)); - shuf_x2 = _mm_shufflehi_epi16(shuf_x2,_MM_SHUFFLE(2,3,1,0)); - shuf_x2 = _mm_sign_epi16(shuf_x2,*conj_x2); - - m0 = _mm_madd_epi16(x1_128[3],shuf_x2); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - -#ifdef DEBUG_CMULT - printf("i=%d\n",i); - temps = (int16_t *)&x1_128[3]; - printf("x1 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - temps = (int16_t *)&x2_128[3]; - printf("x2 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - - temp = m0; - tempd = (int *)&temp; - printf("m0[3] : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); -#endif //DEBUG_CMULT - - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - -#ifdef DEBUG_CMULT - temp = m0; - tempd = (int *)&temp; - printf("m0[3] : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); -#endif //DEBUG_CMULT - - - m2 = m0; - // m2 = _mm_packs_epi32(m2,m0); // 1- pack in a 128 bit register [re im re im] - - y_128[1] = _mm_packs_epi32(m1,m2); // 1- pack in a 128 bit register [re im re im] - - - - - - x1_128+=4; - x2_128+=4; - y_128 +=2; - } - - - _mm_empty(); - _m_empty(); - - return(0); -} - -static __m128i norep_tmp32 __attribute__ ((aligned(16))); - -//__attribute__ ((force_align_arg_pointer)) -int mult_cpx_vector_norep2(int16_t *x1, - int16_t *x2, - int16_t *y, - uint32_t N, - int output_shift) -{ - // Multiply elementwise two complex vectors of N elements with normal formatted output and no loop unrollin - // x1 - input 1 in the format |Re0 Im0 Re0 Im0 Re1 Im1 Re1 Im1|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)| - // We assume x1 with a dinamic of 15 bit maximum - // - // x2 - input 2 in the format |Re0 -Im0 Im0 Re0|,......,|Re(N-1) -Im(N-1) Im(N-1) Re(N-1)| - // We assume x2 with a dinamic of 14 bit maximum - /// - // y - output in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)| - // - // N - the size f the vectors (this function does N cpx mpy). WARNING: N>=2; - // - // log2_amp - increase the output amplitude by a factor 2^log2_amp (default is 0) - // WARNING: log2_amp>0 can cause overflow!! 
- - uint32_t i; // loop counter - - //register __m128i m0,m1,m2,m3; - - /* - #ifdef USER_MODE - __m128i temp; - int *tempd; - int16_t *temps; - #endif - */ - - __m128i *x1_128; - __m128i *x2_128; - int *y_32 = (int*)y; - - // __m128i temp; - - - shift = _mm_cvtsi32_si128(output_shift); - x1_128 = (__m128i *)&x1[0]; - x2_128 = (__m128i *)&x2[0]; - -#ifndef USER_MODE - //debug_msg("mult_cpx_vector_norep2: x1 %p, x2 %p, y %p, shift %d, N %d\n",x1,x2,y,output_shift,N); -#endif - - // we compute 2 cpx multiply for each loop - for(i=0; i<(N>>1); i++) { - /* - #ifdef USER_MODE - printf("i=%d\n",i); - temps = (int16_t *)x1_128; - printf("x1 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - temps = (int16_t *)x2_128; - printf("x2 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - #endif - */ - - m0 = _mm_madd_epi16(x1_128[0],x2_128[0]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - - /* - temp = m0; - - tempd = &temp; - printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - */ - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - /* - temp = m0; - - tempd = (int *)&temp; - printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - */ - - norep_tmp32 = _mm_packs_epi32(m0,m0); // Re0 Im0 Re1 Im1 Re0 Im0 Re1 Im1 - - /* - #ifdef USER_MODE - printf("tmp : %d,%d,%d,%d\n",((int16_t *)&tmp32)[0],((int16_t *)&tmp32)[1],((int16_t *)&tmp32)[2],((int16_t *)&tmp32)[3]); - #endif - */ - - y_32[0] = ((int *)&norep_tmp32)[0]; // 1- pack in a 128 bit register [re im re im] - y_32[1] = ((int *)&norep_tmp32)[1]; // 1- pack in a 128 bit register [re im re im] - - x1_128+=1; - x2_128+=1; - y_32 +=2; - } - - - _mm_empty(); - _m_empty(); - - return(0); -} - - -int mult_cpx_vector_norep_conj(int16_t *x1, - int16_t *x2, - int16_t *y, - uint32_t N, - int output_shift) -{ - // Multiply elementwise two complex vectors of N elements after conjugating and shuffling x1 - // x1 - input 1 in the format |Re0 -Im0 Im0 Re0|,......,|Re(N-1) -Im(N-1) Im(N-1) Re(N-1)| - // We assume x1 with a dynamic of 15 bit maximum - // - // x2 - input 2 in the format |Re0 -Im0 Im0 Re0|,......,|Re(N-1) -Im(N-1) Im(N-1) Re(N-1)| - // We assume x2 with a dynamic of 14 bit maximum - /// - // y - output in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)| - // - // N - the size f the vectors (this function does N cpx mpy). WARNING: N>=4; - // - // log2_amp - increase the output amplitude by a factor 2^log2_amp (default is 0) - // WARNING: log2_amp>0 can cause overflow!! 
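/* Editor's note -- a worked example, not part of the patch: in every kernel
 * here output_shift undoes fixed-point growth. A Q1.15 x Q1.15 product is
 * Q2.30, so an arithmetic right shift by 15 returns to Q1.15 (values
 * illustrative): */
#include <stdint.h>
#include <assert.h>

static void q15_shift_check(void)
{
  int16_t a = 16384, b = 16384;   /* both 0.5 in Q1.15 */
  int32_t p = (int32_t)a * b;     /* 0.25 in Q2.30 (= 268435456) */
  assert((p >> 15) == 8192);      /* 0.25 back in Q1.15 */
}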
- - uint32_t i; // loop counter - - //register __m128i m0,m1,m2,m4; - - /* - #ifdef USER_MODE - int16_t *temps; - int *tempw; - #endif - */ - - __m128i *x1_128; - __m128i *x2_128; - __m128i *y_128; - - // __m128i temp; - - - shift = _mm_cvtsi32_si128(output_shift); - x1_128 = (__m128i *)&x1[0]; - x2_128 = (__m128i *)&x2[0]; - y_128 = (__m128i *)&y[0]; - - // printf("mult_cpx_vector_norep: shift %d\n",output_shift); - - // we compute 4 cpx multiply for each loop - for(i=0; i<(N>>3); i++) { - - /* - #ifdef USER_MODE - printf("i=%d\n",i); - temps = (int16_t *)x1_128; - printf("x1 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - temps = (int16_t *)x2_128; - printf("x2 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - #endif - */ - - m4 = _mm_shufflelo_epi16(x1_128[0],_MM_SHUFFLE(1,0,1,0)); - m4 = _mm_shufflehi_epi16(m4,_MM_SHUFFLE(1,0,1,0)); - /* - temps = (int16_t *)&m4; - printf("x2 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - */ - m0 = _mm_madd_epi16(m4,x2_128[0]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - - // temp = m0; - - // tempd = &temp; - // printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - /* - tempw = (int *)&m0; - printf("m0[0] : %d,%d,%d,%d\n",tempw[0],tempw[1],tempw[2],tempw[3]); - */ - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - /* - tempw = (int *)&m0; - printf("m0[0] : %d,%d,%d,%d\n",tempw[0],tempw[1],tempw[2],tempw[3]); - */ - // temp = m0; - - // tempd = (int *)&temp; - // printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - - m1 = m0; - - - - m4 = _mm_shufflelo_epi16(x1_128[1],_MM_SHUFFLE(1,0,1,0)); - m4 = _mm_shufflehi_epi16(m4,_MM_SHUFFLE(1,0,1,0)); - m0 = _mm_madd_epi16(m4,x2_128[1]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - - /* - tempw = (int *)&m0; - printf("m0[1] : %d,%d,%d,%d\n",tempw[0],tempw[1],tempw[2],tempw[3]); - */ - - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - /* - tempw = (int *)&m0; - printf("m0[1] : %d,%d,%d,%d\n",tempw[0],tempw[1],tempw[2],tempw[3]); - */ - - - m2 = m0; - // m2 = _mm_packs_epi32(m2,m0); // 1- pack in a 128 bit register [re im re im] - - // print_shorts(m2,"m2"); - - y_128[0] = _mm_packs_epi32(m1,m2); // 1- pack in a 128 bit register [re im re im] - - - - m4 = _mm_shufflelo_epi16(x1_128[2],_MM_SHUFFLE(1,0,1,0)); - m4 = _mm_shufflehi_epi16(m4,_MM_SHUFFLE(1,0,1,0)); - m0 = _mm_madd_epi16(m4,x2_128[2]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - /* - tempw = (int *)&m0; - printf("m0[2] : %d,%d,%d,%d\n",tempw[0],tempw[1],tempw[2],tempw[3]); - */ - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - /* - tempw = (int *)&m0; - printf("m0[2] : %d,%d,%d,%d\n",tempw[0],tempw[1],tempw[2],tempw[3]); - */ - - m1 = m0; - // m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im] - - - m4 = _mm_shufflelo_epi16(x1_128[3],_MM_SHUFFLE(1,0,1,0)); - m4 = _mm_shufflehi_epi16(m4,_MM_SHUFFLE(1,0,1,0)); - m0 = _mm_madd_epi16(m4,x2_128[3]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - /* - tempw = (int *)&m0; - printf("m0[3] : %d,%d,%d,%d\n",tempw[0],tempw[1],tempw[2],tempw[3]); - */ - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - /* - tempw = (int *)&m0; 
- printf("m0[3] : %d,%d,%d,%d\n",tempw[0],tempw[1],tempw[2],tempw[3]); - */ - - m2 = m0; - // m2 = _mm_packs_epi32(m2,m0); // 1- pack in a 128 bit register [re im re im] - - y_128[1] = _mm_packs_epi32(m1,m2); // 1- pack in a 128 bit register [re im re im] - - - - - - x1_128+=4; - x2_128+=4; - y_128 +=2; - } - - - _mm_empty(); - _m_empty(); - - return(0); -} - - -int mult_cpx_vector_norep_conj2(int16_t *x1, - int16_t *x2, - int16_t *y, - uint32_t N, - int output_shift) -{ - // Multiply elementwise two complex vectors of N elements after conjugating and shuffling x1 - // x1 - input 1 in the format |Re0 -Im0 Im0 Re0|,......,|Re(N-1) -Im(N-1) Im(N-1) Re(N-1)| - // We assume x1 with a dynamic of 15 bit maximum - // - // x2 - input 2 in the format |Re0 -Im0 Im0 Re0|,......,|Re(N-1) -Im(N-1) Im(N-1) Re(N-1)| - // We assume x2 with a dynamic of 14 bit maximum - /// - // y - output in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)| - // - // N - the size f the vectors (this function does N cpx mpy). WARNING: N>=2; - // - // log2_amp - increase the output amplitude by a factor 2^log2_amp (default is 0) - // WARNING: log2_amp>0 can cause overflow!! - - uint32_t i; // loop counter - - //register __m128i m0,m1,m2,m4; - __m128i tmp32; - - - - - - - - __m128i *x1_128; - __m128i *x2_128; - int *y_32 = (int *)&y[0]; - - // __m128i temp,*tempd; - - - shift = _mm_cvtsi32_si128(output_shift); - x1_128 = (__m128i *)&x1[0]; - x2_128 = (__m128i *)&x2[0]; - - - // printf("mult_cpx_vector_norep: shift %d\n",output_shift); - - // we compute 4 cpx multiply for each loop - for(i=0; i<(N>>1); i++) { - - /* - //#ifdef USER_MODE - printf("i=%d\n",i); - temps = (int16_t *)x1_128; - printf("x1 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - temps = (int16_t *)x2_128; - printf("x2 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - //#endif - */ - - // conjuaget and shuffle x1 - m4 = _mm_shufflelo_epi16(x1_128[0],_MM_SHUFFLE(1,0,1,0)); - m4 = _mm_shufflehi_epi16(m4,_MM_SHUFFLE(1,0,1,0)); - /* - temps = (int16_t *)&m4; - printf("x2 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - */ - m0 = _mm_madd_epi16(m4,x2_128[0]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - - // tempw = (int *)&m0; - // printf("m0[1] : %d,%d,%d,%d\n",tempw[0],tempw[1],tempw[2],tempw[3]); - - - - - tmp32 = _mm_packs_epi32(m0,m0); // Re0 Im0 Re1 Im1 Re0 Im0 Re1 Im1 - - - - // printf("tmp : %d,%d,%d,%d\n",((int16_t *)&tmp32)[0],((int16_t *)&tmp32)[1],((int16_t *)&tmp32)[2],((int16_t *)&tmp32)[3]); - - y_32[0] = ((int *)&tmp32)[0]; // 1- pack in a 128 bit register [re im re im] - y_32[1] = ((int *)&tmp32)[1]; // 1- pack in a 128 bit register [re im re im] - - - - x1_128+=1; - x2_128+=1; - y_32 +=2; - - - } - - - _mm_empty(); - _m_empty(); - - return(0); -} - - -int mult_cpx_vector2(int16_t *x1, - int16_t *x2, - int16_t *y, - uint32_t N, - int output_shift) -{ - // Multiply elementwise two complex vectors of N elements - // x1 - input 1 in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)| - // We assume x1 with a dinamic of 15 bit maximum - // - // x2 - input 2 in the format |Re0 -Im0 Im0 Re0|,......,|Re(N-1) -Im(N-1) Im(N-1) Re(N-1)| - // We assume x2 with a dinamic of 14 bit maximum - // - // y - output in the format |Re0 Im0 Re0 
Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)| - // - // N - the size f the vectors (this function does N cpx mpy. WARNING: N must be a multiple of 2; - // - // log2_amp - increase the output amplitude by a factor 2^log2_amp (default is 0) - // WARNING: log2_amp>0 can cause overflow!! - - uint32_t i; // loop counter - - //register __m128i m0,m1; - - /* - #ifdef USER_MODE - int16_t *temps; - int *tempd; - #endif - */ - - __m128i *x1_128; - __m128i *x2_128; - __m128i *y_128; - - // __m128i temp; - - - shift = _mm_cvtsi32_si128(output_shift); - x1_128 = (__m128i *)&x1[0]; - x2_128 = (__m128i *)&x2[0]; - y_128 = (__m128i *)&y[0]; - - - - for(i=0; i<(N>>1); i++) { - - /* #ifdef USER_MODE */ - /* printf("i=%d\n",i); */ - /* temps = (int16_t *)x1_128; */ - /* printf("x1 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); */ - /* temps = (int16_t *)x2_128; */ - /* printf("x2 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); */ - /* #endif */ - - m0 = _mm_madd_epi16(x1_128[i],x2_128[i]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - - // temp = m0; - - // tempd = &temp; - // printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - // temp = m0; - - // tempd = (int *)&temp; - // printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - - m1 = m0; - m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im] - y_128[i] = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im] - - } - - - _mm_empty(); - _m_empty(); - - return(0); -} - -int mult_cpx_vector_add(int16_t *x1, - int16_t *x2, - int16_t *y, - uint32_t N, - int output_shift) -{ - // Multiply elementwise two complex vectors of N elements and add it to y - // x1 - input 1 in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)| - // We assume x1 with a dinamic of 15 bit maximum - // - // x2 - input 2 in the format |Re0 -Im0 Im0 Re0|,......,|Re(N-1) -Im(N-1) Im(N-1) Re(N-1)| - // We assume x2 with a dinamic of 14 bit maximum - // - // y - output in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)| - // - // N - the size f the vectors (this function does N cpx mpy. WARNING: N>=4; - // - // log2_amp - increase the output amplitude by a factor 2^log2_amp (default is 0) - // WARNING: log2_amp>0 can cause overflow!! 
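/* Editor's note -- illustrative sketch, not part of the patch: the
 * multiply-accumulate pattern being removed here survives in the cmac()
 * helpers that this patch adds to lte_dfts.c for both SSE and NEON. A scalar
 * model of an elementwise complex MAC in the plain |Re Im| layout, with
 * 32-bit accumulators as in mult_cpx_vector_add32 (names illustrative): */
#include <stdint.h>

static void cmac_scalar_ref(const int16_t *a, const int16_t *b,
                            int32_t *y_re, int32_t *y_im, uint32_t N)
{
  for (uint32_t i = 0; i < N; i++) {
    y_re[i] += a[2*i]*b[2*i]   - a[2*i+1]*b[2*i+1]; /* y[i] += Re(a[i]*b[i]) */
    y_im[i] += a[2*i]*b[2*i+1] + a[2*i+1]*b[2*i];   /* y[i] += Im(a[i]*b[i]) */
  }
}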
- - uint32_t i; // loop counter - - //register __m128i m0,m1; - - /* - #ifdef USER_MODE - int16_t *temps; - int *tempd; - __m128i temp; - #endif - */ - - __m128i *x1_128; - __m128i *x2_128; - __m128i *y_128; - - - shift = _mm_cvtsi32_si128(output_shift); - x1_128 = (__m128i *)&x1[0]; - x2_128 = (__m128i *)&x2[0]; - y_128 = (__m128i *)&y[0]; - - - // we compute 4 cpx multiply for each loop - for(i=0; i<(N>>3); i++) { - //unroll 0 - - /* - #ifdef USER_MODE - printf("i=%d\n",i); - temps = (int16_t *)x1_128; - printf("x1 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - temps = (int16_t *)x2_128; - printf("x2 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - #endif - */ - - m0 = _mm_madd_epi16(x1_128[0],x2_128[0]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - - // temp = m0; - - // tempd = &temp; - // printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - // temp = m0; - - // tempd = (int *)&temp; - // printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - - - m0 = _mm_packs_epi32(m0,m0); // 1- pack in a 128 bit register [re im re im] - m0 = _mm_unpacklo_epi32(m0,m0); // 1- pack in a 128 bit register [re im re im] - - y_128[0] = _mm_add_epi16(m0,y_128[0]); - - - // temps = (int16_t *)&y_128[0]; - // printf("y0 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - - - //unroll 1 - m0 = _mm_madd_epi16(x1_128[1],x2_128[1]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - - - - m1 = m0; - m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im] - m1 = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im] - - y_128[1] = _mm_add_epi16(m1,y_128[1]); - - - //unroll 2 - m0 = _mm_madd_epi16(x1_128[2],x2_128[2]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - - - - m1 = m0; - m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im] - m1 = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im] - - y_128[2] = _mm_add_epi16(m1,y_128[2]); - - - - //unroll 3 - m0 = _mm_madd_epi16(x1_128[3],x2_128[3]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - - m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude - - - - - m1 = m0; - m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im] - m1 = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im] - - y_128[3] = _mm_add_epi16(m1,y_128[3]); - - - x1_128+=4; - x2_128+=4; - y_128 +=4; - } - - - _mm_empty(); - _m_empty(); - - return(0); -} - -int mult_cpx_vector_add32(int16_t *x1, - int16_t *x2, - int16_t *y, - uint32_t N) - -{ - // Multiply elementwise two complex vectors of N elements and add it to y - // x1 - input 1 in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)| - // We assume x1 with a dinamic of 15 bit maximum - // - // x2 - input 2 in the format |Re0 -Im0 Im0 Re0|,......,|Re(N-1) -Im(N-1) Im(N-1) Re(N-1)| - // We assume x2 with a dinamic of 14 bit maximum - // - // y - output in the format |Re0 (32bit) Im0 (32bit) |,......,|Re(N-1) (32bit) Im(N-1) 
(32bit)| - // - // N - the size f the vectors (this function does N cpx mpy. WARNING: N>=4; - // - - uint32_t i; // loop counter - //register __m128i m0; - - /* - #ifdef USER_MODE - int16_t *temps; - int *tempd; - __m128i temp; - #endif - */ - - __m128i *x1_128; - __m128i *x2_128; - __m128i *y_128; - - - x1_128 = (__m128i *)&x1[0]; - x2_128 = (__m128i *)&x2[0]; - y_128 = (__m128i *)&y[0]; - - - // we compute 4 cpx multiply for each loop - for(i=0; i<(N>>3); i++) { - //unroll 0 - - - m0 = _mm_madd_epi16(x1_128[0],x2_128[0]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0 - y_128[0] = _mm_add_epi32(y_128[0],m0); - - /* - #ifdef USER_MODE - printf("i=%d\n",i); - temps = (int16_t *)x1_128; - printf("x1 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - temps = (int16_t *)x2_128; - printf("x2 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - tempd = (int *)y_128; - printf("y : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - #endif - */ - - - m0 = _mm_madd_epi16(x1_128[1],x2_128[1]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - y_128[1] = _mm_add_epi32(y_128[1],m0); - - m0 = _mm_madd_epi16(x1_128[2],x2_128[2]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - y_128[2] = _mm_add_epi32(y_128[2],m0); - - m0 = _mm_madd_epi16(x1_128[3],x2_128[3]); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] - y_128[3] = _mm_add_epi32(y_128[3],m0); - - - x1_128+=4; - x2_128+=4; - y_128 +=4; - } - - - _mm_empty(); - _m_empty(); - - return(0); -} - -int mult_vector32(int16_t *x1, - int16_t *x2, - int16_t *y, - uint32_t N) - -{ - // Multiply elementwise two real vectors of N elements y = real(x1).*real(x2) - // x1 - input 1 in the format |Re(0) xxx Re(1) xxx|,......,|Re(N-2) xxx Re(N-1) xxx| - // We assume x1 with a dinamic of 31 bit maximum - // - // x1 - input 2 in the format |Re(0) xxx Re(2) xxx|,......,|Re(N-2) xxx Re(N-1) xxx| - // We assume x2 with a dinamic of 31 bit maximum - // - // y - output in the format |Re0 (64bit) |,......,|Re(N-1) (64bit)| - // - // N - the size f the vectors (this function does N cpx mpy. WARNING: N>=4; - // - - uint32_t i; // loop counter - - __m128i *x1_128; - __m128i *x2_128; - __m128i *y_128; - - - x1_128 = (__m128i *)&x1[0]; - x2_128 = (__m128i *)&x2[0]; - y_128 = (__m128i *)&y[0]; - - - // we compute 4 cpx multiply for each loop - for(i=0; i<(N>>3); i++) { - y_128[0] = _mm_mul_epu32(x1_128[0],x2_128[0]); - - /* - #ifdef USER_MODE - printf("i=%d\n",i); - tempd = (int *)x1_128; - printf("x1 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - tempd = (int *)x2_128; - printf("x2 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - // tempd = (int *)y_128; - // printf("y : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - templ = (long long *)y_128; - printf("y : %lld,%lld\n",templ[0],templ[1]); - #endif - */ - - y_128[1] = _mm_mul_epu32(x1_128[1],x2_128[1]); - y_128[2] = _mm_mul_epu32(x1_128[2],x2_128[2]); - y_128[3] = _mm_mul_epu32(x1_128[3],x2_128[3]); - - - x1_128+=4; - x2_128+=4; - y_128 +=4; - } - - - _mm_empty(); - _m_empty(); - - return(0); -} - - -/* -The following code does not work, because there is no signed 32bit multiplication intrinsic. 
It only works for unsigned values - -int mult_cpx_vector32_conj(int16_t *x, - int16_t *y, - uint32_t N) - -{ - // elementwise multiplication of two complex vectors of N elements such that y = x * conj(x) = real(x)*real(x)+imag(x)*imag(x) - // x - input in the format |Re(0) Im(0) Re(1) Im(1) |,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)| - // We assume x with a dinamic of 31 bit maximum - // - // y - output in the format |Re0 (64bit) |,......,|Re(N-1) (64bit)| - // - // N - the size f the vectors (this function does N cpx mpy. WARNING: N>=4; - // - - uint32_t i; // loop counter - -#ifdef USER_MODE - char *tempc; - int16_t *temps; - int *tempd; - long long *templ; - __m128i temp; -#endif - - __m128i *x_128; - __m128i *y_128; - - __m128i m0,m1,m2,m3; - - x_128 = (__m128i *)&x[0]; - y_128 = (__m128i *)&y[0]; - - // we compute 4 cpx multiply for each loop - for(i=0;i<(N>>3);i++) - { - // Re(a)*Re(b) - m0 = _mm_mul_epu32(x_128[0],x_128[0]); - // Im(a)*Im(b) - m1 = _mm_shuffle_epi32(x_128[0],_MM_SHUFFLE(2,3,0,1)); - m3 = _mm_mul_epu32(m1,m1); - // Re(a)*Re(b)+Im(a)*Im(b) - y_128[0] = _mm_add_epi64(m0,m3); - -#ifdef USER_MODE - printf("i=%d\n",i); - tempd = (int *)x_128; - printf("x : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - templ = (long long *)&m0; - printf("m0 : %lld,%lld\n",templ[0],templ[1]); - tempd = (int *)&m1; - printf("m1 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - templ = (long long *)&m3; - printf("m3 : %lld,%lld\n",templ[0],templ[1]); - // tempd = (int *)y_128; - // printf("y : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]); - templ = (long long *)y_128; - printf("y : %lld,%lld\n",templ[0],templ[1]); -#endif - - // Re(a)*Re(b) - m0 = _mm_mul_epu32(x_128[1],x_128[1]); - // Im(a)*Im(b) - m1 = _mm_shuffle_epi32(x_128[1],_MM_SHUFFLE(1,0,3,2)); - m3 = _mm_mul_epu32(m1,m1); - // Re(a)*Re(b)+Im(a)*Im(b) - y_128[1] = _mm_add_epi64(m0,m3); - - - // Re(a)*Re(b) - m0 = _mm_mul_epu32(x_128[2],x_128[2]); - // Im(a)*Im(b) - m1 = _mm_shuffle_epi32(x_128[2],_MM_SHUFFLE(1,0,3,2)); - m3 = _mm_mul_epu32(m1,m1); - // Re(a)*Re(b)+Im(a)*Im(b) - y_128[2] = _mm_add_epi64(m0,m3); - - - // Re(a)*Re(b) - m0 = _mm_mul_epu32(x_128[3],x_128[3]); - // Im(a)*Im(b) - m1 = _mm_shuffle_epi32(x_128[3],_MM_SHUFFLE(1,0,3,2)); - m3 = _mm_mul_epu32(m1,m1); - // Re(a)*Re(b)+Im(a)*Im(b) - y_128[3] = _mm_add_epi64(m0,m3); - - - x_128+=4; - y_128 +=4; - } - - - _mm_empty(); - _m_empty(); - - return(0); -} -*/ - -int mult_cpx_vector32_conj(int16_t *x, - int16_t *y, - uint32_t N) - -{ - // Elementwise multiplication of two complex vectors of N elements such that y = x * conj(x) = real(x)*real(x)+imag(x)*imag(x) - // x - input in the format |Re(0) Im(0) Re(1) Im(1) |,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)| - // We assume x1 with a dinamic of 31 bit maximum - // - // y - output in the format |Re0 (64bit) |,......,|Re(N-1) (64bit)| - // - // N - the size f the vectors (this function does N cpx mpy. 
WARNING: N>=4; - // - - uint32_t i; // loop counter - - int *xi = (int*) x; - long long int *yl = (long long int*) y; - - long long int temp1,temp2; - - - for(i=0; i<N/2; i++) { - // Re(a)*Re(b) - temp1 = ((long long int) xi[0])* ((long long int) xi[0]); - // Im(a)*Im(b) - temp2 = ((long long int) xi[1])* ((long long int) xi[1]); - yl[0] = temp1+temp2; - - temp1 = ((long long int) xi[2])* ((long long int) xi[2]); - temp2 = ((long long int) xi[3])* ((long long int) xi[3]); - yl[1] = temp1+temp2; - - /* - #ifdef USER_MODE - printf("i=%d\n",i); - printf("x1 : %d,%d,%d,%d\n",x1i[0],x1i[1],x1i[2],x1i[3]); - printf("x2 : %d,%d,%d,%d\n",x2i[0],x2i[1],x2i[2],x2i[3]); - printf("temp : %lld,%lld\n",temp1,temp2); - printf("y : %lld,%lld\n",yl[0],yl[1]); - #endif - */ - - xi+=4; - yl +=2; - } - - return(0); -} - - -int shift_and_pack(int16_t *y, - uint32_t N, - int output_shift) -{ - uint32_t i; // loop counter - - //register __m128i m0,m1; - - /* - #ifdef USER_MODE - int16_t *temps; - int *tempd; - __m128i temp; - #endif - */ - - __m128i *y_128; - - - shift = _mm_cvtsi32_si128(output_shift); - y_128 = (__m128i *)&y[0]; - - - // we compute 4 cpx multiply for each loop - for(i=0; i<(N>>3); i++) { - /* - #ifdef USER_MODE - printf("i=%d\n",i); - temps = (int16_t *)x1_128; - printf("x1 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - temps = (int16_t *)x2_128; - printf("x2 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - #endif - */ - - //unroll 0 - m0 = _mm_sra_epi32(y_128[0],shift); // 1- shift right by shift in order to compensate for the input amplitude - m0 = _mm_packs_epi32(m0,m0); // 1- pack in a 128 bit register [re im re im] - y_128[0] = _mm_unpacklo_epi32(m0,m0); // 1- pack in a 128 bit register [re im re im] - - // temps = (int16_t *)&y_128[0]; - // printf("y0 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]); - - //unroll 1 - m1 = _mm_sra_epi32(y_128[1],shift); // 1- shift right by shift in order to compensate for the input amplitude - m1 = _mm_packs_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im] - y_128[1] = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im] - - //unroll 2 - m1 = _mm_sra_epi32(y_128[2],shift); // 1- shift right by shift in order to compensate for the input amplitude - m1 = _mm_packs_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im] - y_128[2] = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im] - - //unroll 3 - m1 = _mm_sra_epi32(y_128[3],shift); // 1- shift right by shift in order to compensate for the input amplitude - m1 = _mm_packs_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im] - y_128[3] = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im] - - y_128 +=4; - } - - _mm_empty(); - _m_empty(); - - return(0); -} - - -#ifdef MAIN -#define L 16 - -main () -{ - - int16_t input[256] __attribute__((aligned(16))); - int16_t input2[256] __attribute__((aligned(16))); - int16_t output[256] __attribute__((aligned(16))); - - int i; - - input[0] = 100; - input[1] = 200; - input[2] = -200; - input[3] = 100; - input[4] = 1000; - input[5] = 2000; - input[6] = -2000; - input[7] = 1000; - input[8] = 100; - input[9] = 200; - input[10] = -200; - input[11] = 100; - input[12] = 1000; - input[13] = 2000; - input[14] = -2000; - input[15] = 1000; - - input2[0] = 1; - input2[1] = 2; - input2[2] = 1; - input2[3] = 2; - input2[4] = 10; 
- input2[5] = 20; - input2[6] = 10; - input2[7] = 20; - input2[8] = 1; - input2[9] = 2; - input2[10] = 1; - input2[11] = 2; - input2[12] = 1000; - input2[13] = 2000; - input2[14] = 1000; - input2[15] = 2000; - - - mult_cpx_vector32_conj(input,output,8); - - -} - -#endif //MAIN - - -#else //EXPRESSMIMO_TARGET - -/* -int mult_cpx_vector(int16_t *x1, - int16_t *x2, - int16_t *y, - uint32_t N, - uint16_t output_shift) -{ - -} -*/ - -#endif //EXPRESSMIMO_TARGET diff --git a/openair1/PHY/TOOLS/defs.h b/openair1/PHY/TOOLS/defs.h index d9783041e4..7d8942dd72 100644 --- a/openair1/PHY/TOOLS/defs.h +++ b/openair1/PHY/TOOLS/defs.h @@ -339,14 +339,13 @@ void dft3072(int16_t *sigF,int16_t *sig); void dft24576(int16_t *sigF,int16_t *sig); -/*!\fn int rotate_cpx_vector(int16_t *x,int16_t *alpha,int16_t *y,uint32_t N,uint16_t output_shift, uint8_t format) +/*!\fn int32_t rotate_cpx_vector(int16_t *x,int16_t *alpha,int16_t *y,uint32_t N,uint16_t output_shift) This function performs componentwise multiplication of a vector with a complex scalar. -@param x Vector input (Q1.15) in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)| +@param x Vector input (Q1.15) in the format |Re0 Im0|,......,|Re(N-1) Im(N-1)| @param alpha Scalar input (Q1.15) in the format |Re0 Im0| -@param y Output (Q1.15) in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)| +@param y Output (Q1.15) in the format |Re0 Im0|,......,|Re(N-1) Im(N-1)| @param N Length of x WARNING: N>=4 @param output_shift Number of bits to shift output down to Q1.15 (should be 15 for Q1.15 inputs) WARNING: log2_amp>0 can cause overflow!! -@param format Format 0 indicates that alpha is in shuffled format during multiply (Re -Im Im Re), whereas 1 indicates that input is in this format (i.e. a matched filter) The function implemented is : \f$\mathbf{y} = \alpha\mathbf{x}\f$ */ @@ -354,49 +353,15 @@ int32_t rotate_cpx_vector(int16_t *x, int16_t *alpha, int16_t *y, uint32_t N, - uint16_t output_shift, - uint8_t format); - -/*!\fn int32_t rotate_cpx_vector2(int16_t *x,int16_t *alpha,int16_t *y,uint32_t N,uint16_t output_shift,uint8_t format) -This function performs componentwise multiplication of a vector with a complex scalar. -@param x Vector input (Q1.15) in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)| -@param alpha Scalar input (Q1.15) in the format |Re0 Im0| -@param y Output (Q1.15) in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)| -@param N Length of x WARNING: N must be multiple of 2 (the routine performs two complex multiplies per cycle) -@param output_shift Number of bits to shift output down to Q1.15 (should be 15 for Q1.15 inputs) WARNING: log2_amp>0 can cause overflow!! -@param format Format 0 indicates that alpha is in shuffled format during multiply (Re -Im Im Re), whereas 1 indicates that input is in this format (i.e. a matched filter) -The function implemented is : \f$\mathbf{y} = \alpha\mathbf{x}\f$ -*/ -int32_t rotate_cpx_vector2(int16_t *x, - int16_t *alpha, - int16_t *y, - uint32_t N, - uint16_t output_shift, - uint8_t format); - -/*!\fn int32_t rotate_cpx_vector_norep(int16_t *x,int16_t *alpha,int16_t *y,uint32_t N,uint16_t output_shift) -This function performs componentwise multiplication of a vector with a complex scalar. 
-@param x Vector input (Q1.15) in the format |Re0 Im0|,......,|Re(N-1) Im(N-1)| -@param alpha Scalar input (Q1.15) in the format |Re0 Im0| -@param y Output (Q1.15) in the format |Re0 Im0|,......,|Re(N-1) Im(N-1)| -@param N Length of x WARNING: N>=4 -@param output_shift Number of bits to shift output down to Q1.15 (should be 15 for Q1.15 inputs) WARNING: log2_amp>0 can cause overflow!! - -The function implemented is : \f$\mathbf{y} = \alpha\mathbf{x}\f$ -*/ -int32_t rotate_cpx_vector_norep(int16_t *x, - int16_t *alpha, - int16_t *y, - uint32_t N, - uint16_t output_shift); + uint16_t output_shift); /*!\fn int32_t add_cpx_vector(int16_t *x,int16_t *alpha,int16_t *y,uint32_t N) This function performs componentwise addition of a vector with a complex scalar. -@param x Vector input (Q1.15) in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)| +@param x Vector input (Q1.15) in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)| @param alpha Scalar input (Q1.15) in the format |Re0 Im0| -@param y Output (Q1.15) in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)| +@param y Output (Q1.15) in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)| @param N Length of x WARNING: N>=4 The function implemented is : \f$\mathbf{y} = \alpha + \mathbf{x}\f$ diff --git a/openair1/PHY/TOOLS/lte_dfts.c b/openair1/PHY/TOOLS/lte_dfts.c index 6a79a9bd3a..f2eb3f0e65 100644 --- a/openair1/PHY/TOOLS/lte_dfts.c +++ b/openair1/PHY/TOOLS/lte_dfts.c @@ -40,6 +40,7 @@ #include "defs.h" #else #include "time_meas.h" +#include <math.h> #define debug_msg #define ONE_OVER_SQRT2_Q15 23170 @@ -49,12 +50,15 @@ #include "PHY/sse_intrin.h" +#define print_shorts(s,x) printf("%s %d,%d,%d,%d,%d,%d,%d,%d\n",s,(x)[0],(x)[1],(x)[2],(x)[3],(x)[4],(x)[5],(x)[6],(x)[7]) +#define print_ints(s,x) printf("%s %d %d %d %d\n",s,(x)[0],(x)[1],(x)[2],(x)[3]) static int16_t conjugatedft[8] __attribute__((aligned(16))) = {-1,1,-1,1,-1,1,-1,1} ; static short reflip[8] __attribute__((aligned(16))) = {1,-1,1,-1,1,-1,1,-1}; +#if defined(__x86_64__) || defined(__i386__) static inline void cmac(__m128i a,__m128i b, __m128i *re32, __m128i *im32) __attribute__((always_inline)); static inline void cmac(__m128i a,__m128i b, __m128i *re32, __m128i *im32) { @@ -122,8 +126,6 @@ static inline void cmultc(__m128i a,__m128i b, __m128i *re32, __m128i *im32) *re32 = _mm_madd_epi16(a,b); mmtmpb = _mm_sign_epi16(b,*(__m128i*)reflip); - // mmtmpb = _mm_shufflelo_epi16(mmtmpb,_MM_SHUFFLE(2,3,0,1)); - // mmtmpb = _mm_shufflehi_epi16(mmtmpb,_MM_SHUFFLE(2,3,0,1)); mmtmpb = _mm_shuffle_epi8(mmtmpb,_mm_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2)); *im32 = _mm_madd_epi16(a,mmtmpb); @@ -178,34 +180,166 @@ static inline __m128i packed_cmult2(__m128i a,__m128i b,__m128i b2) cre = _mm_madd_epi16(a,b); cim = _mm_madd_epi16(a,b2); - /* - mmtmpb = _mm_sign_epi16(b,*(__m128i*)reflip); - cre = _mm_madd_epi16(a,mmtmpb); - mmtmpb = _mm_shufflelo_epi16(b,_MM_SHUFFLE(2,3,0,1)); - mmtmpb = _mm_shufflehi_epi16(mmtmpb,_MM_SHUFFLE(2,3,0,1)); - cim = _mm_madd_epi16(a,mmtmpb); - */ - /* - __m128i cre,cim; - cmult(a,b,&cre,&cim); - */ return(cpack(cre,cim)); } -/* -static inline __m128i packed_cmultc2(__m128i a,__m128i b,__m128i b2) __attribute__((always_inline)); +#elif defined (__arm__) +static inline void cmac(int16x8_t a,int16x8_t b, int32x4_t *re32, int32x4_t *im32) __attribute__((always_inline)); +static inline void cmac(int16x8_t a,int16x8_t b, int32x4_t *re32, int32x4_t *im32) +{ -static inline __m128i 
packed_cmultc2(__m128i a,__m128i b,__m128i b2) { + + int32x4_t ab_re0,ab_re1,ab_im0,ab_im1; + int16x8_t bflip = vrev32q_s16(b); + int16x8_t bconj = vmulq_s16(b,*(int16x8_t *)reflip); + + ab_re0 = vmull_s16(((int16x4_t*)&a)[0],((int16x4_t*)&bconj)[0]); + ab_re1 = vmull_s16(((int16x4_t*)&a)[1],((int16x4_t*)&bconj)[1]); + ab_im0 = vmull_s16(((int16x4_t*)&a)[0],((int16x4_t*)&bflip)[0]); + ab_im1 = vmull_s16(((int16x4_t*)&a)[1],((int16x4_t*)&bflip)[1]); + *re32 = vqaddq_s32(*re32,vcombine_s32(vpadd_s32(((int32x2_t*)&ab_re0)[0],((int32x2_t*)&ab_re0)[1]), + vpadd_s32(((int32x2_t*)&ab_re1)[0],((int32x2_t*)&ab_re1)[1]))); + *im32 = vqaddq_s32(*im32,vcombine_s32(vpadd_s32(((int32x2_t*)&ab_im0)[0],((int32x2_t*)&ab_im0)[1]), + vpadd_s32(((int32x2_t*)&ab_im1)[0],((int32x2_t*)&ab_im1)[1]))); +} - __m128i cre,cim; +static inline void cmacc(int16x8_t a,int16x8_t b, int32x4_t *re32, int32x4_t *im32) __attribute__((always_inline)); +static inline void cmacc(int16x8_t a,int16x8_t b, int32x4_t *re32, int32x4_t *im32) +{ + int32x4_t ab_re0,ab_re1,ab_im0,ab_im1; + int16x8_t bconj = vmulq_s16(b,*(int16x8_t *)reflip); + int16x8_t bflip = vrev32q_s16(bconj); + + ab_re0 = vmull_s16(((int16x4_t*)&a)[0],((int16x4_t*)&b)[0]); + ab_re1 = vmull_s16(((int16x4_t*)&a)[1],((int16x4_t*)&b)[1]); + ab_im0 = vmull_s16(((int16x4_t*)&a)[0],((int16x4_t*)&bflip)[0]); + ab_im1 = vmull_s16(((int16x4_t*)&a)[1],((int16x4_t*)&bflip)[1]); + *re32 = vqaddq_s32(*re32,vcombine_s32(vpadd_s32(((int32x2_t*)&ab_re0)[0],((int32x2_t*)&ab_re0)[1]), + vpadd_s32(((int32x2_t*)&ab_re1)[0],((int32x2_t*)&ab_re1)[1]))); + *im32 = vqaddq_s32(*im32,vcombine_s32(vpadd_s32(((int32x2_t*)&ab_im0)[0],((int32x2_t*)&ab_im0)[1]), + vpadd_s32(((int32x2_t*)&ab_im1)[0],((int32x2_t*)&ab_im1)[1]))); + +} + +static inline void cmult(int16x8_t a,int16x8_t b, int32x4_t *re32, int32x4_t *im32) __attribute__((always_inline)); +static inline void cmult(int16x8_t a,int16x8_t b, int32x4_t *re32, int32x4_t *im32) +{ + int32x4_t ab_re0,ab_re1,ab_im0,ab_im1; + int16x8_t bflip = vrev32q_s16(b); + int16x8_t bconj = vmulq_s16(b,*(int16x8_t *)reflip); + int16x4_t al,ah,bcl,bch,bfl,bfh; + int32x2_t abr0l,abr0h,abr1l,abr1h,abi0l,abi0h,abi1l,abi1h; + + al = vget_low_s16(a); ah = vget_high_s16(a); + bcl = vget_low_s16(bconj); bch = vget_high_s16(bconj); + bfl = vget_low_s16(bflip); bfh = vget_high_s16(bflip); + + ab_re0 = vmull_s16(al,bcl); + ab_re1 = vmull_s16(ah,bch); + ab_im0 = vmull_s16(al,bfl); + ab_im1 = vmull_s16(ah,bfh); + abr0l = vget_low_s32(ab_re0); abr0h = vget_high_s32(ab_re0); + abr1l = vget_low_s32(ab_re1); abr1h = vget_high_s32(ab_re1); + abi0l = vget_low_s32(ab_im0); abi0h = vget_high_s32(ab_im0); + abi1l = vget_low_s32(ab_im1); abi1h = vget_high_s32(ab_im1); + + *re32 = vcombine_s32(vpadd_s32(abr0l,abr0h), + vpadd_s32(abr1l,abr1h)); + *im32 = vcombine_s32(vpadd_s32(abi0l,abi0h), + vpadd_s32(abi1l,abi1h)); +} + +static inline void cmultc(int16x8_t a,int16x8_t b, int32x4_t *re32, int32x4_t *im32) __attribute__((always_inline)); + +static inline void cmultc(int16x8_t a,int16x8_t b, int32x4_t *re32, int32x4_t *im32) +{ + int32x4_t ab_re0,ab_re1,ab_im0,ab_im1; + int16x8_t bconj = vmulq_s16(b,*(int16x8_t *)reflip); + int16x8_t bflip = vrev32q_s16(bconj); + int16x4_t al,ah,bl,bh,bfl,bfh; + int32x2_t abr0l,abr0h,abr1l,abr1h,abi0l,abi0h,abi1l,abi1h; + al = vget_low_s16(a); ah = vget_high_s16(a); + bl = vget_low_s16(b); bh = vget_high_s16(b); + bfl = vget_low_s16(bflip); bfh = vget_high_s16(bflip); + + ab_re0 = vmull_s16(al,bl); + ab_re1 = vmull_s16(ah,bh); + ab_im0 = 
vmull_s16(al,bfl); + ab_im1 = vmull_s16(ah,bfh); + + abr0l = vget_low_s32(ab_re0); abr0h = vget_high_s32(ab_re0); + abr1l = vget_low_s32(ab_re1); abr1h = vget_high_s32(ab_re1); + abi0l = vget_low_s32(ab_im0); abi0h = vget_high_s32(ab_im0); + abi1l = vget_low_s32(ab_im1); abi1h = vget_high_s32(ab_im1); + + *re32 = vcombine_s32(vpadd_s32(abr0l,abr0h), + vpadd_s32(abr1l,abr1h)); + *im32 = vcombine_s32(vpadd_s32(abi0l,abi0h), + vpadd_s32(abi1l,abi1h)); + +} + + +static inline int16x8_t cpack(int32x4_t xre,int32x4_t xim) __attribute__((always_inline)); + +static inline int16x8_t cpack(int32x4_t xre,int32x4_t xim) +{ + int32x4x2_t xtmp; + + xtmp = vzipq_s32(xre,xim); + return(vcombine_s16(vqshrn_n_s32(xtmp.val[0],15),vqshrn_n_s32(xtmp.val[1],15))); + +} + + +static inline void packed_cmult(int16x8_t a,int16x8_t b, int16x8_t *c) __attribute__((always_inline)); + +static inline void packed_cmult(int16x8_t a,int16x8_t b, int16x8_t *c) +{ + + int32x4_t cre,cim; + cmult(a,b,&cre,&cim); + *c = cpack(cre,cim); + +} + + +static inline void packed_cmultc(int16x8_t a,int16x8_t b, int16x8_t *c) __attribute__((always_inline)); + +static inline void packed_cmultc(int16x8_t a,int16x8_t b, int16x8_t *c) +{ + + int32x4_t cre,cim; cmultc(a,b,&cre,&cim); + *c = cpack(cre,cim); + +} + +static inline int16x8_t packed_cmult2(int16x8_t a,int16x8_t b, int16x8_t b2) __attribute__((always_inline)); + +static inline int16x8_t packed_cmult2(int16x8_t a,int16x8_t b, int16x8_t b2) +{ + + + + int32x4_t ab_re0,ab_re1,ab_im0,ab_im1,cre,cim; + + ab_re0 = vmull_s16(((int16x4_t*)&a)[0],((int16x4_t*)&b)[0]); + ab_re1 = vmull_s16(((int16x4_t*)&a)[1],((int16x4_t*)&b)[1]); + ab_im0 = vmull_s16(((int16x4_t*)&a)[0],((int16x4_t*)&b2)[0]); + ab_im1 = vmull_s16(((int16x4_t*)&a)[1],((int16x4_t*)&b2)[1]); + cre = vcombine_s32(vpadd_s32(((int32x2_t*)&ab_re0)[0],((int32x2_t*)&ab_re0)[1]), + vpadd_s32(((int32x2_t*)&ab_re1)[0],((int32x2_t*)&ab_re1)[1])); + cim = vcombine_s32(vpadd_s32(((int32x2_t*)&ab_im0)[0],((int32x2_t*)&ab_im0)[1]), + vpadd_s32(((int32x2_t*)&ab_im1)[0],((int32x2_t*)&ab_im1)[1])); return(cpack(cre,cim)); } -*/ + +#endif static int16_t W0s[8]__attribute__((aligned(16))) = {32767,0,32767,0,32767,0,32767,0}; @@ -217,6 +351,7 @@ static int16_t W25s[8]__attribute__((aligned(16))) = {-26509,-19260,-26509,-1926 static int16_t W35s[8]__attribute__((aligned(16))) = {-26510,19260,-26510,19260,-26510,19260,-26510,19260}; static int16_t W45s[8]__attribute__((aligned(16))) = {10126,31163,10126,31163,10126,31163,10126,31163}; +#if defined(__x86_64__) || defined(__i386__) __m128i *W0 = (__m128i *)W0s; __m128i *W13 = (__m128i *)W13s; __m128i *W23 = (__m128i *)W23s; @@ -224,7 +359,15 @@ __m128i *W15 = (__m128i *)W15s; __m128i *W25 = (__m128i *)W25s; __m128i *W35 = (__m128i *)W35s; __m128i *W45 = (__m128i *)W45s; - +#elif defined(__arm__) +int16x8_t *W0 = (int16x8_t *)W0s; +int16x8_t *W13 = (int16x8_t *)W13s; +int16x8_t *W23 = (int16x8_t *)W23s; +int16x8_t *W15 = (int16x8_t *)W15s; +int16x8_t *W25 = (int16x8_t *)W25s; +int16x8_t *W35 = (int16x8_t *)W35s; +int16x8_t *W45 = (int16x8_t *)W45s; +#endif static int16_t dft_norm_table[16] = {9459, //12 6689,//24 5461,//36 @@ -244,6 +387,7 @@ static int16_t dft_norm_table[16] = {9459, //12 }; //sqrt(5) //300 +#if defined(__x86_64__) || defined(__i386__) static inline void bfly2(__m128i *x0, __m128i *x1,__m128i *y0, __m128i *y1,__m128i *tw)__attribute__((always_inline)); static inline void bfly2(__m128i *x0, __m128i *x1,__m128i *y0, __m128i *y1,__m128i *tw) @@ -270,6 +414,31 @@ static inline void 
bfly2(__m128i *x0, __m128i *x1,__m128i *y0, __m128i *y1,__m12 *y1 = _mm_packs_epi32(bfly2_tmp1,bfly2_tmp2); } +#elif defined(__arm__) + +static inline void bfly2(int16x8_t *x0, int16x8_t *x1,int16x8_t *y0, int16x8_t *y1,int16x8_t *tw)__attribute__((always_inline)); + +static inline void bfly2(int16x8_t *x0, int16x8_t *x1,int16x8_t *y0, int16x8_t *y1,int16x8_t *tw) +{ + + int32x4_t x0r_2,x0i_2,x1r_2,x1i_2,dy0r,dy1r,dy0i,dy1i; + + cmult(*(x0),*(W0),&x0r_2,&x0i_2); + cmult(*(x1),*(tw),&x1r_2,&x1i_2); + + dy0r = vqaddq_s32(x0r_2,x1r_2); + dy1r = vqsubq_s32(x0r_2,x1r_2); + dy0i = vqaddq_s32(x0i_2,x1i_2); + dy1i = vqsubq_s32(x0i_2,x1i_2); + + *y0 = cpack(dy0r,dy0i); + *y1 = cpack(dy1r,dy1i); +} + + +#endif + +#if defined(__x86_64__) || defined(__i386__) static inline void bfly2_tw1(__m128i *x0, __m128i *x1, __m128i *y0, __m128i *y1)__attribute__((always_inline)); static inline void bfly2_tw1(__m128i *x0, __m128i *x1, __m128i *y0, __m128i *y1) @@ -280,6 +449,20 @@ static inline void bfly2_tw1(__m128i *x0, __m128i *x1, __m128i *y0, __m128i *y1) } +#elif defined(__arm__) + +static inline void bfly2_tw1(int16x8_t *x0, int16x8_t *x1, int16x8_t *y0, int16x8_t *y1)__attribute__((always_inline)); + +static inline void bfly2_tw1(int16x8_t *x0, int16x8_t *x1, int16x8_t *y0, int16x8_t *y1) +{ + + *y0 = vqaddq_s16(*x0,*x1); + *y1 = vqsubq_s16(*x0,*x1); + +} +#endif + +#if defined(__x86_64__) || defined(__i386__) static inline void bfly2_16(__m128i *x0, __m128i *x1, __m128i *y0, __m128i *y1, __m128i *tw, __m128i *twb)__attribute__((always_inline)); static inline void bfly2_16(__m128i *x0, __m128i *x1, __m128i *y0, __m128i *y1, __m128i *tw, __m128i *twb) @@ -295,6 +478,25 @@ static inline void bfly2_16(__m128i *x0, __m128i *x1, __m128i *y0, __m128i *y1, } +#elif defined(__arm__) + +static inline void bfly2_16(int16x8_t *x0, int16x8_t *x1, int16x8_t *y0, int16x8_t *y1, int16x8_t *tw, int16x8_t *twb)__attribute__((always_inline)); + +static inline void bfly2_16(int16x8_t *x0, int16x8_t *x1, int16x8_t *y0, int16x8_t *y1, int16x8_t *tw, int16x8_t *twb) +{ + + register int16x8_t x1t; + + x1t = packed_cmult2(*(x1),*(tw),*(twb)); + + *y0 = vqaddq_s16(*x0,x1t); + *y1 = vqsubq_s16(*x0,x1t); + +} + +#endif + +#if defined(__x86_64__) || defined(__i386__) static inline void ibfly2(__m128i *x0, __m128i *x1,__m128i *y0, __m128i *y1,__m128i *tw)__attribute__((always_inline)); static inline void ibfly2(__m128i *x0, __m128i *x1,__m128i *y0, __m128i *y1,__m128i *tw) @@ -320,10 +522,34 @@ static inline void ibfly2(__m128i *x0, __m128i *x1,__m128i *y0, __m128i *y1,__m1 bfly2_tmp2 = _mm_unpackhi_epi32(dy1r,dy1i); *y1 = _mm_packs_epi32(bfly2_tmp1,bfly2_tmp2); } +#elif defined(__arm__) +static inline void ibfly2(int16x8_t *x0, int16x8_t *x1,int16x8_t *y0, int16x8_t *y1,int16x8_t *tw) +{ + + int32x4_t x0r_2,x0i_2,x1r_2,x1i_2,dy0r,dy1r,dy0i,dy1i; + + cmultc(*(x0),*(W0),&x0r_2,&x0i_2); + cmultc(*(x1),*(tw),&x1r_2,&x1i_2); + + dy0r = vqaddq_s32(x0r_2,x1r_2); + dy1r = vqsubq_s32(x0r_2,x1r_2); + dy0i = vqaddq_s32(x0i_2,x1i_2); + dy1i = vqsubq_s32(x0i_2,x1i_2); + + *y0 = cpack(dy0r,dy0i); + *y1 = cpack(dy1r,dy1i); + +} + +#endif + // This is the radix-3 butterfly (fft) + +#if defined(__x86_64__) || defined(__i386__) + static inline void bfly3(__m128i *x0,__m128i *x1,__m128i *x2, __m128i *y0,__m128i *y1,__m128i *y2, __m128i *tw1,__m128i *tw2) __attribute__((always_inline)); @@ -348,6 +574,35 @@ static inline void bfly3(__m128i *x0,__m128i *x1,__m128i *x2, *(y2) = _mm_adds_epi16(*(x0),*(y2)); } +#elif defined(__arm__) +static inline void 
bfly3(int16x8_t *x0,int16x8_t *x1,int16x8_t *x2, + int16x8_t *y0,int16x8_t *y1,int16x8_t *y2, + int16x8_t *tw1,int16x8_t *tw2) __attribute__((always_inline)); + +static inline void bfly3(int16x8_t *x0,int16x8_t *x1,int16x8_t *x2, + int16x8_t *y0,int16x8_t *y1,int16x8_t *y2, + int16x8_t *tw1,int16x8_t *tw2) +{ + + int32x4_t tmpre,tmpim; + int16x8_t x1_2,x2_2; + + packed_cmult(*(x1),*(tw1),&x1_2); + packed_cmult(*(x2),*(tw2),&x2_2); + *(y0) = vqaddq_s16(*(x0),vqaddq_s16(x1_2,x2_2)); + cmult(x1_2,*(W13),&tmpre,&tmpim); + cmac(x2_2,*(W23),&tmpre,&tmpim); + *(y1) = cpack(tmpre,tmpim); + *(y1) = vqaddq_s16(*(x0),*(y1)); + cmult(x1_2,*(W23),&tmpre,&tmpim); + cmac(x2_2,*(W13),&tmpre,&tmpim); + *(y2) = cpack(tmpre,tmpim); + *(y2) = vqaddq_s16(*(x0),*(y2)); +} + +#endif + +#if defined(__x86_64__) || defined(__i386__) static inline void ibfly3(__m128i *x0,__m128i *x1,__m128i *x2, __m128i *y0,__m128i *y1,__m128i *y2, __m128i *tw1,__m128i *tw2) __attribute__((always_inline)); @@ -372,6 +627,34 @@ static inline void ibfly3(__m128i *x0,__m128i *x1,__m128i *x2, *(y2) = _mm_adds_epi16(*(x0),*(y2)); } +#elif defined(__arm__) +static inline void ibfly3(int16x8_t *x0,int16x8_t *x1,int16x8_t *x2, + int16x8_t *y0,int16x8_t *y1,int16x8_t *y2, + int16x8_t *tw1,int16x8_t *tw2) __attribute__((always_inline)); + +static inline void ibfly3(int16x8_t *x0,int16x8_t *x1,int16x8_t *x2, + int16x8_t *y0,int16x8_t *y1,int16x8_t *y2, + int16x8_t *tw1,int16x8_t *tw2) +{ + + int32x4_t tmpre,tmpim; + int16x8_t x1_2,x2_2; + + packed_cmultc(*(x1),*(tw1),&x1_2); + packed_cmultc(*(x2),*(tw2),&x2_2); + *(y0) = vqaddq_s16(*(x0),vqaddq_s16(x1_2,x2_2)); + cmultc(x1_2,*(W13),&tmpre,&tmpim); + cmacc(x2_2,*(W23),&tmpre,&tmpim); + *(y1) = cpack(tmpre,tmpim); + *(y1) = vqaddq_s16(*(x0),*(y1)); + cmultc(x1_2,*(W23),&tmpre,&tmpim); + cmacc(x2_2,*(W13),&tmpre,&tmpim); + *(y2) = cpack(tmpre,tmpim); + *(y2) = vqaddq_s16(*(x0),*(y2)); +} +#endif + +#if defined(__x86_64__) || defined(__i386__) static inline void bfly3_tw1(__m128i *x0,__m128i *x1,__m128i *x2, __m128i *y0,__m128i *y1,__m128i *y2) __attribute__((always_inline)); @@ -391,8 +674,31 @@ static inline void bfly3_tw1(__m128i *x0,__m128i *x1,__m128i *x2, *(y2) = cpack(tmpre,tmpim); *(y2) = _mm_adds_epi16(*(x0),*(y2)); } +#elif defined(__arm__) +static inline void bfly3_tw1(int16x8_t *x0,int16x8_t *x1,int16x8_t *x2, + int16x8_t *y0,int16x8_t *y1,int16x8_t *y2) __attribute__((always_inline)); + +static inline void bfly3_tw1(int16x8_t *x0,int16x8_t *x1,int16x8_t *x2, + int16x8_t *y0,int16x8_t *y1,int16x8_t *y2) +{ + + int32x4_t tmpre,tmpim; + + *(y0) = vqaddq_s16(*(x0),vqaddq_s16(*(x1),*(x2))); + cmult(*(x1),*(W13),&tmpre,&tmpim); + cmac(*(x2),*(W23),&tmpre,&tmpim); + *(y1) = cpack(tmpre,tmpim); + *(y1) = vqaddq_s16(*(x0),*(y1)); + cmult(*(x1),*(W23),&tmpre,&tmpim); + cmac(*(x2),*(W13),&tmpre,&tmpim); + *(y2) = cpack(tmpre,tmpim); + *(y2) = vqaddq_s16(*(x0),*(y2)); + +} +#endif +#if defined(__x86_64__) || defined(__i386__) static inline void bfly4(__m128i *x0,__m128i *x1,__m128i *x2,__m128i *x3, __m128i *y0,__m128i *y1,__m128i *y2,__m128i *y3, __m128i *tw1,__m128i *tw2,__m128i *tw3)__attribute__((always_inline)); @@ -434,6 +740,51 @@ static inline void bfly4(__m128i *x0,__m128i *x1,__m128i *x2,__m128i *x3, *(y3) = _mm_add_epi16(*(x0),cpack(dy3r,dy3i)); } +#elif defined(__arm__) +static inline void bfly4(int16x8_t *x0,int16x8_t *x1,int16x8_t *x2,int16x8_t *x3, + int16x8_t *y0,int16x8_t *y1,int16x8_t *y2,int16x8_t *y3, + int16x8_t *tw1,int16x8_t *tw2,int16x8_t 
*tw3)__attribute__((always_inline)); + +static inline void bfly4(int16x8_t *x0,int16x8_t *x1,int16x8_t *x2,int16x8_t *x3, + int16x8_t *y0,int16x8_t *y1,int16x8_t *y2,int16x8_t *y3, + int16x8_t *tw1,int16x8_t *tw2,int16x8_t *tw3) +{ + + int32x4_t x1r_2,x1i_2,x2r_2,x2i_2,x3r_2,x3i_2,dy0r,dy0i,dy1r,dy1i,dy2r,dy2i,dy3r,dy3i; + + // cmult(*(x0),*(W0),&x0r_2,&x0i_2); + cmult(*(x1),*(tw1),&x1r_2,&x1i_2); + cmult(*(x2),*(tw2),&x2r_2,&x2i_2); + cmult(*(x3),*(tw3),&x3r_2,&x3i_2); + // dy0r = _mm_add_epi32(x0r_2,_mm_add_epi32(x1r_2,_mm_add_epi32(x2r_2,x3r_2))); + // dy0i = _mm_add_epi32(x0i_2,_mm_add_epi32(x1i_2,_mm_add_epi32(x2i_2,x3i_2))); + // *(y0) = cpack(dy0r,dy0i); + dy0r = vqaddq_s32(x1r_2,vqaddq_s32(x2r_2,x3r_2)); + dy0i = vqaddq_s32(x1i_2,vqaddq_s32(x2i_2,x3i_2)); + *(y0) = vqaddq_s16(*(x0),cpack(dy0r,dy0i)); + // dy1r = _mm_add_epi32(x0r_2,_mm_sub_epi32(x1i_2,_mm_add_epi32(x2r_2,x3i_2))); + // dy1i = _mm_sub_epi32(x0i_2,_mm_add_epi32(x1r_2,_mm_sub_epi32(x2i_2,x3r_2))); + // *(y1) = cpack(dy1r,dy1i); + dy1r = vqsubq_s32(x1i_2,vqaddq_s32(x2r_2,x3i_2)); + dy1i = vqsubq_s32(vqsubq_s32(x3r_2,x2i_2),x1r_2); + *(y1) = vqaddq_s16(*(x0),cpack(dy1r,dy1i)); + // dy2r = _mm_sub_epi32(x0r_2,_mm_sub_epi32(x1r_2,_mm_sub_epi32(x2r_2,x3r_2))); + // dy2i = _mm_sub_epi32(x0i_2,_mm_sub_epi32(x1i_2,_mm_sub_epi32(x2i_2,x3i_2))); + // *(y2) = cpack(dy2r,dy2i); + dy2r = vqsubq_s32(vqsubq_s32(x2r_2,x3r_2),x1r_2); + dy2i = vqsubq_s32(vqsubq_s32(x2i_2,x3i_2),x1i_2); + *(y2) = vqaddq_s16(*(x0),cpack(dy2r,dy2i)); + // dy3r = _mm_sub_epi32(x0r_2,_mm_add_epi32(x1i_2,_mm_sub_epi32(x2r_2,x3i_2))); + // dy3i = _mm_add_epi32(x0i_2,_mm_sub_epi32(x1r_2,_mm_add_epi32(x2i_2,x3r_2))); + // *(y3) = cpack(dy3r,dy3i); + dy3r = vqsubq_s32(vqsubq_s32(x3i_2,x2r_2),x1i_2); + dy3i = vqsubq_s32(x1r_2,vqaddq_s32(x2i_2,x3r_2)); + *(y3) = vqaddq_s16(*(x0),cpack(dy3r,dy3i)); +} + +#endif + +#if defined(__x86_64__) || defined(__i386__) static inline void ibfly4(__m128i *x0,__m128i *x1,__m128i *x2,__m128i *x3, __m128i *y0,__m128i *y1,__m128i *y2,__m128i *y3, __m128i *tw1,__m128i *tw2,__m128i *tw3)__attribute__((always_inline)); @@ -445,24 +796,11 @@ static inline void ibfly4(__m128i *x0,__m128i *x1,__m128i *x2,__m128i *x3, __m128i x1r_2,x1i_2,x2r_2,x2i_2,x3r_2,x3i_2,dy0r,dy0i,dy1r,dy1i,dy2r,dy2i,dy3r,dy3i; - // cmultc(*(x0),*(W0),&x0r_2,&x0i_2); + cmultc(*(x1),*(tw1),&x1r_2,&x1i_2); cmultc(*(x2),*(tw2),&x2r_2,&x2i_2); cmultc(*(x3),*(tw3),&x3r_2,&x3i_2); - /* - dy0r = _mm_add_epi32(x0r_2,_mm_add_epi32(x1r_2,_mm_add_epi32(x2r_2,x3r_2))); - dy0i = _mm_add_epi32(x0i_2,_mm_add_epi32(x1i_2,_mm_add_epi32(x2i_2,x3i_2))); - *(y0) = cpack(dy0r,dy0i); - dy3r = _mm_add_epi32(x0r_2,_mm_sub_epi32(x1i_2,_mm_add_epi32(x2r_2,x3i_2))); - dy3i = _mm_sub_epi32(x0i_2,_mm_add_epi32(x1r_2,_mm_sub_epi32(x2i_2,x3r_2))); - *(y3) = cpack(dy3r,dy3i); - dy2r = _mm_sub_epi32(x0r_2,_mm_sub_epi32(x1r_2,_mm_sub_epi32(x2r_2,x3r_2))); - dy2i = _mm_sub_epi32(x0i_2,_mm_sub_epi32(x1i_2,_mm_sub_epi32(x2i_2,x3i_2))); - *(y2) = cpack(dy2r,dy2i); - dy1r = _mm_sub_epi32(x0r_2,_mm_add_epi32(x1i_2,_mm_sub_epi32(x2r_2,x3i_2))); - dy1i = _mm_add_epi32(x0i_2,_mm_sub_epi32(x1r_2,_mm_add_epi32(x2i_2,x3r_2))); - *(y1) = cpack(dy1r,dy1i); - */ + dy0r = _mm_add_epi32(x1r_2,_mm_add_epi32(x2r_2,x3r_2)); dy0i = _mm_add_epi32(x1i_2,_mm_add_epi32(x2i_2,x3i_2)); *(y0) = _mm_add_epi16(*(x0),cpack(dy0r,dy0i)); @@ -477,6 +815,41 @@ static inline void ibfly4(__m128i *x0,__m128i *x1,__m128i *x2,__m128i *x3, *(y1) = _mm_add_epi16(*(x0),cpack(dy1r,dy1i)); } +#elif defined(__arm__) + +static inline void 
ibfly4(int16x8_t *x0,int16x8_t *x1,int16x8_t *x2,int16x8_t *x3, + int16x8_t *y0,int16x8_t *y1,int16x8_t *y2,int16x8_t *y3, + int16x8_t *tw1,int16x8_t *tw2,int16x8_t *tw3)__attribute__((always_inline)); + +static inline void ibfly4(int16x8_t *x0,int16x8_t *x1,int16x8_t *x2,int16x8_t *x3, + int16x8_t *y0,int16x8_t *y1,int16x8_t *y2,int16x8_t *y3, + int16x8_t *tw1,int16x8_t *tw2,int16x8_t *tw3) +{ + + int32x4_t x1r_2,x1i_2,x2r_2,x2i_2,x3r_2,x3i_2,dy0r,dy0i,dy1r,dy1i,dy2r,dy2i,dy3r,dy3i; + + + cmultc(*(x1),*(tw1),&x1r_2,&x1i_2); + cmultc(*(x2),*(tw2),&x2r_2,&x2i_2); + cmultc(*(x3),*(tw3),&x3r_2,&x3i_2); + + dy0r = vqaddq_s32(x1r_2,vqaddq_s32(x2r_2,x3r_2)); + dy0i = vqaddq_s32(x1i_2,vqaddq_s32(x2i_2,x3i_2)); + *(y0) = vqaddq_s16(*(x0),cpack(dy0r,dy0i)); + dy3r = vqsubq_s32(x1i_2,vqaddq_s32(x2r_2,x3i_2)); + dy3i = vqsubq_s32(vqsubq_s32(x3r_2,x2i_2),x1r_2); + *(y3) = vqaddq_s16(*(x0),cpack(dy3r,dy3i)); + dy2r = vqsubq_s32(vqsubq_s32(x2r_2,x3r_2),x1r_2); + dy2i = vqsubq_s32(vqsubq_s32(x2i_2,x3i_2),x1i_2); + *(y2) = vqaddq_s16(*(x0),cpack(dy2r,dy2i)); + dy1r = vqsubq_s32(vqsubq_s32(x3i_2,x2r_2),x1i_2); + dy1i = vqsubq_s32(x1r_2,vqaddq_s32(x2i_2,x3r_2)); + *(y1) = vqaddq_s16(*(x0),cpack(dy1r,dy1i)); +} + +#endif + +#if defined(__x86_64__) || defined(__i386__) static inline void bfly4_tw1(__m128i *x0,__m128i *x1,__m128i *x2,__m128i *x3, __m128i *y0,__m128i *y1,__m128i *y2,__m128i *y3)__attribute__((always_inline)); @@ -484,24 +857,42 @@ static inline void bfly4_tw1(__m128i *x0,__m128i *x1,__m128i *x2,__m128i *x3, static inline void bfly4_tw1(__m128i *x0,__m128i *x1,__m128i *x2,__m128i *x3, __m128i *y0,__m128i *y1,__m128i *y2,__m128i *y3) { - register __m128i x1_flip,x3_flip; *(y0) = _mm_adds_epi16(*(x0),_mm_adds_epi16(*(x1),_mm_adds_epi16(*(x2),*(x3)))); - x1_flip = _mm_sign_epi16(*(x1),*(__m128i*)conjugatedft); - // x1_flip = _mm_shufflelo_epi16(x1_flip,_MM_SHUFFLE(2,3,0,1)); - // x1_flip = _mm_shufflehi_epi16(x1_flip,_MM_SHUFFLE(2,3,0,1)); x1_flip = _mm_shuffle_epi8(x1_flip,_mm_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2)); x3_flip = _mm_sign_epi16(*(x3),*(__m128i*)conjugatedft); - // x3_flip = _mm_shufflelo_epi16(x3_flip,_MM_SHUFFLE(2,3,0,1)); - // x3_flip = _mm_shufflehi_epi16(x3_flip,_MM_SHUFFLE(2,3,0,1)); x3_flip = _mm_shuffle_epi8(x3_flip,_mm_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2)); *(y1) = _mm_adds_epi16(*(x0),_mm_subs_epi16(x1_flip,_mm_adds_epi16(*(x2),x3_flip))); *(y2) = _mm_subs_epi16(*(x0),_mm_subs_epi16(*(x1),_mm_subs_epi16(*(x2),*(x3)))); *(y3) = _mm_subs_epi16(*(x0),_mm_adds_epi16(x1_flip,_mm_subs_epi16(*(x2),x3_flip))); + } +#elif defined(__arm__) + +static inline void bfly4_tw1(int16x8_t *x0,int16x8_t *x1,int16x8_t *x2,int16x8_t *x3, + int16x8_t *y0,int16x8_t *y1,int16x8_t *y2,int16x8_t *y3)__attribute__((always_inline)); + +static inline void bfly4_tw1(int16x8_t *x0,int16x8_t *x1,int16x8_t *x2,int16x8_t *x3, + int16x8_t *y0,int16x8_t *y1,int16x8_t *y2,int16x8_t *y3) +{ + + register int16x8_t x1_flip,x3_flip; + + *(y0) = vqaddq_s16(*(x0),vqaddq_s16(*(x1),vqaddq_s16(*(x2),*(x3)))); + x1_flip = vrev32q_s16(vmulq_s16(*(x1),*(int16x8_t*)conjugatedft)); + x3_flip = vrev32q_s16(vmulq_s16(*(x3),*(int16x8_t*)conjugatedft)); + *(y1) = vqaddq_s16(*(x0),vqsubq_s16(x1_flip,vqaddq_s16(*(x2),x3_flip))); + *(y2) = vqsubq_s16(*(x0),vqsubq_s16(*(x1),vqsubq_s16(*(x2),*(x3)))); + *(y3) = vqsubq_s16(*(x0),vqaddq_s16(x1_flip,vqsubq_s16(*(x2),x3_flip))); +} + +#endif + +#if defined(__x86_64__) || defined(__i386__) + static inline void ibfly4_tw1(__m128i *x0,__m128i *x1,__m128i *x2,__m128i *x3, 
__m128i *y0,__m128i *y1,__m128i *y2,__m128i *y3)__attribute__((always_inline)); @@ -526,6 +917,28 @@ static inline void ibfly4_tw1(__m128i *x0,__m128i *x1,__m128i *x2,__m128i *x3, *(y3) = _mm_adds_epi16(*(x0),_mm_subs_epi16(x1_flip,_mm_adds_epi16(*(x2),x3_flip))); } + +#elif defined(__arm__) +static inline void ibfly4_tw1(int16x8_t *x0,int16x8_t *x1,int16x8_t *x2,int16x8_t *x3, + int16x8_t *y0,int16x8_t *y1,int16x8_t *y2,int16x8_t *y3)__attribute__((always_inline)); + +static inline void ibfly4_tw1(int16x8_t *x0,int16x8_t *x1,int16x8_t *x2,int16x8_t *x3, + int16x8_t *y0,int16x8_t *y1,int16x8_t *y2,int16x8_t *y3) +{ + + register int16x8_t x1_flip,x3_flip; + + *(y0) = vqaddq_s16(*(x0),vqaddq_s16(*(x1),vqaddq_s16(*(x2),*(x3)))); + x1_flip = vrev32q_s16(vmulq_s16(*(x1),*(int16x8_t*)conjugatedft)); + x3_flip = vrev32q_s16(vmulq_s16(*(x3),*(int16x8_t*)conjugatedft)); + *(y1) = vqsubq_s16(*(x0),vqaddq_s16(x1_flip,vqsubq_s16(*(x2),x3_flip))); + *(y2) = vqsubq_s16(*(x0),vqsubq_s16(*(x1),vqsubq_s16(*(x2),*(x3)))); + *(y3) = vqaddq_s16(*(x0),vqsubq_s16(x1_flip,vqaddq_s16(*(x2),x3_flip))); +} + +#endif + +#if defined(__x86_64__) || defined(__i386__) static inline void bfly4_16(__m128i *x0,__m128i *x1,__m128i *x2,__m128i *x3, __m128i *y0,__m128i *y1,__m128i *y2,__m128i *y3, __m128i *tw1,__m128i *tw2,__m128i *tw3, @@ -574,6 +987,42 @@ static inline void bfly4_16(__m128i *x0,__m128i *x1,__m128i *x2,__m128i *x3, } +#elif defined(__arm__) + +static inline void bfly4_16(int16x8_t *x0,int16x8_t *x1,int16x8_t *x2,int16x8_t *x3, + int16x8_t *y0,int16x8_t *y1,int16x8_t *y2,int16x8_t *y3, + int16x8_t *tw1,int16x8_t *tw2,int16x8_t *tw3, + int16x8_t *tw1b,int16x8_t *tw2b,int16x8_t *tw3b)__attribute__((always_inline)); + +static inline void bfly4_16(int16x8_t *x0,int16x8_t *x1,int16x8_t *x2,int16x8_t *x3, + int16x8_t *y0,int16x8_t *y1,int16x8_t *y2,int16x8_t *y3, + int16x8_t *tw1,int16x8_t *tw2,int16x8_t *tw3, + int16x8_t *tw1b,int16x8_t *tw2b,int16x8_t *tw3b) +{ + + register int16x8_t x1t,x2t,x3t,x02t,x13t; + register int16x8_t x1_flip,x3_flip; + + x1t = packed_cmult2(*(x1),*(tw1),*(tw1b)); + x2t = packed_cmult2(*(x2),*(tw2),*(tw2b)); + x3t = packed_cmult2(*(x3),*(tw3),*(tw3b)); + + + + x02t = vqaddq_s16(*(x0),x2t); + x13t = vqaddq_s16(x1t,x3t); + *(y0) = vqaddq_s16(x02t,x13t); + *(y2) = vqsubq_s16(x02t,x13t); + x1_flip = vrev32q_s16(vmulq_s16(x1t,*(int16x8_t*)conjugatedft)); + x3_flip = vrev32q_s16(vmulq_s16(x3t,*(int16x8_t*)conjugatedft)); + x02t = vqsubq_s16(*(x0),x2t); + x13t = vqsubq_s16(x1_flip,x3_flip); + *(y1) = vqaddq_s16(x02t,x13t); // x0 + x1f - x2 - x3f + *(y3) = vqsubq_s16(x02t,x13t); // x0 - x1f - x2 + x3f +} +#endif + +#if defined(__x86_64__) || defined(__i386__) static inline void ibfly4_16(__m128i *x0,__m128i *x1,__m128i *x2,__m128i *x3, __m128i *y0,__m128i *y1,__m128i *y2,__m128i *y3, __m128i *tw1,__m128i *tw2,__m128i *tw3, @@ -622,6 +1071,40 @@ static inline void ibfly4_16(__m128i *x0,__m128i *x1,__m128i *x2,__m128i *x3, } +#elif defined(__arm__) +static inline void ibfly4_16(int16x8_t *x0,int16x8_t *x1,int16x8_t *x2,int16x8_t *x3, + int16x8_t *y0,int16x8_t *y1,int16x8_t *y2,int16x8_t *y3, + int16x8_t *tw1,int16x8_t *tw2,int16x8_t *tw3, + int16x8_t *tw1b,int16x8_t *tw2b,int16x8_t *tw3b)__attribute__((always_inline)); + +static inline void ibfly4_16(int16x8_t *x0,int16x8_t *x1,int16x8_t *x2,int16x8_t *x3, + int16x8_t *y0,int16x8_t *y1,int16x8_t *y2,int16x8_t *y3, + int16x8_t *tw1,int16x8_t *tw2,int16x8_t *tw3, + int16x8_t *tw1b,int16x8_t *tw2b,int16x8_t *tw3b) +{ + + register int16x8_t 
x1t,x2t,x3t,x02t,x13t; + register int16x8_t x1_flip,x3_flip; + + x1t = packed_cmult2(*(x1),*(tw1),*(tw1b)); + x2t = packed_cmult2(*(x2),*(tw2),*(tw2b)); + x3t = packed_cmult2(*(x3),*(tw3),*(tw3b)); + + x02t = vqaddq_s16(*(x0),x2t); + x13t = vqaddq_s16(x1t,x3t); + *(y0) = vqaddq_s16(x02t,x13t); + *(y2) = vqsubq_s16(x02t,x13t); + x1_flip = vrev32q_s16(vmulq_s16(x1t,*(int16x8_t*)conjugatedft)); + x3_flip = vrev32q_s16(vmulq_s16(x3t,*(int16x8_t*)conjugatedft)); + x02t = vqsubq_s16(*(x0),x2t); + x13t = vqsubq_s16(x1_flip,x3_flip); + *(y3) = vqaddq_s16(x02t,x13t); // x0 - x1f - x2 + x3f + *(y1) = vqsubq_s16(x02t,x13t); // x0 + x1f - x2 - x3f +} + +#endif + +#if defined(__x86_64__) || defined(__i386__) static inline void bfly5(__m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3,__m128i *x4, __m128i *y0, __m128i *y1, __m128i *y2, __m128i *y3,__m128i *y4, __m128i *tw1,__m128i *tw2,__m128i *tw3,__m128i *tw4)__attribute__((always_inline)); @@ -670,10 +1153,64 @@ static inline void bfly5(__m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3,__m1 *(y4) = _mm_adds_epi16(*(x0),*(y4)); +} + +#elif defined(__arm__) +static inline void bfly5(int16x8_t *x0, int16x8_t *x1, int16x8_t *x2, int16x8_t *x3,int16x8_t *x4, + int16x8_t *y0, int16x8_t *y1, int16x8_t *y2, int16x8_t *y3,int16x8_t *y4, + int16x8_t *tw1,int16x8_t *tw2,int16x8_t *tw3,int16x8_t *tw4)__attribute__((always_inline)); + +static inline void bfly5(int16x8_t *x0, int16x8_t *x1, int16x8_t *x2, int16x8_t *x3,int16x8_t *x4, + int16x8_t *y0, int16x8_t *y1, int16x8_t *y2, int16x8_t *y3,int16x8_t *y4, + int16x8_t *tw1,int16x8_t *tw2,int16x8_t *tw3,int16x8_t *tw4) +{ + + + + int16x8_t x1_2,x2_2,x3_2,x4_2; + int32x4_t tmpre,tmpim; + + packed_cmult(*(x1),*(tw1),&x1_2); + packed_cmult(*(x2),*(tw2),&x2_2); + packed_cmult(*(x3),*(tw3),&x3_2); + packed_cmult(*(x4),*(tw4),&x4_2); + + *(y0) = vqaddq_s16(*(x0),vqaddq_s16(x1_2,vqaddq_s16(x2_2,vqaddq_s16(x3_2,x4_2)))); + cmult(x1_2,*(W15),&tmpre,&tmpim); + cmac(x2_2,*(W25),&tmpre,&tmpim); + cmac(x3_2,*(W35),&tmpre,&tmpim); + cmac(x4_2,*(W45),&tmpre,&tmpim); + *(y1) = cpack(tmpre,tmpim); + *(y1) = vqaddq_s16(*(x0),*(y1)); + + cmult(x1_2,*(W25),&tmpre,&tmpim); + cmac(x2_2,*(W45),&tmpre,&tmpim); + cmac(x3_2,*(W15),&tmpre,&tmpim); + cmac(x4_2,*(W35),&tmpre,&tmpim); + *(y2) = cpack(tmpre,tmpim); + *(y2) = vqaddq_s16(*(x0),*(y2)); + + cmult(x1_2,*(W35),&tmpre,&tmpim); + cmac(x2_2,*(W15),&tmpre,&tmpim); + cmac(x3_2,*(W45),&tmpre,&tmpim); + cmac(x4_2,*(W25),&tmpre,&tmpim); + *(y3) = cpack(tmpre,tmpim); + *(y3) = vqaddq_s16(*(x0),*(y3)); + + cmult(x1_2,*(W45),&tmpre,&tmpim); + cmac(x2_2,*(W35),&tmpre,&tmpim); + cmac(x3_2,*(W25),&tmpre,&tmpim); + cmac(x4_2,*(W15),&tmpre,&tmpim); + *(y4) = cpack(tmpre,tmpim); + *(y4) = vqaddq_s16(*(x0),*(y4)); + + } +#endif +#if defined(__x86_64__) || defined(__i386__) static inline void bfly5_tw1(__m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3,__m128i *x4, __m128i *y0, __m128i *y1, __m128i *y2, __m128i *y3,__m128i *y4) __attribute__((always_inline)); @@ -710,9 +1247,48 @@ static inline void bfly5_tw1(__m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, *(y4) = _mm_adds_epi16(*(x0),*(y4)); } +#elif defined(__arm__) +static inline void bfly5_tw1(int16x8_t *x0, int16x8_t *x1, int16x8_t *x2, int16x8_t *x3,int16x8_t *x4, + int16x8_t *y0, int16x8_t *y1, int16x8_t *y2, int16x8_t *y3,int16x8_t *y4) __attribute__((always_inline)); + +static inline void bfly5_tw1(int16x8_t *x0, int16x8_t *x1, int16x8_t *x2, int16x8_t *x3,int16x8_t *x4, + int16x8_t *y0, int16x8_t *y1, int16x8_t *y2, int16x8_t 
*y3,int16x8_t *y4) +{ + + int32x4_t tmpre,tmpim; + + *(y0) = vqaddq_s16(*(x0),vqaddq_s16(*(x1),vqaddq_s16(*(x2),vqaddq_s16(*(x3),*(x4))))); + cmult(*(x1),*(W15),&tmpre,&tmpim); + cmac(*(x2),*(W25),&tmpre,&tmpim); + cmac(*(x3),*(W35),&tmpre,&tmpim); + cmac(*(x4),*(W45),&tmpre,&tmpim); + *(y1) = cpack(tmpre,tmpim); + *(y1) = vqaddq_s16(*(x0),*(y1)); + cmult(*(x1),*(W25),&tmpre,&tmpim); + cmac(*(x2),*(W45),&tmpre,&tmpim); + cmac(*(x3),*(W15),&tmpre,&tmpim); + cmac(*(x4),*(W35),&tmpre,&tmpim); + *(y2) = cpack(tmpre,tmpim); + *(y2) = vqaddq_s16(*(x0),*(y2)); + cmult(*(x1),*(W35),&tmpre,&tmpim); + cmac(*(x2),*(W15),&tmpre,&tmpim); + cmac(*(x3),*(W45),&tmpre,&tmpim); + cmac(*(x4),*(W25),&tmpre,&tmpim); + *(y3) = cpack(tmpre,tmpim); + *(y3) = vqaddq_s16(*(x0),*(y3)); + cmult(*(x1),*(W45),&tmpre,&tmpim); + cmac(*(x2),*(W35),&tmpre,&tmpim); + cmac(*(x3),*(W25),&tmpre,&tmpim); + cmac(*(x4),*(W15),&tmpre,&tmpim); + *(y4) = cpack(tmpre,tmpim); + *(y4) = vqaddq_s16(*(x0),*(y4)); +} + +#endif // performs 4x4 transpose of input x (complex interleaved) using 128bit SIMD intrinsics // i.e. x = [x0r x0i x1r x1i ... x15r x15i], y = [x0r x0i x4r x4i x8r x8i x12r x12i x1r x1i x5r x5i x9r x9i x13r x13i x2r x2i ... x15r x15i] +#if defined(__x86_64__) || defined(__i386__) static inline void transpose16(__m128i *x,__m128i *y) __attribute__((always_inline)); static inline void transpose16(__m128i *x,__m128i *y) { @@ -728,7 +1304,24 @@ static inline void transpose16(__m128i *x,__m128i *y) y[3] = _mm_unpackhi_epi64(ytmp1,ytmp3); } +#elif defined(__arm__) +static inline void transpose16(int16x8_t *x,int16x8_t *y) __attribute__((always_inline)); +static inline void transpose16(int16x8_t *x,int16x8_t *y) +{ + register uint32x4x2_t ytmp0,ytmp1; + + ytmp0 = vtrnq_u32((uint32x4_t)(x[0]),(uint32x4_t)(x[1])); + ytmp1 = vtrnq_u32((uint32x4_t)(x[2]),(uint32x4_t)(x[3])); + + y[0] = vcombine_s16(vget_low_s16((int16x8_t)ytmp0.val[0]),vget_low_s16((int16x8_t)ytmp1.val[0])); + y[1] = vcombine_s16(vget_high_s16((int16x8_t)ytmp0.val[0]),vget_high_s16((int16x8_t)ytmp1.val[0])); + y[2] = vcombine_s16(vget_low_s16((int16x8_t)ytmp0.val[1]),vget_low_s16((int16x8_t)ytmp1.val[1])); + y[3] = vcombine_s16(vget_high_s16((int16x8_t)ytmp0.val[1]),vget_high_s16((int16x8_t)ytmp1.val[1])); +} + +# endif // same as above but output is offset by off +#if defined(__x86_64__) || defined(__i386__) static inline void transpose16_ooff(__m128i *x,__m128i *y,int off) __attribute__((always_inline)); static inline void transpose16_ooff(__m128i *x,__m128i *y,int off) @@ -749,12 +1342,47 @@ static inline void transpose16_ooff(__m128i *x,__m128i *y,int off) *y2 = _mm_unpackhi_epi64(ytmp1,ytmp3); } +#elif defined(__arm__) +static inline void transpose16_ooff(int16x8_t *x,int16x8_t *y,int off) __attribute__((always_inline)); + +static inline void transpose16_ooff(int16x8_t *x,int16x8_t *y,int off) +{ + int16x8_t *y2=y; + register uint32x4x2_t ytmp0,ytmp1; + + ytmp0 = vtrnq_u32((uint32x4_t)(x[0]),(uint32x4_t)(x[1])); + ytmp1 = vtrnq_u32((uint32x4_t)(x[2]),(uint32x4_t)(x[3])); + + *y2 = (int16x8_t)vcombine_s16(vget_low_s16((int16x8_t)ytmp0.val[0]),vget_low_s16((int16x8_t)ytmp1.val[0])); y2+=off; + *y2 = (int16x8_t)vcombine_s16(vget_low_s16((int16x8_t)ytmp0.val[1]),vget_low_s16((int16x8_t)ytmp1.val[1])); y2+=off; + *y2 = (int16x8_t)vcombine_s16(vget_high_s16((int16x8_t)ytmp0.val[0]),vget_high_s16((int16x8_t)ytmp1.val[0])); y2+=off; + *y2 = (int16x8_t)vcombine_s16(vget_high_s16((int16x8_t)ytmp0.val[1]),vget_high_s16((int16x8_t)ytmp1.val[1])); + + +} + +#endif + +#if 
defined(__x86_64__) || defined(__i386__) + static inline void transpose4_ooff(__m64 *x,__m64 *y,int off)__attribute__((always_inline)); static inline void transpose4_ooff(__m64 *x,__m64 *y,int off) { y[0] = _mm_unpacklo_pi32(x[0],x[1]); y[off] = _mm_unpackhi_pi32(x[0],x[1]); } +#elif (__arm__) + +static inline void transpose4_ooff(int16x4_t *x,int16x4_t *y,int off)__attribute__((always_inline)); +static inline void transpose4_ooff(int16x4_t *x,int16x4_t *y,int off) +{ + uint32x2x2_t ytmp = vtrn_u32((uint32x2_t)x[0],(uint32x2_t)x[1]); + + y[0] = (int16x4_t)ytmp.val[0]; + y[off] = (int16x4_t)ytmp.val[1]; +} + +#endif // 16-point optimized DFT kernel @@ -778,14 +1406,19 @@ int16_t tw16c[24] __attribute__((aligned(16))) = { 0,32767,12540,30272,23170,231 0,32767,30273,12539,23170,-23170,-12539,-30273 }; + + static inline void dft16(int16_t *x,int16_t *y) __attribute__((always_inline)); static inline void dft16(int16_t *x,int16_t *y) { +#if defined(__x86_64__) || defined(__i386__) + __m128i *tw16a_128=(__m128i *)tw16a,*tw16b_128=(__m128i *)tw16b,*x128=(__m128i *)x,*y128=(__m128i *)y; - /* + /* This is the original version before unrolling + bfly4_tw1(x128,x128+1,x128+2,x128+3, y128,y128+1,y128+2,y128+3); @@ -805,27 +1438,14 @@ static inline void dft16(int16_t *x,int16_t *y) x13t = _mm_adds_epi16(x128[1],x128[3]); xtmp0 = _mm_adds_epi16(x02t,x13t); xtmp2 = _mm_subs_epi16(x02t,x13t); - - /* - xtmp0 = _mm_adds_epi16(x128[0],_mm_adds_epi16(x128[1],_mm_adds_epi16(x128[2],x128[3]))); - xtmp2 = _mm_subs_epi16(x128[0],_mm_subs_epi16(x128[1],_mm_subs_epi16(x128[2],x128[3]))); - */ x1_flip = _mm_sign_epi16(x128[1],*(__m128i*)conjugatedft); - // x1_flip = _mm_shufflelo_epi16(x1_flip,_MM_SHUFFLE(2,3,0,1)); - // x1_flip = _mm_shufflehi_epi16(x1_flip,_MM_SHUFFLE(2,3,0,1)); x1_flip = _mm_shuffle_epi8(x1_flip,_mm_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2)); x3_flip = _mm_sign_epi16(x128[3],*(__m128i*)conjugatedft); - // x3_flip = _mm_shufflelo_epi16(x3_flip,_MM_SHUFFLE(2,3,0,1)); - // x3_flip = _mm_shufflehi_epi16(x3_flip,_MM_SHUFFLE(2,3,0,1)); x3_flip = _mm_shuffle_epi8(x3_flip,_mm_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2)); x02t = _mm_subs_epi16(x128[0],x128[2]); x13t = _mm_subs_epi16(x1_flip,x3_flip); xtmp1 = _mm_adds_epi16(x02t,x13t); // x0 + x1f - x2 - x3f xtmp3 = _mm_subs_epi16(x02t,x13t); // x0 - x1f - x2 + x3f - /* - xtmp1 = _mm_adds_epi16(x128[0],_mm_subs_epi16(x1_flip,_mm_adds_epi16(x128[2],x3_flip))); - xtmp3 = _mm_subs_epi16(x128[0],_mm_adds_epi16(x1_flip,_mm_subs_epi16(x128[2],x3_flip))); - */ ytmp0 = _mm_unpacklo_epi32(xtmp0,xtmp1); ytmp1 = _mm_unpackhi_epi32(xtmp0,xtmp1); @@ -845,28 +1465,84 @@ static inline void dft16(int16_t *x,int16_t *y) x13t = _mm_adds_epi16(xtmp1,xtmp3); y128[0] = _mm_adds_epi16(x02t,x13t); y128[2] = _mm_subs_epi16(x02t,x13t); - - /* - y128[0] = _mm_adds_epi16(xtmp0,_mm_adds_epi16(xtmp1,_mm_adds_epi16(xtmp2,xtmp3))); - y128[2] = _mm_subs_epi16(xtmp0,_mm_subs_epi16(xtmp1,_mm_subs_epi16(xtmp2,xtmp3))); - */ - x1_flip = _mm_sign_epi16(xtmp1,*(__m128i*)conjugatedft); - // x1_flip = _mm_shufflelo_epi16(x1_flip,_MM_SHUFFLE(2,3,0,1)); - // x1_flip = _mm_shufflehi_epi16(x1_flip,_MM_SHUFFLE(2,3,0,1)); x1_flip = _mm_shuffle_epi8(x1_flip,_mm_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2)); x3_flip = _mm_sign_epi16(xtmp3,*(__m128i*)conjugatedft); - // x3_flip = _mm_shufflelo_epi16(x3_flip,_MM_SHUFFLE(2,3,0,1)); - // x3_flip = _mm_shufflehi_epi16(x3_flip,_MM_SHUFFLE(2,3,0,1)); x3_flip = _mm_shuffle_epi8(x3_flip,_mm_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2)); x02t = 
_mm_subs_epi16(xtmp0,xtmp2); x13t = _mm_subs_epi16(x1_flip,x3_flip); y128[1] = _mm_adds_epi16(x02t,x13t); // x0 + x1f - x2 - x3f y128[3] = _mm_subs_epi16(x02t,x13t); // x0 - x1f - x2 + x3f - /* - y128[1] = _mm_adds_epi16(xtmp0,_mm_subs_epi16(x1_flip,_mm_adds_epi16(xtmp2,x3_flip))); - y128[3] = _mm_subs_epi16(xtmp0,_mm_adds_epi16(x1_flip,_mm_subs_epi16(xtmp2,x3_flip))); + + +#elif defined(__arm__) + + int16x8_t *tw16a_128=(int16x8_t *)tw16a,*tw16b_128=(int16x8_t *)tw16b,*x128=(int16x8_t *)x,*y128=(int16x8_t *)y; + + /* This is the original version before unrolling + + bfly4_tw1(x128,x128+1,x128+2,x128+3, + y128,y128+1,y128+2,y128+3); + + transpose16(y128,ytmp); + + bfly4_16(ytmp,ytmp+1,ytmp+2,ytmp+3, + y128,y128+1,y128+2,y128+3, + tw16_128,tw16_128+1,tw16_128+2); */ + + register int16x8_t x1_flip,x3_flip,x02t,x13t; + register int16x8_t xtmp0,xtmp1,xtmp2,xtmp3; + register uint32x4x2_t ytmp0,ytmp1; + register int16x8_t ytmp0b,ytmp1b,ytmp2b,ytmp3b; + + // First stage : 4 Radix-4 butterflies without input twiddles + + x02t = vqaddq_s16(x128[0],x128[2]); + x13t = vqaddq_s16(x128[1],x128[3]); + xtmp0 = vqaddq_s16(x02t,x13t); + xtmp2 = vqsubq_s16(x02t,x13t); + x1_flip = vrev32q_s16(vmulq_s16(x128[1],*(int16x8_t*)conjugatedft)); + x3_flip = vrev32q_s16(vmulq_s16(x128[3],*(int16x8_t*)conjugatedft)); + x02t = vqsubq_s16(x128[0],x128[2]); + x13t = vqsubq_s16(x1_flip,x3_flip); + xtmp1 = vqaddq_s16(x02t,x13t); // x0 + x1f - x2 - x3f + xtmp3 = vqsubq_s16(x02t,x13t); // x0 - x1f - x2 + x3f + + ytmp0 = vtrnq_u32((uint32x4_t)(xtmp0),(uint32x4_t)(xtmp1)); +// y0[0] = [x00 x10 x02 x12], y0[1] = [x01 x11 x03 x13] + ytmp1 = vtrnq_u32((uint32x4_t)(xtmp2),(uint32x4_t)(xtmp3)); +// y1[0] = [x20 x30 x22 x32], y1[1] = [x21 x31 x23 x33] + + + ytmp0b = vcombine_s16(vget_low_s16((int16x8_t)ytmp0.val[0]),vget_low_s16((int16x8_t)ytmp1.val[0])); +// y0 = [x00 x10 x20 x30] + ytmp1b = vcombine_s16(vget_low_s16((int16x8_t)ytmp0.val[1]),vget_low_s16((int16x8_t)ytmp1.val[1])); +// t1 = [x01 x11 x21 x31] + ytmp2b = vcombine_s16(vget_high_s16((int16x8_t)ytmp0.val[0]),vget_high_s16((int16x8_t)ytmp1.val[0])); +// t2 = [x02 x12 x22 x32] + ytmp3b = vcombine_s16(vget_high_s16((int16x8_t)ytmp0.val[1]),vget_high_s16((int16x8_t)ytmp1.val[1])); +// t3 = [x03 x13 x23 x33] + + + // Second stage : 4 Radix-4 butterflies with input twiddles + xtmp1 = packed_cmult2(ytmp1b,tw16a_128[0],tw16b_128[0]); + xtmp2 = packed_cmult2(ytmp2b,tw16a_128[1],tw16b_128[1]); + xtmp3 = packed_cmult2(ytmp3b,tw16a_128[2],tw16b_128[2]); + + x02t = vqaddq_s16(ytmp0b,xtmp2); + x13t = vqaddq_s16(xtmp1,xtmp3); + y128[0] = vqaddq_s16(x02t,x13t); + y128[2] = vqsubq_s16(x02t,x13t); + x1_flip = vrev32q_s16(vmulq_s16(xtmp1,*(int16x8_t*)conjugatedft)); + x3_flip = vrev32q_s16(vmulq_s16(xtmp3,*(int16x8_t*)conjugatedft)); + x02t = vqsubq_s16(ytmp0b,xtmp2); + x13t = vqsubq_s16(x1_flip,x3_flip); + y128[1] = vqaddq_s16(x02t,x13t); // x0 + x1f - x2 - x3f + y128[3] = vqsubq_s16(x02t,x13t); // x0 - x1f - x2 + x3f + + +#endif } static inline void idft16(int16_t *x,int16_t *y) __attribute__((always_inline)); @@ -874,6 +1550,7 @@ static inline void idft16(int16_t *x,int16_t *y) __attribute__((always_inline)); static inline void idft16(int16_t *x,int16_t *y) { +#if defined(__x86_64__) || defined(__i386__) __m128i *tw16a_128=(__m128i *)tw16,*tw16b_128=(__m128i *)tw16c,*x128=(__m128i *)x,*y128=(__m128i *)y; /* @@ -896,27 +1573,14 @@ static inline void idft16(int16_t *x,int16_t *y) x13t = _mm_adds_epi16(x128[1],x128[3]); xtmp0 = _mm_adds_epi16(x02t,x13t); xtmp2 = _mm_subs_epi16(x02t,x13t); 
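// Hedged note: dft16/idft16 unroll two stages of the radix-4 DIT identity.
// With S02 = x0+x2, S13 = x1+x3, D02 = x0-x2, D13 = x1-x3:
//   y0 = S02 + S13,   y2 = S02 - S13,
//   y1 = D02 - j*D13, y3 = D02 + j*D13   (y1/y3 swap for the inverse).
// Multiplying by -j is the (re,im)->(im,-re) swap that the sign/shuffle
// (SSE) or vmulq_s16/vrev32q_s16 (NEON) pair computes. A scalar reference
// sketch (function name and array layout are illustrative only):
#include <stdint.h>
static inline void radix4_fwd(const int32_t xr[4], const int32_t xi[4],
                              int32_t yr[4], int32_t yi[4])
{
  int32_t sr = xr[0]+xr[2], si = xi[0]+xi[2];  // S02
  int32_t tr = xr[1]+xr[3], ti = xi[1]+xi[3];  // S13
  int32_t dr = xr[0]-xr[2], di = xi[0]-xi[2];  // D02
  int32_t er = xr[1]-xr[3], ei = xi[1]-xi[3];  // D13
  yr[0] = sr+tr; yi[0] = si+ti;
  yr[2] = sr-tr; yi[2] = si-ti;
  yr[1] = dr+ei; yi[1] = di-er;                // D02 - j*D13
  yr[3] = dr-ei; yi[3] = di+er;                // D02 + j*D13
}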
- - /* - xtmp0 = _mm_adds_epi16(x128[0],_mm_adds_epi16(x128[1],_mm_adds_epi16(x128[2],x128[3]))); - xtmp2 = _mm_subs_epi16(x128[0],_mm_subs_epi16(x128[1],_mm_subs_epi16(x128[2],x128[3]))); - */ x1_flip = _mm_sign_epi16(x128[1],*(__m128i*)conjugatedft); - // x1_flip = _mm_shufflelo_epi16(x1_flip,_MM_SHUFFLE(2,3,0,1)); - // x1_flip = _mm_shufflehi_epi16(x1_flip,_MM_SHUFFLE(2,3,0,1)); x1_flip = _mm_shuffle_epi8(x1_flip,_mm_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2)); x3_flip = _mm_sign_epi16(x128[3],*(__m128i*)conjugatedft); - // x3_flip = _mm_shufflelo_epi16(x3_flip,_MM_SHUFFLE(2,3,0,1)); - // x3_flip = _mm_shufflehi_epi16(x3_flip,_MM_SHUFFLE(2,3,0,1)); x3_flip = _mm_shuffle_epi8(x3_flip,_mm_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2)); x02t = _mm_subs_epi16(x128[0],x128[2]); x13t = _mm_subs_epi16(x1_flip,x3_flip); xtmp3 = _mm_adds_epi16(x02t,x13t); // x0 + x1f - x2 - x3f xtmp1 = _mm_subs_epi16(x02t,x13t); // x0 - x1f - x2 + x3f - /* - xtmp1 = _mm_adds_epi16(x128[0],_mm_subs_epi16(x1_flip,_mm_adds_epi16(x128[2],x3_flip))); - xtmp3 = _mm_subs_epi16(x128[0],_mm_adds_epi16(x1_flip,_mm_subs_epi16(x128[2],x3_flip))); - */ ytmp0 = _mm_unpacklo_epi32(xtmp0,xtmp1); ytmp1 = _mm_unpackhi_epi32(xtmp0,xtmp1); @@ -936,53 +1600,84 @@ static inline void idft16(int16_t *x,int16_t *y) x13t = _mm_adds_epi16(xtmp1,xtmp3); y128[0] = _mm_adds_epi16(x02t,x13t); y128[2] = _mm_subs_epi16(x02t,x13t); - - /* - y128[0] = _mm_adds_epi16(xtmp0,_mm_adds_epi16(xtmp1,_mm_adds_epi16(xtmp2,xtmp3))); - y128[2] = _mm_subs_epi16(xtmp0,_mm_subs_epi16(xtmp1,_mm_subs_epi16(xtmp2,xtmp3))); - */ - x1_flip = _mm_sign_epi16(xtmp1,*(__m128i*)conjugatedft); - // x1_flip = _mm_shufflelo_epi16(x1_flip,_MM_SHUFFLE(2,3,0,1)); - // x1_flip = _mm_shufflehi_epi16(x1_flip,_MM_SHUFFLE(2,3,0,1)); x1_flip = _mm_shuffle_epi8(x1_flip,_mm_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2)); x3_flip = _mm_sign_epi16(xtmp3,*(__m128i*)conjugatedft); - // x3_flip = _mm_shufflelo_epi16(x3_flip,_MM_SHUFFLE(2,3,0,1)); - // x3_flip = _mm_shufflehi_epi16(x3_flip,_MM_SHUFFLE(2,3,0,1)); x3_flip = _mm_shuffle_epi8(x3_flip,_mm_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2)); x02t = _mm_subs_epi16(xtmp0,xtmp2); x13t = _mm_subs_epi16(x1_flip,x3_flip); y128[3] = _mm_adds_epi16(x02t,x13t); // x0 + x1f - x2 - x3f y128[1] = _mm_subs_epi16(x02t,x13t); // x0 - x1f - x2 + x3f - /* - y128[1] = _mm_adds_epi16(xtmp0,_mm_subs_epi16(x1_flip,_mm_adds_epi16(xtmp2,x3_flip))); - y128[3] = _mm_subs_epi16(xtmp0,_mm_adds_epi16(x1_flip,_mm_subs_epi16(xtmp2,x3_flip))); - */ -} - -/* -static inline void idft16(int16_t *x,int16_t *y)__attribute__((always_inline)); -static inline void idft16(int16_t *x,int16_t *y) { +#elif defined(__arm__) + int16x8_t *tw16a_128=(int16x8_t *)tw16,*tw16b_128=(int16x8_t *)tw16c,*x128=(int16x8_t *)x,*y128=(int16x8_t *)y; - __m128i ytmp[4],*tw16_128=(__m128i *)tw16,*x128=(__m128i *)x,*y128=(__m128i *)y; + /* This is the original version before unrolling - - ibfly4_tw1(x128,x128+1,x128+2,x128+3, + bfly4_tw1(x128,x128+1,x128+2,x128+3, y128,y128+1,y128+2,y128+3); transpose16(y128,ytmp); - ibfly4(ytmp,ytmp+1,ytmp+2,ytmp+3, - y128,y128+1,y128+2,y128+3, - tw16_128,tw16_128+1,tw16_128+2); + bfly4_16(ytmp,ytmp+1,ytmp+2,ytmp+3, + y128,y128+1,y128+2,y128+3, + tw16_128,tw16_128+1,tw16_128+2); + */ + + register int16x8_t x1_flip,x3_flip,x02t,x13t; + register int16x8_t xtmp0,xtmp1,xtmp2,xtmp3; + register uint32x4x2_t ytmp0,ytmp1; + register int16x8_t ytmp0b,ytmp1b,ytmp2b,ytmp3b; -} -*/ + // First stage : 4 Radix-4 butterflies without input twiddles + x02t = 
vqaddq_s16(x128[0],x128[2]); + x13t = vqaddq_s16(x128[1],x128[3]); + xtmp0 = vqaddq_s16(x02t,x13t); + xtmp2 = vqsubq_s16(x02t,x13t); + x1_flip = vrev32q_s16(vmulq_s16(x128[1],*(int16x8_t*)conjugatedft)); + x3_flip = vrev32q_s16(vmulq_s16(x128[3],*(int16x8_t*)conjugatedft)); + x02t = vqsubq_s16(x128[0],x128[2]); + x13t = vqsubq_s16(x1_flip,x3_flip); + xtmp3 = vqaddq_s16(x02t,x13t); // x0 + x1f - x2 - x3f + xtmp1 = vqsubq_s16(x02t,x13t); // x0 - x1f - x2 + x3f + + ytmp0 = vtrnq_u32((uint32x4_t)(xtmp0),(uint32x4_t)(xtmp1)); +// y0[0] = [x00 x10 x02 x12], y0[1] = [x01 x11 x03 x13] + ytmp1 = vtrnq_u32((uint32x4_t)(xtmp2),(uint32x4_t)(xtmp3)); +// y1[0] = [x20 x30 x22 x32], y1[1] = [x21 x31 x23 x33] + + + ytmp0b = vcombine_s16(vget_low_s16((int16x8_t)ytmp0.val[0]),vget_low_s16((int16x8_t)ytmp1.val[0])); +// y0 = [x00 x10 x20 x30] + ytmp1b = vcombine_s16(vget_low_s16((int16x8_t)ytmp0.val[1]),vget_low_s16((int16x8_t)ytmp1.val[1])); +// t1 = [x01 x11 x21 x31] + ytmp2b = vcombine_s16(vget_high_s16((int16x8_t)ytmp0.val[0]),vget_high_s16((int16x8_t)ytmp1.val[0])); +// t2 = [x02 x12 x22 x32] + ytmp3b = vcombine_s16(vget_high_s16((int16x8_t)ytmp0.val[1]),vget_high_s16((int16x8_t)ytmp1.val[1])); +// t3 = [x03 x13 x23 x33] + // Second stage : 4 Radix-4 butterflies with input twiddles + xtmp1 = packed_cmult2(ytmp1b,tw16a_128[0],tw16b_128[0]); + xtmp2 = packed_cmult2(ytmp2b,tw16a_128[1],tw16b_128[1]); + xtmp3 = packed_cmult2(ytmp3b,tw16a_128[2],tw16b_128[2]); + + x02t = vqaddq_s16(ytmp0b,xtmp2); + x13t = vqaddq_s16(xtmp1,xtmp3); + y128[0] = vqaddq_s16(x02t,x13t); + y128[2] = vqsubq_s16(x02t,x13t); + x1_flip = vrev32q_s16(vmulq_s16(xtmp1,*(int16x8_t*)conjugatedft)); + x3_flip = vrev32q_s16(vmulq_s16(xtmp3,*(int16x8_t*)conjugatedft)); + x02t = vqsubq_s16(ytmp0b,xtmp2); + x13t = vqsubq_s16(x1_flip,x3_flip); + y128[3] = vqaddq_s16(x02t,x13t); // x0 + x1f - x2 - x3f + y128[1] = vqsubq_s16(x02t,x13t); // x0 - x1f - x2 + x3f -// 64-point optimized DFT kernel +#endif +} + + +// 64-point optimized DFT int16_t tw64[96] __attribute__((aligned(16))) = { 32767,0,32609,-3212,32137,-6393,31356,-9512,30272,-12540,28897,-15447,27244,-18205,25329,-20788,23169,-23170,20787,-25330,18204,-27245,15446,-28898,12539,-30273,9511,-31357,6392,-32138,3211,-32610, 32767,0,32137,-6393,30272,-12540,27244,-18205,23169,-23170,18204,-27245,12539,-30273,6392,-32138,0,-32767,-6393,-32138,-12540,-30273,-18205,-27245,-23170,-23170,-27245,-18205,-30273,-12540,-32138,-6393, @@ -1005,11 +1700,26 @@ int16_t tw64c[96] __attribute__((aligned(16))) = { 0,32767,3212,32609,6393,32137 }; +#if defined(__x86_64__) || defined(__i386__) +#define simd_q15_t __m128i +#define simdshort_q15_t __m64 +#define shiftright_int16(a,shift) _mm_srai_epi16(a,shift) +#define set1_int16(a) _mm_set1_epi16(a); +#define mulhi_int16(a,b) _mm_slli_epi16(_mm_mulhi_epi16(a,b),1); +#elif defined(__arm__) +#define simd_q15_t int16x8_t +#define simdshort_q15_t int16x4_t +#define shiftright_int16(a,shift) vshrq_n_s16(a,shift) +#define set1_int16(a) vdupq_n_s16(a) +#define mulhi_int16(a,b) vqdmulhq_s16(a,b); +#define _mm_empty() +#define _m_empty() +#endif void dft64(int16_t *x,int16_t *y,int scale) { - __m128i xtmp[16],ytmp[16],*tw64a_128=(__m128i *)tw64a,*tw64b_128=(__m128i *)tw64b,*x128=(__m128i *)x,*y128=(__m128i *)y; + simd_q15_t xtmp[16],ytmp[16],*tw64a_128=(simd_q15_t *)tw64a,*tw64b_128=(simd_q15_t *)tw64b,*x128=(simd_q15_t *)x,*y128=(simd_q15_t *)y; #ifdef D64STATS @@ -1073,24 +1783,22 @@ void dft64(int16_t *x,int16_t *y,int scale) if (scale>0) { - - y128[0] = 
_mm_srai_epi16(y128[0],3); - y128[1] = _mm_srai_epi16(y128[1],3); - y128[2] = _mm_srai_epi16(y128[2],3); - y128[3] = _mm_srai_epi16(y128[3],3); - y128[4] = _mm_srai_epi16(y128[4],3); - y128[5] = _mm_srai_epi16(y128[5],3); - y128[6] = _mm_srai_epi16(y128[6],3); - y128[7] = _mm_srai_epi16(y128[7],3); - y128[8] = _mm_srai_epi16(y128[8],3); - y128[9] = _mm_srai_epi16(y128[9],3); - y128[10] = _mm_srai_epi16(y128[10],3); - y128[11] = _mm_srai_epi16(y128[11],3); - y128[12] = _mm_srai_epi16(y128[12],3); - y128[13] = _mm_srai_epi16(y128[13],3); - y128[14] = _mm_srai_epi16(y128[14],3); - y128[15] = _mm_srai_epi16(y128[15],3); - + y128[0] = shiftright_int16(y128[0],3); + y128[1] = shiftright_int16(y128[1],3); + y128[2] = shiftright_int16(y128[2],3); + y128[3] = shiftright_int16(y128[3],3); + y128[4] = shiftright_int16(y128[4],3); + y128[5] = shiftright_int16(y128[5],3); + y128[6] = shiftright_int16(y128[6],3); + y128[7] = shiftright_int16(y128[7],3); + y128[8] = shiftright_int16(y128[8],3); + y128[9] = shiftright_int16(y128[9],3); + y128[10] = shiftright_int16(y128[10],3); + y128[11] = shiftright_int16(y128[11],3); + y128[12] = shiftright_int16(y128[12],3); + y128[13] = shiftright_int16(y128[13],3); + y128[14] = shiftright_int16(y128[14],3); + y128[15] = shiftright_int16(y128[15],3); } _mm_empty(); @@ -1101,7 +1809,7 @@ void dft64(int16_t *x,int16_t *y,int scale) void idft64(int16_t *x,int16_t *y,int scale) { - __m128i xtmp[16],ytmp[16],*tw64a_128=(__m128i *)tw64,*tw64b_128=(__m128i *)tw64c,*x128=(__m128i *)x,*y128=(__m128i *)y; + simd_q15_t xtmp[16],ytmp[16],*tw64a_128=(simd_q15_t *)tw64,*tw64b_128=(simd_q15_t *)tw64c,*x128=(simd_q15_t *)x,*y128=(simd_q15_t *)y; #ifdef D64STATS @@ -1166,22 +1874,22 @@ void idft64(int16_t *x,int16_t *y,int scale) if (scale>0) { - y128[0] = _mm_srai_epi16(y128[0],3); - y128[1] = _mm_srai_epi16(y128[1],3); - y128[2] = _mm_srai_epi16(y128[2],3); - y128[3] = _mm_srai_epi16(y128[3],3); - y128[4] = _mm_srai_epi16(y128[4],3); - y128[5] = _mm_srai_epi16(y128[5],3); - y128[6] = _mm_srai_epi16(y128[6],3); - y128[7] = _mm_srai_epi16(y128[7],3); - y128[8] = _mm_srai_epi16(y128[8],3); - y128[9] = _mm_srai_epi16(y128[9],3); - y128[10] = _mm_srai_epi16(y128[10],3); - y128[11] = _mm_srai_epi16(y128[11],3); - y128[12] = _mm_srai_epi16(y128[12],3); - y128[13] = _mm_srai_epi16(y128[13],3); - y128[14] = _mm_srai_epi16(y128[14],3); - y128[15] = _mm_srai_epi16(y128[15],3); + y128[0] = shiftright_int16(y128[0],3); + y128[1] = shiftright_int16(y128[1],3); + y128[2] = shiftright_int16(y128[2],3); + y128[3] = shiftright_int16(y128[3],3); + y128[4] = shiftright_int16(y128[4],3); + y128[5] = shiftright_int16(y128[5],3); + y128[6] = shiftright_int16(y128[6],3); + y128[7] = shiftright_int16(y128[7],3); + y128[8] = shiftright_int16(y128[8],3); + y128[9] = shiftright_int16(y128[9],3); + y128[10] = shiftright_int16(y128[10],3); + y128[11] = shiftright_int16(y128[11],3); + y128[12] = shiftright_int16(y128[12],3); + y128[13] = shiftright_int16(y128[13],3); + y128[14] = shiftright_int16(y128[14],3); + y128[15] = shiftright_int16(y128[15],3); } @@ -1191,64 +1899,6 @@ void idft64(int16_t *x,int16_t *y,int scale) } -/* -void idft64(int16_t *x,int16_t *y,int scale) { - - __m128i xtmp[16],ytmp[16],*tw64_128=(__m128i *)tw64,*x128=(__m128i *)x,*y128=(__m128i *)y; - - transpose16_ooff(x128,xtmp,4); - transpose16_ooff(x128+4,xtmp+1,4); - transpose16_ooff(x128+8,xtmp+2,4); - transpose16_ooff(x128+12,xtmp+3,4); - - idft16((int16_t*)(xtmp),(int16_t*)ytmp); - idft16((int16_t*)(xtmp+4),(int16_t*)(ytmp+4)); - 
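// Hedged note on the scale>0 branch of dft64/idft64: shifting every Q1.15
// output right by 3 divides by 8 = sqrt(64), which keeps the fixed-point
// 64-point transform approximately unitary. Scalar sketch (scale_q15 is an
// illustrative name, not this file's API):
#include <stdint.h>
static inline void scale_q15(int16_t *y, int n_cpx, int shift)
{
  for (int i = 0; i < 2 * n_cpx; i++)  // interleaved re/im samples
    y[i] >>= shift;                    // shift = 3 for the 64-point DFT
}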
idft16((int16_t*)(xtmp+8),(int16_t*)(ytmp+8)); - idft16((int16_t*)(xtmp+12),(int16_t*)(ytmp+12)); - - - ibfly4(ytmp,ytmp+4,ytmp+8,ytmp+12, - y128,y128+4,y128+8,y128+12, - tw64_128,tw64_128+4,tw64_128+8); - - ibfly4(ytmp+1,ytmp+5,ytmp+9,ytmp+13, - y128+1,y128+5,y128+9,y128+13, - tw64_128+1,tw64_128+5,tw64_128+9); - - ibfly4(ytmp+2,ytmp+6,ytmp+10,ytmp+14, - y128+2,y128+6,y128+10,y128+14, - tw64_128+2,tw64_128+6,tw64_128+10); - - ibfly4(ytmp+3,ytmp+7,ytmp+11,ytmp+15, - y128+3,y128+7,y128+11,y128+15, - tw64_128+3,tw64_128+7,tw64_128+11); - - if (scale>0) { - - y128[0] = _mm_srai_epi16(y128[0],3); - y128[1] = _mm_srai_epi16(y128[1],3); - y128[2] = _mm_srai_epi16(y128[2],3); - y128[3] = _mm_srai_epi16(y128[3],3); - y128[4] = _mm_srai_epi16(y128[4],3); - y128[5] = _mm_srai_epi16(y128[5],3); - y128[6] = _mm_srai_epi16(y128[6],3); - y128[7] = _mm_srai_epi16(y128[7],3); - y128[8] = _mm_srai_epi16(y128[8],3); - y128[9] = _mm_srai_epi16(y128[9],3); - y128[10] = _mm_srai_epi16(y128[10],3); - y128[11] = _mm_srai_epi16(y128[11],3); - y128[12] = _mm_srai_epi16(y128[12],3); - y128[13] = _mm_srai_epi16(y128[13],3); - y128[14] = _mm_srai_epi16(y128[14],3); - y128[15] = _mm_srai_epi16(y128[15],3); - - } - _mm_empty(); - _m_empty(); - -} -*/ - int16_t tw128[128] __attribute__((aligned(16))) = { 32767,0,32727,-1608,32609,-3212,32412,-4808,32137,-6393,31785,-7962,31356,-9512,30851,-11039,30272,-12540,29621,-14010,28897,-15447,28105,-16846,27244,-18205,26318,-19520,25329,-20788,24278,-22005,23169,-23170,22004,-24279,20787,-25330,19519,-26319,18204,-27245,16845,-28106,15446,-28898,14009,-29622,12539,-30273,11038,-30852,9511,-31357,7961,-31786,6392,-32138,4807,-32413,3211,-32610,1607,-32728,0,-32767,-1608,-32728,-3212,-32610,-4808,-32413,-6393,-32138,-7962,-31786,-9512,-31357,-11039,-30852,-12540,-30273,-14010,-29622,-15447,-28898,-16846,-28106,-18205,-27245,-19520,-26319,-20788,-25330,-22005,-24279,-23170,-23170,-24279,-22005,-25330,-20788,-26319,-19520,-27245,-18205,-28106,-16846,-28898,-15447,-29622,-14010,-30273,-12540,-30852,-11039,-31357,-9512,-31786,-7962,-32138,-6393,-32413,-4808,-32610,-3212,-32728,-1608}; int16_t tw128a[128] __attribute__((aligned(16))) = { 32767,0,32727,1608,32609,3212,32412,4808,32137,6393,31785,7962,31356,9512,30851,11039,30272,12540,29621,14010,28897,15447,28105,16846,27244,18205,26318,19520,25329,20788,24278,22005,23169,23170,22004,24279,20787,25330,19519,26319,18204,27245,16845,28106,15446,28898,14009,29622,12539,30273,11038,30852,9511,31357,7961,31786,6392,32138,4807,32413,3211,32610,1607,32728,0,32767,-1608,32728,-3212,32610,-4808,32413,-6393,32138,-7962,31786,-9512,31357,-11039,30852,-12540,30273,-14010,29622,-15447,28898,-16846,28106,-18205,27245,-19520,26319,-20788,25330,-22005,24279,-23170,23170,-24279,22005,-25330,20788,-26319,19520,-27245,18205,-28106,16846,-28898,15447,-29622,14010,-30273,12540,-30852,11039,-31357,9512,-31786,7962,-32138,6393,-32413,4808,-32610,3212,-32728,1608}; @@ -1260,18 +1910,12 @@ int16_t tw128c[128] __attribute__((aligned(16))) = {0,32767,1608,32727,3212,3260 void dft128(int16_t *x,int16_t *y,int scale) { - __m64 xtmp[64],*x64 = (__m64 *)x; - __m128i ytmp[32],*tw128a_128p=(__m128i *)tw128a,*tw128b_128p=(__m128i *)tw128b,*y128=(__m128i *)y,*y128p=(__m128i *)y; - __m128i *ytmpp = &ytmp[0]; + simdshort_q15_t xtmp[64],*x64 = (simdshort_q15_t *)x; + simd_q15_t ytmp[32],*tw128a_128p=(simd_q15_t *)tw128a,*tw128b_128p=(simd_q15_t *)tw128b,*y128=(simd_q15_t *)y,*y128p=(simd_q15_t *)y; + simd_q15_t *ytmpp = &ytmp[0]; int i; - __m128i ONE_OVER_SQRT2_Q15_128 = 
_mm_set_epi16(ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15); + simd_q15_t ONE_OVER_SQRT2_Q15_128 = set1_int16(ONE_OVER_SQRT2_Q15); + transpose4_ooff(x64 ,xtmp,32); transpose4_ooff(x64+2,xtmp+1,32); @@ -1323,71 +1967,39 @@ void dft128(int16_t *x,int16_t *y,int scale) if (scale>0) { - y128[0] = _mm_mulhi_epi16(y128[0],ONE_OVER_SQRT2_Q15_128); - y128[0] = _mm_slli_epi16(y128[0],1); - y128[1] = _mm_mulhi_epi16(y128[1],ONE_OVER_SQRT2_Q15_128); - y128[1] = _mm_slli_epi16(y128[1],1); - y128[2] = _mm_mulhi_epi16(y128[2],ONE_OVER_SQRT2_Q15_128); - y128[2] = _mm_slli_epi16(y128[2],1); - y128[3] = _mm_mulhi_epi16(y128[3],ONE_OVER_SQRT2_Q15_128); - y128[3] = _mm_slli_epi16(y128[3],1); - y128[4] = _mm_mulhi_epi16(y128[4],ONE_OVER_SQRT2_Q15_128); - y128[4] = _mm_slli_epi16(y128[4],1); - y128[5] = _mm_mulhi_epi16(y128[5],ONE_OVER_SQRT2_Q15_128); - y128[5] = _mm_slli_epi16(y128[5],1); - y128[6] = _mm_mulhi_epi16(y128[6],ONE_OVER_SQRT2_Q15_128); - y128[6] = _mm_slli_epi16(y128[6],1); - y128[7] = _mm_mulhi_epi16(y128[7],ONE_OVER_SQRT2_Q15_128); - y128[7] = _mm_slli_epi16(y128[7],1); - y128[8] = _mm_mulhi_epi16(y128[8],ONE_OVER_SQRT2_Q15_128); - y128[8] = _mm_slli_epi16(y128[8],1); - y128[9] = _mm_mulhi_epi16(y128[9],ONE_OVER_SQRT2_Q15_128); - y128[9] = _mm_slli_epi16(y128[9],1); - y128[10] = _mm_mulhi_epi16(y128[10],ONE_OVER_SQRT2_Q15_128); - y128[10] = _mm_slli_epi16(y128[10],1); - y128[11] = _mm_mulhi_epi16(y128[11],ONE_OVER_SQRT2_Q15_128); - y128[11] = _mm_slli_epi16(y128[11],1); - y128[12] = _mm_mulhi_epi16(y128[12],ONE_OVER_SQRT2_Q15_128); - y128[12] = _mm_slli_epi16(y128[12],1); - y128[13] = _mm_mulhi_epi16(y128[13],ONE_OVER_SQRT2_Q15_128); - y128[13] = _mm_slli_epi16(y128[13],1); - y128[14] = _mm_mulhi_epi16(y128[14],ONE_OVER_SQRT2_Q15_128); - y128[14] = _mm_slli_epi16(y128[14],1); - y128[15] = _mm_mulhi_epi16(y128[15],ONE_OVER_SQRT2_Q15_128); - y128[15] = _mm_slli_epi16(y128[15],1); - - y128[16] = _mm_mulhi_epi16(y128[16],ONE_OVER_SQRT2_Q15_128); - y128[16] = _mm_slli_epi16(y128[16],1); - y128[17] = _mm_mulhi_epi16(y128[17],ONE_OVER_SQRT2_Q15_128); - y128[17] = _mm_slli_epi16(y128[17],1); - y128[18] = _mm_mulhi_epi16(y128[18],ONE_OVER_SQRT2_Q15_128); - y128[18] = _mm_slli_epi16(y128[18],1); - y128[19] = _mm_mulhi_epi16(y128[19],ONE_OVER_SQRT2_Q15_128); - y128[19] = _mm_slli_epi16(y128[19],1); - y128[20] = _mm_mulhi_epi16(y128[20],ONE_OVER_SQRT2_Q15_128); - y128[20] = _mm_slli_epi16(y128[20],1); - y128[21] = _mm_mulhi_epi16(y128[21],ONE_OVER_SQRT2_Q15_128); - y128[21] = _mm_slli_epi16(y128[21],1); - y128[22] = _mm_mulhi_epi16(y128[22],ONE_OVER_SQRT2_Q15_128); - y128[22] = _mm_slli_epi16(y128[22],1); - y128[23] = _mm_mulhi_epi16(y128[23],ONE_OVER_SQRT2_Q15_128); - y128[23] = _mm_slli_epi16(y128[23],1); - y128[24] = _mm_mulhi_epi16(y128[24],ONE_OVER_SQRT2_Q15_128); - y128[24] = _mm_slli_epi16(y128[24],1); - y128[25] = _mm_mulhi_epi16(y128[25],ONE_OVER_SQRT2_Q15_128); - y128[25] = _mm_slli_epi16(y128[25],1); - y128[26] = _mm_mulhi_epi16(y128[26],ONE_OVER_SQRT2_Q15_128); - y128[26] = _mm_slli_epi16(y128[26],1); - y128[27] = _mm_mulhi_epi16(y128[27],ONE_OVER_SQRT2_Q15_128); - y128[27] = _mm_slli_epi16(y128[27],1); - y128[28] = _mm_mulhi_epi16(y128[28],ONE_OVER_SQRT2_Q15_128); - y128[28] = _mm_slli_epi16(y128[28],1); - y128[29] = _mm_mulhi_epi16(y128[29],ONE_OVER_SQRT2_Q15_128); - y128[29] = _mm_slli_epi16(y128[29],1); - y128[30] = _mm_mulhi_epi16(y128[30],ONE_OVER_SQRT2_Q15_128); 
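// Hedged note on the mulhi_int16 idiom replacing the pairs above: a Q1.15
// product needs (a*b)>>15, but SSE only offers (a*b)>>16 via _mm_mulhi_epi16,
// hence the extra left shift by 1; NEON's vqdmulhq_s16 is the one-instruction
// saturating doubling-high multiply. Scalar equivalent (illustrative):
#include <stdint.h>
static inline int16_t q15_mul(int16_t a, int16_t b)
{
  return (int16_t)(((int32_t)a * b) >> 15);  // b = 23170 ~ 1/sqrt(2) here
}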
- y128[30] = _mm_slli_epi16(y128[30],1); - y128[31] = _mm_mulhi_epi16(y128[31],ONE_OVER_SQRT2_Q15_128); - y128[31] = _mm_slli_epi16(y128[31],1); + y128[0] = mulhi_int16(y128[0],ONE_OVER_SQRT2_Q15_128); + y128[1] = mulhi_int16(y128[1],ONE_OVER_SQRT2_Q15_128); + y128[2] = mulhi_int16(y128[2],ONE_OVER_SQRT2_Q15_128); + y128[3] = mulhi_int16(y128[3],ONE_OVER_SQRT2_Q15_128); + y128[4] = mulhi_int16(y128[4],ONE_OVER_SQRT2_Q15_128); + y128[5] = mulhi_int16(y128[5],ONE_OVER_SQRT2_Q15_128); + y128[6] = mulhi_int16(y128[6],ONE_OVER_SQRT2_Q15_128); + y128[7] = mulhi_int16(y128[7],ONE_OVER_SQRT2_Q15_128); + y128[8] = mulhi_int16(y128[8],ONE_OVER_SQRT2_Q15_128); + y128[9] = mulhi_int16(y128[9],ONE_OVER_SQRT2_Q15_128); + y128[10] = mulhi_int16(y128[10],ONE_OVER_SQRT2_Q15_128); + y128[11] = mulhi_int16(y128[11],ONE_OVER_SQRT2_Q15_128); + y128[12] = mulhi_int16(y128[12],ONE_OVER_SQRT2_Q15_128); + y128[13] = mulhi_int16(y128[13],ONE_OVER_SQRT2_Q15_128); + y128[14] = mulhi_int16(y128[14],ONE_OVER_SQRT2_Q15_128); + y128[15] = mulhi_int16(y128[15],ONE_OVER_SQRT2_Q15_128); + y128[16] = mulhi_int16(y128[16],ONE_OVER_SQRT2_Q15_128); + y128[17] = mulhi_int16(y128[17],ONE_OVER_SQRT2_Q15_128); + y128[18] = mulhi_int16(y128[18],ONE_OVER_SQRT2_Q15_128); + y128[19] = mulhi_int16(y128[19],ONE_OVER_SQRT2_Q15_128); + y128[20] = mulhi_int16(y128[20],ONE_OVER_SQRT2_Q15_128); + y128[21] = mulhi_int16(y128[21],ONE_OVER_SQRT2_Q15_128); + y128[22] = mulhi_int16(y128[22],ONE_OVER_SQRT2_Q15_128); + y128[23] = mulhi_int16(y128[23],ONE_OVER_SQRT2_Q15_128); + y128[24] = mulhi_int16(y128[24],ONE_OVER_SQRT2_Q15_128); + y128[25] = mulhi_int16(y128[25],ONE_OVER_SQRT2_Q15_128); + y128[26] = mulhi_int16(y128[26],ONE_OVER_SQRT2_Q15_128); + y128[27] = mulhi_int16(y128[27],ONE_OVER_SQRT2_Q15_128); + y128[28] = mulhi_int16(y128[28],ONE_OVER_SQRT2_Q15_128); + y128[29] = mulhi_int16(y128[29],ONE_OVER_SQRT2_Q15_128); + y128[30] = mulhi_int16(y128[30],ONE_OVER_SQRT2_Q15_128); + y128[31] = mulhi_int16(y128[31],ONE_OVER_SQRT2_Q15_128); + } @@ -1399,18 +2011,12 @@ void dft128(int16_t *x,int16_t *y,int scale) void idft128(int16_t *x,int16_t *y,int scale) { - __m64 xtmp[64],*x64 = (__m64 *)x; - __m128i ytmp[32],*tw128_128p=(__m128i *)tw128,*y128=(__m128i *)y,*y128p=(__m128i *)y; - __m128i *ytmpp = &ytmp[0]; + simdshort_q15_t xtmp[64],*x64 = (simdshort_q15_t *)x; + simd_q15_t ytmp[32],*tw128_128p=(simd_q15_t *)tw128,*y128=(simd_q15_t *)y,*y128p=(simd_q15_t *)y; + simd_q15_t *ytmpp = &ytmp[0]; int i; - __m128i ONE_OVER_SQRT2_Q15_128 = _mm_set_epi16(ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15); + simd_q15_t ONE_OVER_SQRT2_Q15_128 = set1_int16(ONE_OVER_SQRT2_Q15); + transpose4_ooff(x64 ,xtmp,32); transpose4_ooff(x64+2,xtmp+1,32); @@ -1460,71 +2066,38 @@ void idft128(int16_t *x,int16_t *y,int scale) if (scale>0) { - y128[0] = _mm_mulhi_epi16(y128[0],ONE_OVER_SQRT2_Q15_128); - y128[0] = _mm_slli_epi16(y128[0],1); - y128[1] = _mm_mulhi_epi16(y128[1],ONE_OVER_SQRT2_Q15_128); - y128[1] = _mm_slli_epi16(y128[1],1); - y128[2] = _mm_mulhi_epi16(y128[2],ONE_OVER_SQRT2_Q15_128); - y128[2] = _mm_slli_epi16(y128[2],1); - y128[3] = _mm_mulhi_epi16(y128[3],ONE_OVER_SQRT2_Q15_128); - y128[3] = _mm_slli_epi16(y128[3],1); - y128[4] = _mm_mulhi_epi16(y128[4],ONE_OVER_SQRT2_Q15_128); - y128[4] = _mm_slli_epi16(y128[4],1); - y128[5] = _mm_mulhi_epi16(y128[5],ONE_OVER_SQRT2_Q15_128); - y128[5] = _mm_slli_epi16(y128[5],1); - y128[6] = 
_mm_mulhi_epi16(y128[6],ONE_OVER_SQRT2_Q15_128); - y128[6] = _mm_slli_epi16(y128[6],1); - y128[7] = _mm_mulhi_epi16(y128[7],ONE_OVER_SQRT2_Q15_128); - y128[7] = _mm_slli_epi16(y128[7],1); - y128[8] = _mm_mulhi_epi16(y128[8],ONE_OVER_SQRT2_Q15_128); - y128[8] = _mm_slli_epi16(y128[8],1); - y128[9] = _mm_mulhi_epi16(y128[9],ONE_OVER_SQRT2_Q15_128); - y128[9] = _mm_slli_epi16(y128[9],1); - y128[10] = _mm_mulhi_epi16(y128[10],ONE_OVER_SQRT2_Q15_128); - y128[10] = _mm_slli_epi16(y128[10],1); - y128[11] = _mm_mulhi_epi16(y128[11],ONE_OVER_SQRT2_Q15_128); - y128[11] = _mm_slli_epi16(y128[11],1); - y128[12] = _mm_mulhi_epi16(y128[12],ONE_OVER_SQRT2_Q15_128); - y128[12] = _mm_slli_epi16(y128[12],1); - y128[13] = _mm_mulhi_epi16(y128[13],ONE_OVER_SQRT2_Q15_128); - y128[13] = _mm_slli_epi16(y128[13],1); - y128[14] = _mm_mulhi_epi16(y128[14],ONE_OVER_SQRT2_Q15_128); - y128[14] = _mm_slli_epi16(y128[14],1); - y128[15] = _mm_mulhi_epi16(y128[15],ONE_OVER_SQRT2_Q15_128); - y128[15] = _mm_slli_epi16(y128[15],1); - - y128[16] = _mm_mulhi_epi16(y128[16],ONE_OVER_SQRT2_Q15_128); - y128[16] = _mm_slli_epi16(y128[16],1); - y128[17] = _mm_mulhi_epi16(y128[17],ONE_OVER_SQRT2_Q15_128); - y128[17] = _mm_slli_epi16(y128[17],1); - y128[18] = _mm_mulhi_epi16(y128[18],ONE_OVER_SQRT2_Q15_128); - y128[18] = _mm_slli_epi16(y128[18],1); - y128[19] = _mm_mulhi_epi16(y128[19],ONE_OVER_SQRT2_Q15_128); - y128[19] = _mm_slli_epi16(y128[19],1); - y128[20] = _mm_mulhi_epi16(y128[20],ONE_OVER_SQRT2_Q15_128); - y128[20] = _mm_slli_epi16(y128[20],1); - y128[21] = _mm_mulhi_epi16(y128[21],ONE_OVER_SQRT2_Q15_128); - y128[21] = _mm_slli_epi16(y128[21],1); - y128[22] = _mm_mulhi_epi16(y128[22],ONE_OVER_SQRT2_Q15_128); - y128[22] = _mm_slli_epi16(y128[22],1); - y128[23] = _mm_mulhi_epi16(y128[23],ONE_OVER_SQRT2_Q15_128); - y128[23] = _mm_slli_epi16(y128[23],1); - y128[24] = _mm_mulhi_epi16(y128[24],ONE_OVER_SQRT2_Q15_128); - y128[24] = _mm_slli_epi16(y128[24],1); - y128[25] = _mm_mulhi_epi16(y128[25],ONE_OVER_SQRT2_Q15_128); - y128[25] = _mm_slli_epi16(y128[25],1); - y128[26] = _mm_mulhi_epi16(y128[26],ONE_OVER_SQRT2_Q15_128); - y128[26] = _mm_slli_epi16(y128[26],1); - y128[27] = _mm_mulhi_epi16(y128[27],ONE_OVER_SQRT2_Q15_128); - y128[27] = _mm_slli_epi16(y128[27],1); - y128[28] = _mm_mulhi_epi16(y128[28],ONE_OVER_SQRT2_Q15_128); - y128[28] = _mm_slli_epi16(y128[28],1); - y128[29] = _mm_mulhi_epi16(y128[29],ONE_OVER_SQRT2_Q15_128); - y128[29] = _mm_slli_epi16(y128[29],1); - y128[30] = _mm_mulhi_epi16(y128[30],ONE_OVER_SQRT2_Q15_128); - y128[30] = _mm_slli_epi16(y128[30],1); - y128[31] = _mm_mulhi_epi16(y128[31],ONE_OVER_SQRT2_Q15_128); - y128[31] = _mm_slli_epi16(y128[31],1); + y128[0] = mulhi_int16(y128[0],ONE_OVER_SQRT2_Q15_128); + y128[1] = mulhi_int16(y128[1],ONE_OVER_SQRT2_Q15_128); + y128[2] = mulhi_int16(y128[2],ONE_OVER_SQRT2_Q15_128); + y128[3] = mulhi_int16(y128[3],ONE_OVER_SQRT2_Q15_128); + y128[4] = mulhi_int16(y128[4],ONE_OVER_SQRT2_Q15_128); + y128[5] = mulhi_int16(y128[5],ONE_OVER_SQRT2_Q15_128); + y128[6] = mulhi_int16(y128[6],ONE_OVER_SQRT2_Q15_128); + y128[7] = mulhi_int16(y128[7],ONE_OVER_SQRT2_Q15_128); + y128[8] = mulhi_int16(y128[8],ONE_OVER_SQRT2_Q15_128); + y128[9] = mulhi_int16(y128[9],ONE_OVER_SQRT2_Q15_128); + y128[10] = mulhi_int16(y128[10],ONE_OVER_SQRT2_Q15_128); + y128[11] = mulhi_int16(y128[11],ONE_OVER_SQRT2_Q15_128); + y128[12] = mulhi_int16(y128[12],ONE_OVER_SQRT2_Q15_128); + y128[13] = mulhi_int16(y128[13],ONE_OVER_SQRT2_Q15_128); + y128[14] = mulhi_int16(y128[14],ONE_OVER_SQRT2_Q15_128); + y128[15] 
= mulhi_int16(y128[15],ONE_OVER_SQRT2_Q15_128); + y128[16] = mulhi_int16(y128[16],ONE_OVER_SQRT2_Q15_128); + y128[17] = mulhi_int16(y128[17],ONE_OVER_SQRT2_Q15_128); + y128[18] = mulhi_int16(y128[18],ONE_OVER_SQRT2_Q15_128); + y128[19] = mulhi_int16(y128[19],ONE_OVER_SQRT2_Q15_128); + y128[20] = mulhi_int16(y128[20],ONE_OVER_SQRT2_Q15_128); + y128[21] = mulhi_int16(y128[21],ONE_OVER_SQRT2_Q15_128); + y128[22] = mulhi_int16(y128[22],ONE_OVER_SQRT2_Q15_128); + y128[23] = mulhi_int16(y128[23],ONE_OVER_SQRT2_Q15_128); + y128[24] = mulhi_int16(y128[24],ONE_OVER_SQRT2_Q15_128); + y128[25] = mulhi_int16(y128[25],ONE_OVER_SQRT2_Q15_128); + y128[26] = mulhi_int16(y128[26],ONE_OVER_SQRT2_Q15_128); + y128[27] = mulhi_int16(y128[27],ONE_OVER_SQRT2_Q15_128); + y128[28] = mulhi_int16(y128[28],ONE_OVER_SQRT2_Q15_128); + y128[29] = mulhi_int16(y128[29],ONE_OVER_SQRT2_Q15_128); + y128[30] = mulhi_int16(y128[30],ONE_OVER_SQRT2_Q15_128); + y128[31] = mulhi_int16(y128[31],ONE_OVER_SQRT2_Q15_128); } @@ -1552,8 +2125,8 @@ int16_t tw256b[384] __attribute__((aligned(16))) = {0,32767,-805,32757,-1608,327 void dft256(int16_t *x,int16_t *y,int scale) { - __m128i xtmp[64],ytmp[64],*tw256a_128p=(__m128i *)tw256a,*tw256b_128p=(__m128i *)tw256b,*x128=(__m128i *)x,*y128=(__m128i *)y,*y128p=(__m128i *)y; - __m128i *ytmpp = &ytmp[0]; + simd_q15_t xtmp[64],ytmp[64],*tw256a_128p=(simd_q15_t *)tw256a,*tw256b_128p=(simd_q15_t *)tw256b,*x128=(simd_q15_t *)x,*y128=(simd_q15_t *)y,*y128p=(simd_q15_t *)y; + simd_q15_t *ytmpp = &ytmp[0]; int i; #ifdef D256STATS @@ -1632,22 +2205,22 @@ void dft256(int16_t *x,int16_t *y,int scale) if (scale>0) { for (i=0; i<4; i++) { - y128[0] = _mm_srai_epi16(y128[0],1); - y128[1] = _mm_srai_epi16(y128[1],1); - y128[2] = _mm_srai_epi16(y128[2],1); - y128[3] = _mm_srai_epi16(y128[3],1); - y128[4] = _mm_srai_epi16(y128[4],1); - y128[5] = _mm_srai_epi16(y128[5],1); - y128[6] = _mm_srai_epi16(y128[6],1); - y128[7] = _mm_srai_epi16(y128[7],1); - y128[8] = _mm_srai_epi16(y128[8],1); - y128[9] = _mm_srai_epi16(y128[9],1); - y128[10] = _mm_srai_epi16(y128[10],1); - y128[11] = _mm_srai_epi16(y128[11],1); - y128[12] = _mm_srai_epi16(y128[12],1); - y128[13] = _mm_srai_epi16(y128[13],1); - y128[14] = _mm_srai_epi16(y128[14],1); - y128[15] = _mm_srai_epi16(y128[15],1); + y128[0] = shiftright_int16(y128[0],1); + y128[1] = shiftright_int16(y128[1],1); + y128[2] = shiftright_int16(y128[2],1); + y128[3] = shiftright_int16(y128[3],1); + y128[4] = shiftright_int16(y128[4],1); + y128[5] = shiftright_int16(y128[5],1); + y128[6] = shiftright_int16(y128[6],1); + y128[7] = shiftright_int16(y128[7],1); + y128[8] = shiftright_int16(y128[8],1); + y128[9] = shiftright_int16(y128[9],1); + y128[10] = shiftright_int16(y128[10],1); + y128[11] = shiftright_int16(y128[11],1); + y128[12] = shiftright_int16(y128[12],1); + y128[13] = shiftright_int16(y128[13],1); + y128[14] = shiftright_int16(y128[14],1); + y128[15] = shiftright_int16(y128[15],1); y128+=16; } @@ -1664,8 +2237,8 @@ void dft256(int16_t *x,int16_t *y,int scale) void idft256(int16_t *x,int16_t *y,int scale) { - __m128i xtmp[64],ytmp[64],*tw256_128p=(__m128i *)tw256,*x128=(__m128i *)x,*y128=(__m128i *)y,*y128p=(__m128i *)y; - __m128i *ytmpp = &ytmp[0]; + simd_q15_t xtmp[64],ytmp[64],*tw256_128p=(simd_q15_t *)tw256,*x128=(simd_q15_t *)x,*y128=(simd_q15_t *)y,*y128p=(simd_q15_t *)y; + simd_q15_t *ytmpp = &ytmp[0]; int i,j; for (i=0,j=0; i<64; i+=4,j++) { @@ -1690,22 +2263,22 @@ void idft256(int16_t *x,int16_t *y,int scale) if (scale>0) { for (i=0; i<4; i++) { - y128[0] = 
_mm_srai_epi16(y128[0],1); - y128[1] = _mm_srai_epi16(y128[1],1); - y128[2] = _mm_srai_epi16(y128[2],1); - y128[3] = _mm_srai_epi16(y128[3],1); - y128[4] = _mm_srai_epi16(y128[4],1); - y128[5] = _mm_srai_epi16(y128[5],1); - y128[6] = _mm_srai_epi16(y128[6],1); - y128[7] = _mm_srai_epi16(y128[7],1); - y128[8] = _mm_srai_epi16(y128[8],1); - y128[9] = _mm_srai_epi16(y128[9],1); - y128[10] = _mm_srai_epi16(y128[10],1); - y128[11] = _mm_srai_epi16(y128[11],1); - y128[12] = _mm_srai_epi16(y128[12],1); - y128[13] = _mm_srai_epi16(y128[13],1); - y128[14] = _mm_srai_epi16(y128[14],1); - y128[15] = _mm_srai_epi16(y128[15],1); + y128[0] = shiftright_int16(y128[0],1); + y128[1] = shiftright_int16(y128[1],1); + y128[2] = shiftright_int16(y128[2],1); + y128[3] = shiftright_int16(y128[3],1); + y128[4] = shiftright_int16(y128[4],1); + y128[5] = shiftright_int16(y128[5],1); + y128[6] = shiftright_int16(y128[6],1); + y128[7] = shiftright_int16(y128[7],1); + y128[8] = shiftright_int16(y128[8],1); + y128[9] = shiftright_int16(y128[9],1); + y128[10] = shiftright_int16(y128[10],1); + y128[11] = shiftright_int16(y128[11],1); + y128[12] = shiftright_int16(y128[12],1); + y128[13] = shiftright_int16(y128[13],1); + y128[14] = shiftright_int16(y128[14],1); + y128[15] = shiftright_int16(y128[15],1); y128+=16; } @@ -1739,18 +2312,11 @@ int16_t tw512c[512] __attribute__((aligned(16))) = { void dft512(int16_t *x,int16_t *y,int scale) { - __m64 xtmp[256],*xtmpp,*x64 = (__m64 *)x; - __m128i ytmp[128],*tw512a_128p=(__m128i *)tw512a,*tw512b_128p=(__m128i *)tw512b,*y128=(__m128i *)y,*y128p=(__m128i *)y; - __m128i *ytmpp = &ytmp[0]; + simdshort_q15_t xtmp[256],*xtmpp,*x64 = (simdshort_q15_t *)x; + simd_q15_t ytmp[128],*tw512a_128p=(simd_q15_t *)tw512a,*tw512b_128p=(simd_q15_t *)tw512b,*y128=(simd_q15_t *)y,*y128p=(simd_q15_t *)y; + simd_q15_t *ytmpp = &ytmp[0]; int i; - __m128i ONE_OVER_SQRT2_Q15_128 = _mm_set_epi16(ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15); + simd_q15_t ONE_OVER_SQRT2_Q15_128 = set1_int16(ONE_OVER_SQRT2_Q15); xtmpp = xtmp; @@ -1838,38 +2404,22 @@ void dft512(int16_t *x,int16_t *y,int scale) y128p = y128; for (i=0; i<8; i++) { - y128p[0] = _mm_mulhi_epi16(y128p[0],ONE_OVER_SQRT2_Q15_128); - y128p[0] = _mm_slli_epi16(y128p[0],1); - y128p[1] = _mm_mulhi_epi16(y128p[1],ONE_OVER_SQRT2_Q15_128); - y128p[1] = _mm_slli_epi16(y128p[1],1); - y128p[2] = _mm_mulhi_epi16(y128p[2],ONE_OVER_SQRT2_Q15_128); - y128p[2] = _mm_slli_epi16(y128p[2],1); - y128p[3] = _mm_mulhi_epi16(y128p[3],ONE_OVER_SQRT2_Q15_128); - y128p[3] = _mm_slli_epi16(y128p[3],1); - y128p[4] = _mm_mulhi_epi16(y128p[4],ONE_OVER_SQRT2_Q15_128); - y128p[4] = _mm_slli_epi16(y128p[4],1); - y128p[5] = _mm_mulhi_epi16(y128p[5],ONE_OVER_SQRT2_Q15_128); - y128p[5] = _mm_slli_epi16(y128p[5],1); - y128p[6] = _mm_mulhi_epi16(y128p[6],ONE_OVER_SQRT2_Q15_128); - y128p[6] = _mm_slli_epi16(y128p[6],1); - y128p[7] = _mm_mulhi_epi16(y128p[7],ONE_OVER_SQRT2_Q15_128); - y128p[7] = _mm_slli_epi16(y128p[7],1); - y128p[8] = _mm_mulhi_epi16(y128p[8],ONE_OVER_SQRT2_Q15_128); - y128p[8] = _mm_slli_epi16(y128p[8],1); - y128p[9] = _mm_mulhi_epi16(y128p[9],ONE_OVER_SQRT2_Q15_128); - y128p[9] = _mm_slli_epi16(y128p[9],1); - y128p[10] = _mm_mulhi_epi16(y128p[10],ONE_OVER_SQRT2_Q15_128); - y128p[10] = _mm_slli_epi16(y128p[10],1); - y128p[11] = _mm_mulhi_epi16(y128p[11],ONE_OVER_SQRT2_Q15_128); - y128p[11] = _mm_slli_epi16(y128p[11],1); - y128p[12] = 
_mm_mulhi_epi16(y128p[12],ONE_OVER_SQRT2_Q15_128); - y128p[12] = _mm_slli_epi16(y128p[12],1); - y128p[13] = _mm_mulhi_epi16(y128p[13],ONE_OVER_SQRT2_Q15_128); - y128p[13] = _mm_slli_epi16(y128p[13],1); - y128p[14] = _mm_mulhi_epi16(y128p[14],ONE_OVER_SQRT2_Q15_128); - y128p[14] = _mm_slli_epi16(y128p[14],1); - y128p[15] = _mm_mulhi_epi16(y128p[15],ONE_OVER_SQRT2_Q15_128); - y128p[15] = _mm_slli_epi16(y128p[15],1); + y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT2_Q15_128); + y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT2_Q15_128); + y128p[2] = mulhi_int16(y128p[2],ONE_OVER_SQRT2_Q15_128); + y128p[3] = mulhi_int16(y128p[3],ONE_OVER_SQRT2_Q15_128); + y128p[4] = mulhi_int16(y128p[4],ONE_OVER_SQRT2_Q15_128); + y128p[5] = mulhi_int16(y128p[5],ONE_OVER_SQRT2_Q15_128); + y128p[6] = mulhi_int16(y128p[6],ONE_OVER_SQRT2_Q15_128); + y128p[7] = mulhi_int16(y128p[7],ONE_OVER_SQRT2_Q15_128); + y128p[8] = mulhi_int16(y128p[8],ONE_OVER_SQRT2_Q15_128); + y128p[9] = mulhi_int16(y128p[9],ONE_OVER_SQRT2_Q15_128); + y128p[10] = mulhi_int16(y128p[10],ONE_OVER_SQRT2_Q15_128); + y128p[11] = mulhi_int16(y128p[11],ONE_OVER_SQRT2_Q15_128); + y128p[12] = mulhi_int16(y128p[12],ONE_OVER_SQRT2_Q15_128); + y128p[13] = mulhi_int16(y128p[13],ONE_OVER_SQRT2_Q15_128); + y128p[14] = mulhi_int16(y128p[14],ONE_OVER_SQRT2_Q15_128); + y128p[15] = mulhi_int16(y128p[15],ONE_OVER_SQRT2_Q15_128); y128p+=16; } } @@ -1882,18 +2432,11 @@ void dft512(int16_t *x,int16_t *y,int scale) void idft512(int16_t *x,int16_t *y,int scale) { - __m64 xtmp[256],*xtmpp,*x64 = (__m64 *)x; - __m128i ytmp[128],*tw512_128p=(__m128i *)tw512,*y128=(__m128i *)y,*y128p=(__m128i *)y; - __m128i *ytmpp = &ytmp[0]; + simdshort_q15_t xtmp[256],*xtmpp,*x64 = (simdshort_q15_t *)x; + simd_q15_t ytmp[128],*tw512_128p=(simd_q15_t *)tw512,*y128=(simd_q15_t *)y,*y128p=(simd_q15_t *)y; + simd_q15_t *ytmpp = &ytmp[0]; int i; - __m128i ONE_OVER_SQRT2_Q15_128 = _mm_set_epi16(ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15); + simd_q15_t ONE_OVER_SQRT2_Q15_128 = set1_int16(ONE_OVER_SQRT2_Q15); xtmpp = xtmp; @@ -1951,38 +2494,22 @@ void idft512(int16_t *x,int16_t *y,int scale) y128p = y128; for (i=0; i<8; i++) { - y128p[0] = _mm_mulhi_epi16(y128p[0],ONE_OVER_SQRT2_Q15_128); - y128p[0] = _mm_slli_epi16(y128p[0],1); - y128p[1] = _mm_mulhi_epi16(y128p[1],ONE_OVER_SQRT2_Q15_128); - y128p[1] = _mm_slli_epi16(y128p[1],1); - y128p[2] = _mm_mulhi_epi16(y128p[2],ONE_OVER_SQRT2_Q15_128); - y128p[2] = _mm_slli_epi16(y128p[2],1); - y128p[3] = _mm_mulhi_epi16(y128p[3],ONE_OVER_SQRT2_Q15_128); - y128p[3] = _mm_slli_epi16(y128p[3],1); - y128p[4] = _mm_mulhi_epi16(y128p[4],ONE_OVER_SQRT2_Q15_128); - y128p[4] = _mm_slli_epi16(y128p[4],1); - y128p[5] = _mm_mulhi_epi16(y128p[5],ONE_OVER_SQRT2_Q15_128); - y128p[5] = _mm_slli_epi16(y128p[5],1); - y128p[6] = _mm_mulhi_epi16(y128p[6],ONE_OVER_SQRT2_Q15_128); - y128p[6] = _mm_slli_epi16(y128p[6],1); - y128p[7] = _mm_mulhi_epi16(y128p[7],ONE_OVER_SQRT2_Q15_128); - y128p[7] = _mm_slli_epi16(y128p[7],1); - y128p[8] = _mm_mulhi_epi16(y128p[8],ONE_OVER_SQRT2_Q15_128); - y128p[8] = _mm_slli_epi16(y128p[8],1); - y128p[9] = _mm_mulhi_epi16(y128p[9],ONE_OVER_SQRT2_Q15_128); - y128p[9] = _mm_slli_epi16(y128p[9],1); - y128p[10] = _mm_mulhi_epi16(y128p[10],ONE_OVER_SQRT2_Q15_128); - y128p[10] = _mm_slli_epi16(y128p[10],1); - y128p[11] = _mm_mulhi_epi16(y128p[11],ONE_OVER_SQRT2_Q15_128); - y128p[11] = _mm_slli_epi16(y128p[11],1); - y128p[12] 
= _mm_mulhi_epi16(y128p[12],ONE_OVER_SQRT2_Q15_128); - y128p[12] = _mm_slli_epi16(y128p[12],1); - y128p[13] = _mm_mulhi_epi16(y128p[13],ONE_OVER_SQRT2_Q15_128); - y128p[13] = _mm_slli_epi16(y128p[13],1); - y128p[14] = _mm_mulhi_epi16(y128p[14],ONE_OVER_SQRT2_Q15_128); - y128p[14] = _mm_slli_epi16(y128p[14],1); - y128p[15] = _mm_mulhi_epi16(y128p[15],ONE_OVER_SQRT2_Q15_128); - y128p[15] = _mm_slli_epi16(y128p[15],1); + y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT2_Q15_128); + y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT2_Q15_128); + y128p[2] = mulhi_int16(y128p[2],ONE_OVER_SQRT2_Q15_128); + y128p[3] = mulhi_int16(y128p[3],ONE_OVER_SQRT2_Q15_128); + y128p[4] = mulhi_int16(y128p[4],ONE_OVER_SQRT2_Q15_128); + y128p[5] = mulhi_int16(y128p[5],ONE_OVER_SQRT2_Q15_128); + y128p[6] = mulhi_int16(y128p[6],ONE_OVER_SQRT2_Q15_128); + y128p[7] = mulhi_int16(y128p[7],ONE_OVER_SQRT2_Q15_128); + y128p[8] = mulhi_int16(y128p[8],ONE_OVER_SQRT2_Q15_128); + y128p[9] = mulhi_int16(y128p[9],ONE_OVER_SQRT2_Q15_128); + y128p[10] = mulhi_int16(y128p[10],ONE_OVER_SQRT2_Q15_128); + y128p[11] = mulhi_int16(y128p[11],ONE_OVER_SQRT2_Q15_128); + y128p[12] = mulhi_int16(y128p[12],ONE_OVER_SQRT2_Q15_128); + y128p[13] = mulhi_int16(y128p[13],ONE_OVER_SQRT2_Q15_128); + y128p[14] = mulhi_int16(y128p[14],ONE_OVER_SQRT2_Q15_128); + y128p[15] = mulhi_int16(y128p[15],ONE_OVER_SQRT2_Q15_128); y128p+=16; } } @@ -2000,8 +2527,8 @@ int16_t tw1024[1536] __attribute__((aligned(16))) = { 32767,0,32766,-202,32764, void dft1024(int16_t *x,int16_t *y,int scale) { - __m128i xtmp[256],ytmp[256],*tw1024_128p=(__m128i *)tw1024,*x128=(__m128i *)x,*y128=(__m128i *)y,*y128p=(__m128i *)y; - __m128i *ytmpp = &ytmp[0]; + simd_q15_t xtmp[256],ytmp[256],*tw1024_128p=(simd_q15_t *)tw1024,*x128=(simd_q15_t *)x,*y128=(simd_q15_t *)y,*y128p=(simd_q15_t *)y; + simd_q15_t *ytmpp = &ytmp[0]; int i,j; for (i=0,j=0; i<256; i+=4,j++) { @@ -2026,22 +2553,22 @@ void dft1024(int16_t *x,int16_t *y,int scale) if (scale>0) { for (i=0; i<16; i++) { - y128[0] = _mm_srai_epi16(y128[0],1); - y128[1] = _mm_srai_epi16(y128[1],1); - y128[2] = _mm_srai_epi16(y128[2],1); - y128[3] = _mm_srai_epi16(y128[3],1); - y128[4] = _mm_srai_epi16(y128[4],1); - y128[5] = _mm_srai_epi16(y128[5],1); - y128[6] = _mm_srai_epi16(y128[6],1); - y128[7] = _mm_srai_epi16(y128[7],1); - y128[8] = _mm_srai_epi16(y128[8],1); - y128[9] = _mm_srai_epi16(y128[9],1); - y128[10] = _mm_srai_epi16(y128[10],1); - y128[11] = _mm_srai_epi16(y128[11],1); - y128[12] = _mm_srai_epi16(y128[12],1); - y128[13] = _mm_srai_epi16(y128[13],1); - y128[14] = _mm_srai_epi16(y128[14],1); - y128[15] = _mm_srai_epi16(y128[15],1); + y128[0] = shiftright_int16(y128[0],1); + y128[1] = shiftright_int16(y128[1],1); + y128[2] = shiftright_int16(y128[2],1); + y128[3] = shiftright_int16(y128[3],1); + y128[4] = shiftright_int16(y128[4],1); + y128[5] = shiftright_int16(y128[5],1); + y128[6] = shiftright_int16(y128[6],1); + y128[7] = shiftright_int16(y128[7],1); + y128[8] = shiftright_int16(y128[8],1); + y128[9] = shiftright_int16(y128[9],1); + y128[10] = shiftright_int16(y128[10],1); + y128[11] = shiftright_int16(y128[11],1); + y128[12] = shiftright_int16(y128[12],1); + y128[13] = shiftright_int16(y128[13],1); + y128[14] = shiftright_int16(y128[14],1); + y128[15] = shiftright_int16(y128[15],1); y128+=16; } @@ -2056,8 +2583,8 @@ void dft1024(int16_t *x,int16_t *y,int scale) void idft1024(int16_t *x,int16_t *y,int scale) { - __m128i xtmp[256],ytmp[256],*tw1024_128p=(__m128i *)tw1024,*x128=(__m128i *)x,*y128=(__m128i *)y,*y128p=(__m128i 
*)y; - __m128i *ytmpp = &ytmp[0]; + simd_q15_t xtmp[256],ytmp[256],*tw1024_128p=(simd_q15_t *)tw1024,*x128=(simd_q15_t *)x,*y128=(simd_q15_t *)y,*y128p=(simd_q15_t *)y; + simd_q15_t *ytmpp = &ytmp[0]; int i,j; for (i=0,j=0; i<256; i+=4,j++) { @@ -2082,22 +2609,22 @@ void idft1024(int16_t *x,int16_t *y,int scale) if (scale>0) { for (i=0; i<16; i++) { - y128[0] = _mm_srai_epi16(y128[0],1); - y128[1] = _mm_srai_epi16(y128[1],1); - y128[2] = _mm_srai_epi16(y128[2],1); - y128[3] = _mm_srai_epi16(y128[3],1); - y128[4] = _mm_srai_epi16(y128[4],1); - y128[5] = _mm_srai_epi16(y128[5],1); - y128[6] = _mm_srai_epi16(y128[6],1); - y128[7] = _mm_srai_epi16(y128[7],1); - y128[8] = _mm_srai_epi16(y128[8],1); - y128[9] = _mm_srai_epi16(y128[9],1); - y128[10] = _mm_srai_epi16(y128[10],1); - y128[11] = _mm_srai_epi16(y128[11],1); - y128[12] = _mm_srai_epi16(y128[12],1); - y128[13] = _mm_srai_epi16(y128[13],1); - y128[14] = _mm_srai_epi16(y128[14],1); - y128[15] = _mm_srai_epi16(y128[15],1); + y128[0] = shiftright_int16(y128[0],1); + y128[1] = shiftright_int16(y128[1],1); + y128[2] = shiftright_int16(y128[2],1); + y128[3] = shiftright_int16(y128[3],1); + y128[4] = shiftright_int16(y128[4],1); + y128[5] = shiftright_int16(y128[5],1); + y128[6] = shiftright_int16(y128[6],1); + y128[7] = shiftright_int16(y128[7],1); + y128[8] = shiftright_int16(y128[8],1); + y128[9] = shiftright_int16(y128[9],1); + y128[10] = shiftright_int16(y128[10],1); + y128[11] = shiftright_int16(y128[11],1); + y128[12] = shiftright_int16(y128[12],1); + y128[13] = shiftright_int16(y128[13],1); + y128[14] = shiftright_int16(y128[14],1); + y128[15] = shiftright_int16(y128[15],1); y128+=16; } @@ -2115,18 +2642,11 @@ int16_t tw2048[2048] __attribute__((aligned(16))) = {32767,0,32766,-101,32766,-2 void dft2048(int16_t *x,int16_t *y,int scale) { - __m64 xtmp[2048],*xtmpp,*x64 = (__m64 *)x; - __m128i ytmp[512],*tw2048_128p=(__m128i *)tw2048,*y128=(__m128i *)y,*y128p=(__m128i *)y; - __m128i *ytmpp = &ytmp[0]; + simdshort_q15_t xtmp[2048],*xtmpp,*x64 = (simdshort_q15_t *)x; + simd_q15_t ytmp[512],*tw2048_128p=(simd_q15_t *)tw2048,*y128=(simd_q15_t *)y,*y128p=(simd_q15_t *)y; + simd_q15_t *ytmpp = &ytmp[0]; int i; - __m128i ONE_OVER_SQRT2_Q15_128 = _mm_set_epi16(ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15); + simd_q15_t ONE_OVER_SQRT2_Q15_128 = set1_int16(ONE_OVER_SQRT2_Q15); xtmpp = xtmp; @@ -2184,38 +2704,22 @@ void dft2048(int16_t *x,int16_t *y,int scale) y128p = y128; for (i=0; i<32; i++) { - y128p[0] = _mm_mulhi_epi16(y128p[0],ONE_OVER_SQRT2_Q15_128); - y128p[0] = _mm_slli_epi16(y128p[0],1); - y128p[1] = _mm_mulhi_epi16(y128p[1],ONE_OVER_SQRT2_Q15_128); - y128p[1] = _mm_slli_epi16(y128p[1],1); - y128p[2] = _mm_mulhi_epi16(y128p[2],ONE_OVER_SQRT2_Q15_128); - y128p[2] = _mm_slli_epi16(y128p[2],1); - y128p[3] = _mm_mulhi_epi16(y128p[3],ONE_OVER_SQRT2_Q15_128); - y128p[3] = _mm_slli_epi16(y128p[3],1); - y128p[4] = _mm_mulhi_epi16(y128p[4],ONE_OVER_SQRT2_Q15_128); - y128p[4] = _mm_slli_epi16(y128p[4],1); - y128p[5] = _mm_mulhi_epi16(y128p[5],ONE_OVER_SQRT2_Q15_128); - y128p[5] = _mm_slli_epi16(y128p[5],1); - y128p[6] = _mm_mulhi_epi16(y128p[6],ONE_OVER_SQRT2_Q15_128); - y128p[6] = _mm_slli_epi16(y128p[6],1); - y128p[7] = _mm_mulhi_epi16(y128p[7],ONE_OVER_SQRT2_Q15_128); - y128p[7] = _mm_slli_epi16(y128p[7],1); - y128p[8] = _mm_mulhi_epi16(y128p[8],ONE_OVER_SQRT2_Q15_128); - y128p[8] = _mm_slli_epi16(y128p[8],1); - y128p[9] 
= _mm_mulhi_epi16(y128p[9],ONE_OVER_SQRT2_Q15_128); - y128p[9] = _mm_slli_epi16(y128p[9],1); - y128p[10] = _mm_mulhi_epi16(y128p[10],ONE_OVER_SQRT2_Q15_128); - y128p[10] = _mm_slli_epi16(y128p[10],1); - y128p[11] = _mm_mulhi_epi16(y128p[11],ONE_OVER_SQRT2_Q15_128); - y128p[11] = _mm_slli_epi16(y128p[11],1); - y128p[12] = _mm_mulhi_epi16(y128p[12],ONE_OVER_SQRT2_Q15_128); - y128p[12] = _mm_slli_epi16(y128p[12],1); - y128p[13] = _mm_mulhi_epi16(y128p[13],ONE_OVER_SQRT2_Q15_128); - y128p[13] = _mm_slli_epi16(y128p[13],1); - y128p[14] = _mm_mulhi_epi16(y128p[14],ONE_OVER_SQRT2_Q15_128); - y128p[14] = _mm_slli_epi16(y128p[14],1); - y128p[15] = _mm_mulhi_epi16(y128p[15],ONE_OVER_SQRT2_Q15_128); - y128p[15] = _mm_slli_epi16(y128p[15],1); + y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT2_Q15_128); + y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT2_Q15_128); + y128p[2] = mulhi_int16(y128p[2],ONE_OVER_SQRT2_Q15_128); + y128p[3] = mulhi_int16(y128p[3],ONE_OVER_SQRT2_Q15_128); + y128p[4] = mulhi_int16(y128p[4],ONE_OVER_SQRT2_Q15_128); + y128p[5] = mulhi_int16(y128p[5],ONE_OVER_SQRT2_Q15_128); + y128p[6] = mulhi_int16(y128p[6],ONE_OVER_SQRT2_Q15_128); + y128p[7] = mulhi_int16(y128p[7],ONE_OVER_SQRT2_Q15_128); + y128p[8] = mulhi_int16(y128p[8],ONE_OVER_SQRT2_Q15_128); + y128p[9] = mulhi_int16(y128p[9],ONE_OVER_SQRT2_Q15_128); + y128p[10] = mulhi_int16(y128p[10],ONE_OVER_SQRT2_Q15_128); + y128p[11] = mulhi_int16(y128p[11],ONE_OVER_SQRT2_Q15_128); + y128p[12] = mulhi_int16(y128p[12],ONE_OVER_SQRT2_Q15_128); + y128p[13] = mulhi_int16(y128p[13],ONE_OVER_SQRT2_Q15_128); + y128p[14] = mulhi_int16(y128p[14],ONE_OVER_SQRT2_Q15_128); + y128p[15] = mulhi_int16(y128p[15],ONE_OVER_SQRT2_Q15_128); y128p+=16; } } @@ -2228,18 +2732,11 @@ void dft2048(int16_t *x,int16_t *y,int scale) void idft2048(int16_t *x,int16_t *y,int scale) { - __m64 xtmp[2048],*xtmpp,*x64 = (__m64 *)x; - __m128i ytmp[512],*tw2048_128p=(__m128i *)tw2048,*y128=(__m128i *)y,*y128p=(__m128i *)y; - __m128i *ytmpp = &ytmp[0]; + simdshort_q15_t xtmp[2048],*xtmpp,*x64 = (simdshort_q15_t *)x; + simd_q15_t ytmp[512],*tw2048_128p=(simd_q15_t *)tw2048,*y128=(simd_q15_t *)y,*y128p=(simd_q15_t *)y; + simd_q15_t *ytmpp = &ytmp[0]; int i; - __m128i ONE_OVER_SQRT2_Q15_128 = _mm_set_epi16(ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15); + simd_q15_t ONE_OVER_SQRT2_Q15_128 = set1_int16(ONE_OVER_SQRT2_Q15); xtmpp = xtmp; @@ -2297,38 +2794,22 @@ void idft2048(int16_t *x,int16_t *y,int scale) y128p = y128; for (i=0; i<32; i++) { - y128p[0] = _mm_mulhi_epi16(y128p[0],ONE_OVER_SQRT2_Q15_128); - y128p[0] = _mm_slli_epi16(y128p[0],1); - y128p[1] = _mm_mulhi_epi16(y128p[1],ONE_OVER_SQRT2_Q15_128); - y128p[1] = _mm_slli_epi16(y128p[1],1); - y128p[2] = _mm_mulhi_epi16(y128p[2],ONE_OVER_SQRT2_Q15_128); - y128p[2] = _mm_slli_epi16(y128p[2],1); - y128p[3] = _mm_mulhi_epi16(y128p[3],ONE_OVER_SQRT2_Q15_128); - y128p[3] = _mm_slli_epi16(y128p[3],1); - y128p[4] = _mm_mulhi_epi16(y128p[4],ONE_OVER_SQRT2_Q15_128); - y128p[4] = _mm_slli_epi16(y128p[4],1); - y128p[5] = _mm_mulhi_epi16(y128p[5],ONE_OVER_SQRT2_Q15_128); - y128p[5] = _mm_slli_epi16(y128p[5],1); - y128p[6] = _mm_mulhi_epi16(y128p[6],ONE_OVER_SQRT2_Q15_128); - y128p[6] = _mm_slli_epi16(y128p[6],1); - y128p[7] = _mm_mulhi_epi16(y128p[7],ONE_OVER_SQRT2_Q15_128); - y128p[7] = _mm_slli_epi16(y128p[7],1); - y128p[8] = _mm_mulhi_epi16(y128p[8],ONE_OVER_SQRT2_Q15_128); - y128p[8] = _mm_slli_epi16(y128p[8],1); 
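/*
 * In every scale>0 block, the power-of-4 sizes (256/1024/4096) halve the
 * output with shiftright_int16(...,1), while the remaining sizes
 * (128/512/2048/8192) scale by 1/sqrt(2) through one Q15 multiply by
 * ONE_OVER_SQRT2_Q15. A standalone scalar model of that step -- the
 * constant's value and the (a*b)>>15 behaviour of the wrapper are
 * assumptions here, and mulhi_q15() is a hypothetical helper added only
 * for illustration:
 */
#include <stdio.h>
#include <stdint.h>

#define ONE_OVER_SQRT2_Q15 23170        /* round(32768/sqrt(2)) */

/* scalar stand-in for one 16-bit lane of mulhi_int16() */
static int16_t mulhi_q15(int16_t a, int16_t b)
{
    return (int16_t)(((int32_t)a * (int32_t)b) >> 15);
}

int main(void)
{
    int16_t x = 20000;
    /* prints 14141; the exact value 20000/sqrt(2) is ~14142.1 */
    printf("%d\n", mulhi_q15(x, ONE_OVER_SQRT2_Q15));
    return 0;
}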
- y128p[9] = _mm_mulhi_epi16(y128p[9],ONE_OVER_SQRT2_Q15_128); - y128p[9] = _mm_slli_epi16(y128p[9],1); - y128p[10] = _mm_mulhi_epi16(y128p[10],ONE_OVER_SQRT2_Q15_128); - y128p[10] = _mm_slli_epi16(y128p[10],1); - y128p[11] = _mm_mulhi_epi16(y128p[11],ONE_OVER_SQRT2_Q15_128); - y128p[11] = _mm_slli_epi16(y128p[11],1); - y128p[12] = _mm_mulhi_epi16(y128p[12],ONE_OVER_SQRT2_Q15_128); - y128p[12] = _mm_slli_epi16(y128p[12],1); - y128p[13] = _mm_mulhi_epi16(y128p[13],ONE_OVER_SQRT2_Q15_128); - y128p[13] = _mm_slli_epi16(y128p[13],1); - y128p[14] = _mm_mulhi_epi16(y128p[14],ONE_OVER_SQRT2_Q15_128); - y128p[14] = _mm_slli_epi16(y128p[14],1); - y128p[15] = _mm_mulhi_epi16(y128p[15],ONE_OVER_SQRT2_Q15_128); - y128p[15] = _mm_slli_epi16(y128p[15],1); + y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT2_Q15_128); + y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT2_Q15_128); + y128p[2] = mulhi_int16(y128p[2],ONE_OVER_SQRT2_Q15_128); + y128p[3] = mulhi_int16(y128p[3],ONE_OVER_SQRT2_Q15_128); + y128p[4] = mulhi_int16(y128p[4],ONE_OVER_SQRT2_Q15_128); + y128p[5] = mulhi_int16(y128p[5],ONE_OVER_SQRT2_Q15_128); + y128p[6] = mulhi_int16(y128p[6],ONE_OVER_SQRT2_Q15_128); + y128p[7] = mulhi_int16(y128p[7],ONE_OVER_SQRT2_Q15_128); + y128p[8] = mulhi_int16(y128p[8],ONE_OVER_SQRT2_Q15_128); + y128p[9] = mulhi_int16(y128p[9],ONE_OVER_SQRT2_Q15_128); + y128p[10] = mulhi_int16(y128p[10],ONE_OVER_SQRT2_Q15_128); + y128p[11] = mulhi_int16(y128p[11],ONE_OVER_SQRT2_Q15_128); + y128p[12] = mulhi_int16(y128p[12],ONE_OVER_SQRT2_Q15_128); + y128p[13] = mulhi_int16(y128p[13],ONE_OVER_SQRT2_Q15_128); + y128p[14] = mulhi_int16(y128p[14],ONE_OVER_SQRT2_Q15_128); + y128p[15] = mulhi_int16(y128p[15],ONE_OVER_SQRT2_Q15_128); y128p+=16; } } @@ -2343,8 +2824,8 @@ void idft2048(int16_t *x,int16_t *y,int scale) void dft4096(int16_t *x,int16_t *y,int scale) { - __m128i xtmp[4096],ytmp[4096],*tw4096_128p=(__m128i *)tw4096,*x128=(__m128i *)x,*y128=(__m128i *)y,*y128p=(__m128i *)y; - __m128i *ytmpp = &ytmp[0]; + simd_q15_t xtmp[4096],ytmp[4096],*tw4096_128p=(simd_q15_t *)tw4096,*x128=(simd_q15_t *)x,*y128=(simd_q15_t *)y,*y128p=(simd_q15_t *)y; + simd_q15_t *ytmpp = &ytmp[0]; int i,j; for (i=0,j=0; i<1024; i+=4,j++) { @@ -2369,22 +2850,22 @@ void dft4096(int16_t *x,int16_t *y,int scale) if (scale>0) { for (i=0; i<64; i++) { - y128[0] = _mm_srai_epi16(y128[0],1); - y128[1] = _mm_srai_epi16(y128[1],1); - y128[2] = _mm_srai_epi16(y128[2],1); - y128[3] = _mm_srai_epi16(y128[3],1); - y128[4] = _mm_srai_epi16(y128[4],1); - y128[5] = _mm_srai_epi16(y128[5],1); - y128[6] = _mm_srai_epi16(y128[6],1); - y128[7] = _mm_srai_epi16(y128[7],1); - y128[8] = _mm_srai_epi16(y128[8],1); - y128[9] = _mm_srai_epi16(y128[9],1); - y128[10] = _mm_srai_epi16(y128[10],1); - y128[11] = _mm_srai_epi16(y128[11],1); - y128[12] = _mm_srai_epi16(y128[12],1); - y128[13] = _mm_srai_epi16(y128[13],1); - y128[14] = _mm_srai_epi16(y128[14],1); - y128[15] = _mm_srai_epi16(y128[15],1); + y128[0] = shiftright_int16(y128[0],1); + y128[1] = shiftright_int16(y128[1],1); + y128[2] = shiftright_int16(y128[2],1); + y128[3] = shiftright_int16(y128[3],1); + y128[4] = shiftright_int16(y128[4],1); + y128[5] = shiftright_int16(y128[5],1); + y128[6] = shiftright_int16(y128[6],1); + y128[7] = shiftright_int16(y128[7],1); + y128[8] = shiftright_int16(y128[8],1); + y128[9] = shiftright_int16(y128[9],1); + y128[10] = shiftright_int16(y128[10],1); + y128[11] = shiftright_int16(y128[11],1); + y128[12] = shiftright_int16(y128[12],1); + y128[13] = shiftright_int16(y128[13],1); + y128[14] = 
shiftright_int16(y128[14],1); + y128[15] = shiftright_int16(y128[15],1); y128+=16; } @@ -2399,8 +2880,8 @@ void dft4096(int16_t *x,int16_t *y,int scale) void idft4096(int16_t *x,int16_t *y,int scale) { - __m128i xtmp[4096],ytmp[4096],*tw4096_128p=(__m128i *)tw4096,*x128=(__m128i *)x,*y128=(__m128i *)y,*y128p=(__m128i *)y; - __m128i *ytmpp = &ytmp[0]; + simd_q15_t xtmp[4096],ytmp[4096],*tw4096_128p=(simd_q15_t *)tw4096,*x128=(simd_q15_t *)x,*y128=(simd_q15_t *)y,*y128p=(simd_q15_t *)y; + simd_q15_t *ytmpp = &ytmp[0]; int i,j; for (i=0,j=0; i<1024; i+=4,j++) { @@ -2425,22 +2906,22 @@ void idft4096(int16_t *x,int16_t *y,int scale) if (scale>0) { for (i=0; i<64; i++) { - y128[0] = _mm_srai_epi16(y128[0],1); - y128[1] = _mm_srai_epi16(y128[1],1); - y128[2] = _mm_srai_epi16(y128[2],1); - y128[3] = _mm_srai_epi16(y128[3],1); - y128[4] = _mm_srai_epi16(y128[4],1); - y128[5] = _mm_srai_epi16(y128[5],1); - y128[6] = _mm_srai_epi16(y128[6],1); - y128[7] = _mm_srai_epi16(y128[7],1); - y128[8] = _mm_srai_epi16(y128[8],1); - y128[9] = _mm_srai_epi16(y128[9],1); - y128[10] = _mm_srai_epi16(y128[10],1); - y128[11] = _mm_srai_epi16(y128[11],1); - y128[12] = _mm_srai_epi16(y128[12],1); - y128[13] = _mm_srai_epi16(y128[13],1); - y128[14] = _mm_srai_epi16(y128[14],1); - y128[15] = _mm_srai_epi16(y128[15],1); + y128[0] = shiftright_int16(y128[0],1); + y128[1] = shiftright_int16(y128[1],1); + y128[2] = shiftright_int16(y128[2],1); + y128[3] = shiftright_int16(y128[3],1); + y128[4] = shiftright_int16(y128[4],1); + y128[5] = shiftright_int16(y128[5],1); + y128[6] = shiftright_int16(y128[6],1); + y128[7] = shiftright_int16(y128[7],1); + y128[8] = shiftright_int16(y128[8],1); + y128[9] = shiftright_int16(y128[9],1); + y128[10] = shiftright_int16(y128[10],1); + y128[11] = shiftright_int16(y128[11],1); + y128[12] = shiftright_int16(y128[12],1); + y128[13] = shiftright_int16(y128[13],1); + y128[14] = shiftright_int16(y128[14],1); + y128[15] = shiftright_int16(y128[15],1); y128+=16; } @@ -2468,18 +2949,11 @@ static int16_t tw8192[4096*2] = {32767,0,32766,-26,32766,-51,32766,-76,32766,-10 void dft8192(int16_t *x,int16_t *y,int scale) { - __m64 xtmp[4096],*xtmpp,*x64 = (__m64 *)x; - __m128i ytmp[1024],*tw8192_128p=(__m128i *)tw8192,*y128=(__m128i *)y,*y128p=(__m128i *)y; - __m128i *ytmpp = &ytmp[0]; + simdshort_q15_t xtmp[4096],*xtmpp,*x64 = (simdshort_q15_t *)x; + simd_q15_t ytmp[1024],*tw8192_128p=(simd_q15_t *)tw8192,*y128=(simd_q15_t *)y,*y128p=(simd_q15_t *)y; + simd_q15_t *ytmpp = &ytmp[0]; int i; - __m128i ONE_OVER_SQRT2_Q15_128 = _mm_set_epi16(ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15); + simd_q15_t ONE_OVER_SQRT2_Q15_128 = set1_int16(ONE_OVER_SQRT2_Q15); xtmpp = xtmp; @@ -2537,38 +3011,22 @@ void dft8192(int16_t *x,int16_t *y,int scale) y128p = y128; for (i=0; i<128; i++) { - y128p[0] = _mm_mulhi_epi16(y128p[0],ONE_OVER_SQRT2_Q15_128); - y128p[0] = _mm_slli_epi16(y128p[0],1); - y128p[1] = _mm_mulhi_epi16(y128p[1],ONE_OVER_SQRT2_Q15_128); - y128p[1] = _mm_slli_epi16(y128p[1],1); - y128p[2] = _mm_mulhi_epi16(y128p[2],ONE_OVER_SQRT2_Q15_128); - y128p[2] = _mm_slli_epi16(y128p[2],1); - y128p[3] = _mm_mulhi_epi16(y128p[3],ONE_OVER_SQRT2_Q15_128); - y128p[3] = _mm_slli_epi16(y128p[3],1); - y128p[4] = _mm_mulhi_epi16(y128p[4],ONE_OVER_SQRT2_Q15_128); - y128p[4] = _mm_slli_epi16(y128p[4],1); - y128p[5] = _mm_mulhi_epi16(y128p[5],ONE_OVER_SQRT2_Q15_128); - y128p[5] = 
_mm_slli_epi16(y128p[5],1); - y128p[6] = _mm_mulhi_epi16(y128p[6],ONE_OVER_SQRT2_Q15_128); - y128p[6] = _mm_slli_epi16(y128p[6],1); - y128p[7] = _mm_mulhi_epi16(y128p[7],ONE_OVER_SQRT2_Q15_128); - y128p[7] = _mm_slli_epi16(y128p[7],1); - y128p[8] = _mm_mulhi_epi16(y128p[8],ONE_OVER_SQRT2_Q15_128); - y128p[8] = _mm_slli_epi16(y128p[8],1); - y128p[9] = _mm_mulhi_epi16(y128p[9],ONE_OVER_SQRT2_Q15_128); - y128p[9] = _mm_slli_epi16(y128p[9],1); - y128p[10] = _mm_mulhi_epi16(y128p[10],ONE_OVER_SQRT2_Q15_128); - y128p[10] = _mm_slli_epi16(y128p[10],1); - y128p[11] = _mm_mulhi_epi16(y128p[11],ONE_OVER_SQRT2_Q15_128); - y128p[11] = _mm_slli_epi16(y128p[11],1); - y128p[12] = _mm_mulhi_epi16(y128p[12],ONE_OVER_SQRT2_Q15_128); - y128p[12] = _mm_slli_epi16(y128p[12],1); - y128p[13] = _mm_mulhi_epi16(y128p[13],ONE_OVER_SQRT2_Q15_128); - y128p[13] = _mm_slli_epi16(y128p[13],1); - y128p[14] = _mm_mulhi_epi16(y128p[14],ONE_OVER_SQRT2_Q15_128); - y128p[14] = _mm_slli_epi16(y128p[14],1); - y128p[15] = _mm_mulhi_epi16(y128p[15],ONE_OVER_SQRT2_Q15_128); - y128p[15] = _mm_slli_epi16(y128p[15],1); + y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT2_Q15_128); + y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT2_Q15_128); + y128p[2] = mulhi_int16(y128p[2],ONE_OVER_SQRT2_Q15_128); + y128p[3] = mulhi_int16(y128p[3],ONE_OVER_SQRT2_Q15_128); + y128p[4] = mulhi_int16(y128p[4],ONE_OVER_SQRT2_Q15_128); + y128p[5] = mulhi_int16(y128p[5],ONE_OVER_SQRT2_Q15_128); + y128p[6] = mulhi_int16(y128p[6],ONE_OVER_SQRT2_Q15_128); + y128p[7] = mulhi_int16(y128p[7],ONE_OVER_SQRT2_Q15_128); + y128p[8] = mulhi_int16(y128p[8],ONE_OVER_SQRT2_Q15_128); + y128p[9] = mulhi_int16(y128p[9],ONE_OVER_SQRT2_Q15_128); + y128p[10] = mulhi_int16(y128p[10],ONE_OVER_SQRT2_Q15_128); + y128p[11] = mulhi_int16(y128p[11],ONE_OVER_SQRT2_Q15_128); + y128p[12] = mulhi_int16(y128p[12],ONE_OVER_SQRT2_Q15_128); + y128p[13] = mulhi_int16(y128p[13],ONE_OVER_SQRT2_Q15_128); + y128p[14] = mulhi_int16(y128p[14],ONE_OVER_SQRT2_Q15_128); + y128p[15] = mulhi_int16(y128p[15],ONE_OVER_SQRT2_Q15_128); y128p+=16; } } @@ -2581,18 +3039,11 @@ void dft8192(int16_t *x,int16_t *y,int scale) void idft8192(int16_t *x,int16_t *y,int scale) { - __m64 xtmp[4096],*xtmpp,*x64 = (__m64 *)x; - __m128i ytmp[2048],*tw8192_128p=(__m128i *)tw8192,*y128=(__m128i *)y,*y128p=(__m128i *)y; - __m128i *ytmpp = &ytmp[0]; + simdshort_q15_t xtmp[4096],*xtmpp,*x64 = (simdshort_q15_t *)x; + simd_q15_t ytmp[2048],*tw8192_128p=(simd_q15_t *)tw8192,*y128=(simd_q15_t *)y,*y128p=(simd_q15_t *)y; + simd_q15_t *ytmpp = &ytmp[0]; int i; - __m128i ONE_OVER_SQRT2_Q15_128 = _mm_set_epi16(ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15, - ONE_OVER_SQRT2_Q15); + simd_q15_t ONE_OVER_SQRT2_Q15_128 = set1_int16(ONE_OVER_SQRT2_Q15); xtmpp = xtmp; @@ -2650,38 +3101,22 @@ void idft8192(int16_t *x,int16_t *y,int scale) y128p = y128; for (i=0; i<128; i++) { - y128p[0] = _mm_mulhi_epi16(y128p[0],ONE_OVER_SQRT2_Q15_128); - y128p[0] = _mm_slli_epi16(y128p[0],1); - y128p[1] = _mm_mulhi_epi16(y128p[1],ONE_OVER_SQRT2_Q15_128); - y128p[1] = _mm_slli_epi16(y128p[1],1); - y128p[2] = _mm_mulhi_epi16(y128p[2],ONE_OVER_SQRT2_Q15_128); - y128p[2] = _mm_slli_epi16(y128p[2],1); - y128p[3] = _mm_mulhi_epi16(y128p[3],ONE_OVER_SQRT2_Q15_128); - y128p[3] = _mm_slli_epi16(y128p[3],1); - y128p[4] = _mm_mulhi_epi16(y128p[4],ONE_OVER_SQRT2_Q15_128); - y128p[4] = _mm_slli_epi16(y128p[4],1); - y128p[5] = 
_mm_mulhi_epi16(y128p[5],ONE_OVER_SQRT2_Q15_128); - y128p[5] = _mm_slli_epi16(y128p[5],1); - y128p[6] = _mm_mulhi_epi16(y128p[6],ONE_OVER_SQRT2_Q15_128); - y128p[6] = _mm_slli_epi16(y128p[6],1); - y128p[7] = _mm_mulhi_epi16(y128p[7],ONE_OVER_SQRT2_Q15_128); - y128p[7] = _mm_slli_epi16(y128p[7],1); - y128p[8] = _mm_mulhi_epi16(y128p[8],ONE_OVER_SQRT2_Q15_128); - y128p[8] = _mm_slli_epi16(y128p[8],1); - y128p[9] = _mm_mulhi_epi16(y128p[9],ONE_OVER_SQRT2_Q15_128); - y128p[9] = _mm_slli_epi16(y128p[9],1); - y128p[10] = _mm_mulhi_epi16(y128p[10],ONE_OVER_SQRT2_Q15_128); - y128p[10] = _mm_slli_epi16(y128p[10],1); - y128p[11] = _mm_mulhi_epi16(y128p[11],ONE_OVER_SQRT2_Q15_128); - y128p[11] = _mm_slli_epi16(y128p[11],1); - y128p[12] = _mm_mulhi_epi16(y128p[12],ONE_OVER_SQRT2_Q15_128); - y128p[12] = _mm_slli_epi16(y128p[12],1); - y128p[13] = _mm_mulhi_epi16(y128p[13],ONE_OVER_SQRT2_Q15_128); - y128p[13] = _mm_slli_epi16(y128p[13],1); - y128p[14] = _mm_mulhi_epi16(y128p[14],ONE_OVER_SQRT2_Q15_128); - y128p[14] = _mm_slli_epi16(y128p[14],1); - y128p[15] = _mm_mulhi_epi16(y128p[15],ONE_OVER_SQRT2_Q15_128); - y128p[15] = _mm_slli_epi16(y128p[15],1); + y128p[0] = mulhi_int16(y128p[0],ONE_OVER_SQRT2_Q15_128); + y128p[1] = mulhi_int16(y128p[1],ONE_OVER_SQRT2_Q15_128); + y128p[2] = mulhi_int16(y128p[2],ONE_OVER_SQRT2_Q15_128); + y128p[3] = mulhi_int16(y128p[3],ONE_OVER_SQRT2_Q15_128); + y128p[4] = mulhi_int16(y128p[4],ONE_OVER_SQRT2_Q15_128); + y128p[5] = mulhi_int16(y128p[5],ONE_OVER_SQRT2_Q15_128); + y128p[6] = mulhi_int16(y128p[6],ONE_OVER_SQRT2_Q15_128); + y128p[7] = mulhi_int16(y128p[7],ONE_OVER_SQRT2_Q15_128); + y128p[8] = mulhi_int16(y128p[8],ONE_OVER_SQRT2_Q15_128); + y128p[9] = mulhi_int16(y128p[9],ONE_OVER_SQRT2_Q15_128); + y128p[10] = mulhi_int16(y128p[10],ONE_OVER_SQRT2_Q15_128); + y128p[11] = mulhi_int16(y128p[11],ONE_OVER_SQRT2_Q15_128); + y128p[12] = mulhi_int16(y128p[12],ONE_OVER_SQRT2_Q15_128); + y128p[13] = mulhi_int16(y128p[13],ONE_OVER_SQRT2_Q15_128); + y128p[14] = mulhi_int16(y128p[14],ONE_OVER_SQRT2_Q15_128); + y128p[15] = mulhi_int16(y128p[15],ONE_OVER_SQRT2_Q15_128); y128p+=16; } } @@ -2721,9 +3156,9 @@ void idft1536(int16_t *input, int16_t *output) // write_output("out2.m","o2",tmpo[2],2048,1,1); for (i=0,i2=0; i<1024; i+=8,i2+=4) { - ibfly3((__m128i*)(&tmpo[0][i2]),(__m128i*)(&tmpo[1][i2]),((__m128i*)&tmpo[2][i2]), - (__m128i*)(output+i),(__m128i*)(output+1024+i),(__m128i*)(output+2048+i), - (__m128i*)(twa1536+i),(__m128i*)(twb1536+i)); + ibfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),((simd_q15_t*)&tmpo[2][i2]), + (simd_q15_t*)(output+i),(simd_q15_t*)(output+1024+i),(simd_q15_t*)(output+2048+i), + (simd_q15_t*)(twa1536+i),(simd_q15_t*)(twb1536+i)); } @@ -2759,9 +3194,9 @@ void dft1536(int16_t *input, int16_t *output) // write_output("out1.m","o1",tmpo[1],2048,1,1); // write_output("out2.m","o2",tmpo[2],2048,1,1); for (i=0,i2=0; i<1024; i+=8,i2+=4) { - bfly3((__m128i*)(&tmpo[0][i2]),(__m128i*)(&tmpo[1][i2]),(__m128i*)(&tmpo[2][i2]), - (__m128i*)(output+i),(__m128i*)(output+1024+i),(__m128i*)(output+2048+i), - (__m128i*)(twa1536+i),(__m128i*)(twb1536+i)); + bfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),(simd_q15_t*)(&tmpo[2][i2]), + (simd_q15_t*)(output+i),(simd_q15_t*)(output+1024+i),(simd_q15_t*)(output+2048+i), + (simd_q15_t*)(twa1536+i),(simd_q15_t*)(twb1536+i)); } _mm_empty(); @@ -2811,9 +3246,9 @@ void idft6144(int16_t *input, int16_t *output) // write_output("out2.m","o2",tmpo[2],2048,1,1); for (i=0,i2=0; i<4096; i+=8,i2+=4) { - 
ibfly3((__m128i*)(&tmpo[0][i2]),(__m128i*)(&tmpo[1][i2]),((__m128i*)&tmpo[2][i2]), - (__m128i*)(output+i),(__m128i*)(output+4096+i),(__m128i*)(output+8192+i), - (__m128i*)(twa6144+i),(__m128i*)(twb6144+i)); + ibfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),((simd_q15_t*)&tmpo[2][i2]), + (simd_q15_t*)(output+i),(simd_q15_t*)(output+4096+i),(simd_q15_t*)(output+8192+i), + (simd_q15_t*)(twa6144+i),(simd_q15_t*)(twb6144+i)); } // write_output("out.m","out",output,6144,1,1); @@ -2850,9 +3285,9 @@ void dft6144(int16_t *input, int16_t *output) // write_output("out1.m","o1",tmpo[1],2048,1,1); // write_output("out2.m","o2",tmpo[2],2048,1,1); for (i=0,i2=0; i<4096; i+=8,i2+=4) { - bfly3((__m128i*)(&tmpo[0][i2]),(__m128i*)(&tmpo[1][i2]),(__m128i*)(&tmpo[2][i2]), - (__m128i*)(output+i),(__m128i*)(output+4096+i),(__m128i*)(output+8192+i), - (__m128i*)(twa6144+i),(__m128i*)(twb6144+i)); + bfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),(simd_q15_t*)(&tmpo[2][i2]), + (simd_q15_t*)(output+i),(simd_q15_t*)(output+4096+i),(simd_q15_t*)(output+8192+i), + (simd_q15_t*)(twa6144+i),(simd_q15_t*)(twb6144+i)); } _mm_empty(); @@ -2889,9 +3324,9 @@ void dft12288(int16_t *input, int16_t *output) // write_output("out1.m","o1",tmpo[1],4096,1,1); // write_output("out2.m","o2",tmpo[2],4096,1,1); for (i=0,i2=0; i<8192; i+=8,i2+=4) { - bfly3((__m128i*)(&tmpo[0][i2]),(__m128i*)(&tmpo[1][i2]),(__m128i*)(&tmpo[2][i2]), - (__m128i*)(output+i),(__m128i*)(output+8192+i),(__m128i*)(output+16384+i), - (__m128i*)(twa12288+i),(__m128i*)(twb12288+i)); + bfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),(simd_q15_t*)(&tmpo[2][i2]), + (simd_q15_t*)(output+i),(simd_q15_t*)(output+8192+i),(simd_q15_t*)(output+16384+i), + (simd_q15_t*)(twa12288+i),(simd_q15_t*)(twb12288+i)); } _mm_empty(); @@ -2922,9 +3357,9 @@ void idft12288(int16_t *input, int16_t *output) write_output("out2.m","o2",tmpo[2],4096,1,1); */ for (i=0,i2=0; i<8192; i+=8,i2+=4) { - ibfly3((__m128i*)(&tmpo[0][i2]),(__m128i*)(&tmpo[1][i2]),((__m128i*)&tmpo[2][i2]), - (__m128i*)(output+i),(__m128i*)(output+8192+i),(__m128i*)(output+16384+i), - (__m128i*)(twa12288+i),(__m128i*)(twb12288+i)); + ibfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),((simd_q15_t*)&tmpo[2][i2]), + (simd_q15_t*)(output+i),(simd_q15_t*)(output+8192+i),(simd_q15_t*)(output+16384+i), + (simd_q15_t*)(twa12288+i),(simd_q15_t*)(twb12288+i)); } _mm_empty(); @@ -2972,9 +3407,9 @@ void dft24576(int16_t *input, int16_t *output) // write_output("out1.m","o1",tmpo[1],8192,1,1); // write_output("out2.m","o2",tmpo[2],8192,1,1); for (i=0,i2=0; i<16384; i+=8,i2+=4) { - bfly3((__m128i*)(&tmpo[0][i2]),(__m128i*)(&tmpo[1][i2]),(__m128i*)(&tmpo[2][i2]), - (__m128i*)(output+i),(__m128i*)(output+16384+i),(__m128i*)(output+32768+i), - (__m128i*)(twa24576+i),(__m128i*)(twb24576+i)); + bfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),(simd_q15_t*)(&tmpo[2][i2]), + (simd_q15_t*)(output+i),(simd_q15_t*)(output+16384+i),(simd_q15_t*)(output+32768+i), + (simd_q15_t*)(twa24576+i),(simd_q15_t*)(twb24576+i)); } _mm_empty(); @@ -3014,9 +3449,9 @@ void idft24576(int16_t *input, int16_t *output) */ for (i=0,i2=0; i<16384; i+=8,i2+=4) { - ibfly3((__m128i*)(&tmpo[0][i2]),(__m128i*)(&tmpo[1][i2]),((__m128i*)&tmpo[2][i2]), - (__m128i*)(output+i),(__m128i*)(output+16384+i),(__m128i*)(output+32768+i), - (__m128i*)(twa24576+i),(__m128i*)(twb24576+i)); + ibfly3((simd_q15_t*)(&tmpo[0][i2]),(simd_q15_t*)(&tmpo[1][i2]),((simd_q15_t*)&tmpo[2][i2]), + 
(simd_q15_t*)(output+i),(simd_q15_t*)(output+16384+i),(simd_q15_t*)(output+32768+i), + (simd_q15_t*)(twa24576+i),(simd_q15_t*)(twb24576+i)); } _mm_empty(); @@ -3034,70 +3469,70 @@ static int16_t W3_12s[8]__attribute__((aligned(16))) = {0,-32767,0,-32767,0,-327 static int16_t W4_12s[8]__attribute__((aligned(16))) = {-16383,-28377,-16383,-28377,-16383,-28377,-16383,-28377}; static int16_t W6_12s[8]__attribute__((aligned(16))) = {-32767,0,-32767,0,-32767,0,-32767,0}; -__m128i *W1_12=(__m128i *)W1_12s; -__m128i *W2_12=(__m128i *)W2_12s; -__m128i *W3_12=(__m128i *)W3_12s; -__m128i *W4_12=(__m128i *)W4_12s; -__m128i *W6_12=(__m128i *)W6_12s; - - -static __m128i norm128; - -static inline void dft12f(__m128i *x0, - __m128i *x1, - __m128i *x2, - __m128i *x3, - __m128i *x4, - __m128i *x5, - __m128i *x6, - __m128i *x7, - __m128i *x8, - __m128i *x9, - __m128i *x10, - __m128i *x11, - __m128i *y0, - __m128i *y1, - __m128i *y2, - __m128i *y3, - __m128i *y4, - __m128i *y5, - __m128i *y6, - __m128i *y7, - __m128i *y8, - __m128i *y9, - __m128i *y10, - __m128i *y11) __attribute__((always_inline)); - -static inline void dft12f(__m128i *x0, - __m128i *x1, - __m128i *x2, - __m128i *x3, - __m128i *x4, - __m128i *x5, - __m128i *x6, - __m128i *x7, - __m128i *x8, - __m128i *x9, - __m128i *x10, - __m128i *x11, - __m128i *y0, - __m128i *y1, - __m128i *y2, - __m128i *y3, - __m128i *y4, - __m128i *y5, - __m128i *y6, - __m128i *y7, - __m128i *y8, - __m128i *y9, - __m128i *y10, - __m128i *y11) +simd_q15_t *W1_12=(simd_q15_t *)W1_12s; +simd_q15_t *W2_12=(simd_q15_t *)W2_12s; +simd_q15_t *W3_12=(simd_q15_t *)W3_12s; +simd_q15_t *W4_12=(simd_q15_t *)W4_12s; +simd_q15_t *W6_12=(simd_q15_t *)W6_12s; + + +static simd_q15_t norm128; + +static inline void dft12f(simd_q15_t *x0, + simd_q15_t *x1, + simd_q15_t *x2, + simd_q15_t *x3, + simd_q15_t *x4, + simd_q15_t *x5, + simd_q15_t *x6, + simd_q15_t *x7, + simd_q15_t *x8, + simd_q15_t *x9, + simd_q15_t *x10, + simd_q15_t *x11, + simd_q15_t *y0, + simd_q15_t *y1, + simd_q15_t *y2, + simd_q15_t *y3, + simd_q15_t *y4, + simd_q15_t *y5, + simd_q15_t *y6, + simd_q15_t *y7, + simd_q15_t *y8, + simd_q15_t *y9, + simd_q15_t *y10, + simd_q15_t *y11) __attribute__((always_inline)); + +static inline void dft12f(simd_q15_t *x0, + simd_q15_t *x1, + simd_q15_t *x2, + simd_q15_t *x3, + simd_q15_t *x4, + simd_q15_t *x5, + simd_q15_t *x6, + simd_q15_t *x7, + simd_q15_t *x8, + simd_q15_t *x9, + simd_q15_t *x10, + simd_q15_t *x11, + simd_q15_t *y0, + simd_q15_t *y1, + simd_q15_t *y2, + simd_q15_t *y3, + simd_q15_t *y4, + simd_q15_t *y5, + simd_q15_t *y6, + simd_q15_t *y7, + simd_q15_t *y8, + simd_q15_t *y9, + simd_q15_t *y10, + simd_q15_t *y11) { - __m128i tmp_dft12[12]; + simd_q15_t tmp_dft12[12]; - __m128i *tmp_dft12_ptr = &tmp_dft12[0]; + simd_q15_t *tmp_dft12_ptr = &tmp_dft12[0]; // msg("dft12\n"); @@ -3171,22 +3606,6 @@ static inline void dft12f(__m128i *x0, y11, W3_12, W6_12); - /* - norm128 = _mm_set1_epi16(dft_norm_table[0]); - - *y0 = _mm_slli_epi16(_mm_mulhi_epi16(*y0,norm128),1); - *y1 = _mm_slli_epi16(_mm_mulhi_epi16(*y1,norm128),1); - *y2 = _mm_slli_epi16(_mm_mulhi_epi16(*y2,norm128),1); - *y3 = _mm_slli_epi16(_mm_mulhi_epi16(*y3,norm128),1); - *y4 = _mm_slli_epi16(_mm_mulhi_epi16(*y4,norm128),1); - *y5 = _mm_slli_epi16(_mm_mulhi_epi16(*y5,norm128),1); - *y6 = _mm_slli_epi16(_mm_mulhi_epi16(*y6,norm128),1); - *y7 = _mm_slli_epi16(_mm_mulhi_epi16(*y7,norm128),1); - *y8 = _mm_slli_epi16(_mm_mulhi_epi16(*y8,norm128),1); - *y9 = _mm_slli_epi16(_mm_mulhi_epi16(*y9,norm128),1); - *y10 = 
_mm_slli_epi16(_mm_mulhi_epi16(*y10,norm128),1); - *y11 = _mm_slli_epi16(_mm_mulhi_epi16(*y11,norm128),1); - */ } @@ -3196,7 +3615,7 @@ static inline void dft12f(__m128i *x0, void dft12(int16_t *x,int16_t *y) { - __m128i *x128 = (__m128i *)x,*y128 = (__m128i *)y; + simd_q15_t *x128 = (simd_q15_t *)x,*y128 = (simd_q15_t *)y; dft12f(&x128[0], &x128[1], &x128[2], @@ -3240,18 +3659,18 @@ static int16_t tw24[88]__attribute__((aligned(16))) = {31650,-8480,31650,-8480,3 -31650,-8480,-31650,-8480,-31650,-8480,-31650,-8480 }; -//static __m128i ytmp128array[300]; -//static __m128i ytmp128array2[300]; -//static __m128i ytmp128array3[300]; -//static __m128i x2128array[300]; +//static simd_q15_t ytmp128array[300]; +//static simd_q15_t ytmp128array2[300]; +//static simd_q15_t ytmp128array3[300]; +//static simd_q15_t x2128array[300]; void dft24(int16_t *x,int16_t *y,unsigned char scale_flag) { - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *tw128=(__m128i *)&tw24[0]; - __m128i ytmp128[24];//=&ytmp128array[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *tw128=(simd_q15_t *)&tw24[0]; + simd_q15_t ytmp128[24];//=&ytmp128array[0]; int i,j,k; // msg("dft24\n"); @@ -3326,10 +3745,10 @@ void dft24(int16_t *x,int16_t *y,unsigned char scale_flag) } if (scale_flag==1) { - norm128 = _mm_set1_epi16(dft_norm_table[1]); + norm128 = set1_int16(dft_norm_table[1]); for (i=0; i<24; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -3367,11 +3786,11 @@ static int16_t twb36[88]__attribute__((aligned(16))) = {30790,-11206,30790,-1120 void dft36(int16_t *x,int16_t *y,unsigned char scale_flag) { - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa36[0]; - __m128i *twb128=(__m128i *)&twb36[0]; - __m128i ytmp128[36];//&ytmp128array[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa36[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb36[0]; + simd_q15_t ytmp128[36];//&ytmp128array[0]; int i,j,k; @@ -3472,10 +3891,10 @@ void dft36(int16_t *x,int16_t *y,unsigned char scale_flag) } if (scale_flag==1) { - norm128 = _mm_set1_epi16(dft_norm_table[2]); + norm128 = set1_int16(dft_norm_table[2]); for (i=0; i<36; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -3526,12 +3945,12 @@ static int16_t twc48[88]__attribute__((aligned(16))) = {30272,-12539,30272,-1253 void dft48(int16_t *x, int16_t *y,unsigned char scale_flag) { - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa48[0]; - __m128i *twb128=(__m128i *)&twb48[0]; - __m128i *twc128=(__m128i *)&twc48[0]; - __m128i ytmp128[48];//=&ytmp128array[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa48[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb48[0]; + simd_q15_t *twc128=(simd_q15_t *)&twc48[0]; + simd_q15_t ytmp128[48];//=&ytmp128array[0]; int i,j,k; @@ -3668,10 +4087,10 @@ void dft48(int16_t *x, int16_t *y,unsigned char scale_flag) } if (scale_flag == 1) { - norm128 = _mm_set1_epi16(dft_norm_table[3]); + norm128 = set1_int16(dft_norm_table[3]); for (i=0; i<48; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -3732,13 +4151,13 @@ static int16_t twd60[88]__attribute__((aligned(16))) = {29934,-13327,29934,-1332 void 
dft60(int16_t *x,int16_t *y,unsigned char scale) { - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa60[0]; - __m128i *twb128=(__m128i *)&twb60[0]; - __m128i *twc128=(__m128i *)&twc60[0]; - __m128i *twd128=(__m128i *)&twd60[0]; - __m128i ytmp128[60];//=&ytmp128array[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa60[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb60[0]; + simd_q15_t *twc128=(simd_q15_t *)&twc60[0]; + simd_q15_t *twd128=(simd_q15_t *)&twd60[0]; + simd_q15_t ytmp128[60];//=&ytmp128array[0]; int i,j,k; dft12f(x128, @@ -3896,10 +4315,10 @@ void dft60(int16_t *x,int16_t *y,unsigned char scale) } if (scale == 1) { - norm128 = _mm_set1_epi16(dft_norm_table[4]); + norm128 = set1_int16(dft_norm_table[4]); for (i=0; i<60; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -3949,12 +4368,12 @@ void dft72(int16_t *x,int16_t *y,unsigned char scale_flag) { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *tw128=(__m128i *)&tw72[0]; - __m128i x2128[72];// = (__m128i *)&x2128array[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *tw128=(simd_q15_t *)&tw72[0]; + simd_q15_t x2128[72];// = (simd_q15_t *)&x2128array[0]; - __m128i ytmp128[72];//=&ytmp128array2[0]; + simd_q15_t ytmp128[72];//=&ytmp128array2[0]; for (i=0,j=0; i<36; i++,j+=2) { x2128[i] = x128[j]; // even inputs @@ -3975,10 +4394,10 @@ void dft72(int16_t *x,int16_t *y,unsigned char scale_flag) } if (scale_flag==1) { - norm128 = _mm_set1_epi16(dft_norm_table[5]); + norm128 = set1_int16(dft_norm_table[5]); for (i=0; i<72; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -4041,11 +4460,11 @@ void dft96(int16_t *x,int16_t *y,unsigned char scale_flag) int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *tw128=(__m128i *)&tw96[0]; - __m128i x2128[96];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[96];//=&ytmp128array2[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *tw128=(simd_q15_t *)&tw96[0]; + simd_q15_t x2128[96];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[96];//=&ytmp128array2[0]; for (i=0,j=0; i<48; i++,j+=2) { @@ -4068,10 +4487,10 @@ void dft96(int16_t *x,int16_t *y,unsigned char scale_flag) } if (scale_flag==1) { - norm128 = _mm_set1_epi16(dft_norm_table[6]); + norm128 = set1_int16(dft_norm_table[6]); for (i=0; i<96; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -4159,12 +4578,12 @@ void dft108(int16_t *x,int16_t *y,unsigned char scale_flag) int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa108[0]; - __m128i *twb128=(__m128i *)&twb108[0]; - __m128i x2128[108];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[108];//=&ytmp128array2[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa108[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb108[0]; + simd_q15_t x2128[108];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[108];//=&ytmp128array2[0]; for (i=0,j=0; i<36; i++,j+=3) { @@ -4192,10 +4611,10 @@ void dft108(int16_t *x,int16_t *y,unsigned char scale_flag) } if (scale_flag==1) { - norm128 = _mm_set1_epi16(dft_norm_table[7]); + norm128 = 
set1_int16(dft_norm_table[7]); for (i=0; i<108; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -4270,11 +4689,11 @@ void dft120(int16_t *x,int16_t *y, unsigned char scale_flag) int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *tw128=(__m128i *)&tw120[0]; - __m128i x2128[120];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[120];//=&ytmp128array2[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *tw128=(simd_q15_t *)&tw120[0]; + simd_q15_t x2128[120];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[120];//=&ytmp128array2[0]; for (i=0,j=0; i<60; i++,j+=2) { @@ -4297,10 +4716,10 @@ void dft120(int16_t *x,int16_t *y, unsigned char scale_flag) } if (scale_flag==1) { - norm128 = _mm_set1_epi16(dft_norm_table[8]); + norm128 = set1_int16(dft_norm_table[8]); for (i=0; i<120; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -4411,12 +4830,12 @@ void dft144(int16_t *x,int16_t *y,unsigned char scale_flag) { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa144[0]; - __m128i *twb128=(__m128i *)&twb144[0]; - __m128i x2128[144];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[144];//=&ytmp128array2[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa144[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb144[0]; + simd_q15_t x2128[144];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[144];//=&ytmp128array2[0]; @@ -4444,10 +4863,10 @@ void dft144(int16_t *x,int16_t *y,unsigned char scale_flag) } if (scale_flag==1) { - norm128 = _mm_set1_epi16(dft_norm_table[9]); + norm128 = set1_int16(dft_norm_table[9]); for (i=0; i<144; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -4582,12 +5001,12 @@ void dft180(int16_t *x,int16_t *y,unsigned char scale_flag) { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa180[0]; - __m128i *twb128=(__m128i *)&twb180[0]; - __m128i x2128[180];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[180];//=&ytmp128array2[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa180[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb180[0]; + simd_q15_t x2128[180];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[180];//=&ytmp128array2[0]; @@ -4615,10 +5034,10 @@ void dft180(int16_t *x,int16_t *y,unsigned char scale_flag) } if (scale_flag==1) { - norm128 = _mm_set1_epi16(dft_norm_table[10]); + norm128 = set1_int16(dft_norm_table[10]); for (i=0; i<180; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -4778,13 +5197,13 @@ void dft192(int16_t *x,int16_t *y,unsigned char scale_flag) { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa192[0]; - __m128i *twb128=(__m128i *)&twb192[0]; - __m128i *twc128=(__m128i *)&twc192[0]; - __m128i x2128[192];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[192];//=&ytmp128array2[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa192[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb192[0]; + simd_q15_t *twc128=(simd_q15_t *)&twc192[0]; + simd_q15_t x2128[192];// = (simd_q15_t 
*)&x2128array[0]; + simd_q15_t ytmp128[192];//=&ytmp128array2[0]; @@ -4817,10 +5236,10 @@ void dft192(int16_t *x,int16_t *y,unsigned char scale_flag) } if (scale_flag==1) { - norm128 = _mm_set1_epi16(dft_norm_table[11]); + norm128 = set1_int16(dft_norm_table[11]); for (i=0; i<192; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -4979,12 +5398,12 @@ void dft216(int16_t *x,int16_t *y,unsigned char scale_flag) { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa216[0]; - __m128i *twb128=(__m128i *)&twb216[0]; - __m128i x2128[216];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[216];//=&ytmp128array3[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa216[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb216[0]; + simd_q15_t x2128[216];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[216];//=&ytmp128array3[0]; @@ -5012,10 +5431,10 @@ void dft216(int16_t *x,int16_t *y,unsigned char scale_flag) } if (scale_flag==1) { - norm128 = _mm_set1_epi16(dft_norm_table[12]); + norm128 = set1_int16(dft_norm_table[12]); for (i=0; i<216; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -5211,13 +5630,13 @@ void dft240(int16_t *x,int16_t *y,unsigned char scale_flag) { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa240[0]; - __m128i *twb128=(__m128i *)&twb240[0]; - __m128i *twc128=(__m128i *)&twc240[0]; - __m128i x2128[240];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[240];//=&ytmp128array2[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa240[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb240[0]; + simd_q15_t *twc128=(simd_q15_t *)&twc240[0]; + simd_q15_t x2128[240];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[240];//=&ytmp128array2[0]; @@ -5250,10 +5669,10 @@ void dft240(int16_t *x,int16_t *y,unsigned char scale_flag) } if (scale_flag==1) { - norm128 = _mm_set1_epi16(dft_norm_table[13]); + norm128 = set1_int16(dft_norm_table[13]); for (i=0; i<240; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -5472,12 +5891,12 @@ void dft288(int16_t *x,int16_t *y,unsigned char scale_flag) { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa288[0]; - __m128i *twb128=(__m128i *)&twb288[0]; - __m128i x2128[288];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[288];//=&ytmp128array3[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa288[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb288[0]; + simd_q15_t x2128[288];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[288];//=&ytmp128array3[0]; @@ -5505,10 +5924,10 @@ void dft288(int16_t *x,int16_t *y,unsigned char scale_flag) } if (scale_flag==1) { - norm128 = _mm_set1_epi16(dft_norm_table[14]); + norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<288; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -5765,14 +6184,14 @@ void dft300(int16_t *x,int16_t *y,unsigned char scale_flag) { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa300[0]; - __m128i *twb128=(__m128i *)&twb300[0]; - 
__m128i *twc128=(__m128i *)&twc300[0]; - __m128i *twd128=(__m128i *)&twd300[0]; - __m128i x2128[300];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[300];//=&ytmp128array2[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa300[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb300[0]; + simd_q15_t *twc128=(simd_q15_t *)&twc300[0]; + simd_q15_t *twd128=(simd_q15_t *)&twd300[0]; + simd_q15_t x2128[300];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[300];//=&ytmp128array2[0]; @@ -5810,10 +6229,10 @@ void dft300(int16_t *x,int16_t *y,unsigned char scale_flag) } if (scale_flag==1) { - norm128 = _mm_set1_epi16(dft_norm_table[15]); + norm128 = set1_int16(dft_norm_table[15]); for (i=0; i<300; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -6066,12 +6485,12 @@ static int16_t twb324[107*2*4] = {32742,-1271,32742,-1271,32742,-1271,32742,-127 void dft324(int16_t *x,int16_t *y,unsigned char scale_flag) // 108 x 3 { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa324[0]; - __m128i *twb128=(__m128i *)&twb324[0]; - __m128i x2128[324];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[324];//=&ytmp128array3[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa324[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb324[0]; + simd_q15_t x2128[324];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[324];//=&ytmp128array3[0]; @@ -6099,10 +6518,10 @@ void dft324(int16_t *x,int16_t *y,unsigned char scale_flag) // 108 x 3 } if (scale_flag==1) { - norm128 = _mm_set1_epi16(dft_norm_table[14]); + norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<324; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -6380,12 +6799,12 @@ static int16_t twb360[119*2*4] = {32747,-1144,32747,-1144,32747,-1144,32747,-114 void dft360(int16_t *x,int16_t *y,unsigned char scale_flag) // 120 x 3 { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa360[0]; - __m128i *twb128=(__m128i *)&twb360[0]; - __m128i x2128[360];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[360];//=&ytmp128array3[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa360[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb360[0]; + simd_q15_t x2128[360];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[360];//=&ytmp128array3[0]; @@ -6413,10 +6832,10 @@ void dft360(int16_t *x,int16_t *y,unsigned char scale_flag) // 120 x 3 } if (scale_flag==1) { - norm128 = _mm_set1_epi16(dft_norm_table[14]); + norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<360; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -6752,13 +7171,13 @@ static int16_t twc384[95*2*4] = {32727,-1608,32727,-1608,32727,-1608,32727,-1608 void dft384(int16_t *x,int16_t *y,unsigned char scale_flag) // 96 x 4 { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa384[0]; - __m128i *twb128=(__m128i *)&twb384[0]; - __m128i *twc128=(__m128i *)&twc384[0]; - __m128i x2128[384];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[384];//=&ytmp128array2[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa384[0]; + 
simd_q15_t *twb128=(simd_q15_t *)&twb384[0]; + simd_q15_t *twc128=(simd_q15_t *)&twc384[0]; + simd_q15_t x2128[384];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[384];//=&ytmp128array2[0]; @@ -6791,10 +7210,10 @@ void dft384(int16_t *x,int16_t *y,unsigned char scale_flag) // 96 x 4 } if (scale_flag==1) { - norm128 = _mm_set1_epi16(16384);//dft_norm_table[13]); + norm128 = set1_int16(16384);//dft_norm_table[13]); for (i=0; i<384; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -7167,13 +7586,13 @@ static int16_t twc432[107*2*4] = {32735,-1430,32735,-1430,32735,-1430,32735,-143 void dft432(int16_t *x,int16_t *y,unsigned char scale_flag) // 108 x 4 { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa432[0]; - __m128i *twb128=(__m128i *)&twb432[0]; - __m128i *twc128=(__m128i *)&twc432[0]; - __m128i x2128[432];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[432];//=&ytmp128array2[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa432[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb432[0]; + simd_q15_t *twc128=(simd_q15_t *)&twc432[0]; + simd_q15_t x2128[432];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[432];//=&ytmp128array2[0]; for (i=0,j=0; i<108; i++,j+=4) { @@ -7205,10 +7624,10 @@ void dft432(int16_t *x,int16_t *y,unsigned char scale_flag) // 108 x 4 } if (scale_flag==1) { - norm128 = _mm_set1_epi16(16384);//dft_norm_table[13]); + norm128 = set1_int16(16384);//dft_norm_table[13]); for (i=0; i<432; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -7616,13 +8035,13 @@ static int16_t twc480[119*2*4] = {32741,-1287,32741,-1287,32741,-1287,32741,-128 void dft480(int16_t *x,int16_t *y,unsigned char scale_flag) // 120 x 4 { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa480[0]; - __m128i *twb128=(__m128i *)&twb480[0]; - __m128i *twc128=(__m128i *)&twc480[0]; - __m128i x2128[480];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[480];//=&ytmp128array2[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa480[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb480[0]; + simd_q15_t *twc128=(simd_q15_t *)&twc480[0]; + simd_q15_t x2128[480];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[480];//=&ytmp128array2[0]; @@ -7655,10 +8074,10 @@ void dft480(int16_t *x,int16_t *y,unsigned char scale_flag) // 120 x 4 } if (scale_flag==1) { - norm128 = _mm_set1_epi16(16384);//dft_norm_table[13]); + norm128 = set1_int16(16384);//dft_norm_table[13]); for (i=0; i<480; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -8057,12 +8476,12 @@ static int16_t twb540[179*2*4] = {32758,-763,32758,-763,32758,-763,32758,-763, void dft540(int16_t *x,int16_t *y,unsigned char scale_flag) // 180 x 3 { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa540[0]; - __m128i *twb128=(__m128i *)&twb540[0]; - __m128i x2128[540];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[540];//=&ytmp128array3[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa540[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb540[0]; + simd_q15_t x2128[540];// = (simd_q15_t *)&x2128array[0]; + 
simd_q15_t ytmp128[540];//=&ytmp128array3[0]; @@ -8090,10 +8509,10 @@ void dft540(int16_t *x,int16_t *y,unsigned char scale_flag) // 180 x 3 } if (scale_flag==1) { - norm128 = _mm_set1_epi16(dft_norm_table[14]); + norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<540; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -8516,12 +8935,12 @@ static int16_t twb576[191*2*4] = {32759,-715,32759,-715,32759,-715,32759,-715, void dft576(int16_t *x,int16_t *y,unsigned char scale_flag) // 192 x 3 { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa576[0]; - __m128i *twb128=(__m128i *)&twb576[0]; - __m128i x2128[576];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[576];//=&ytmp128array3[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa576[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb576[0]; + simd_q15_t x2128[576];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[576];//=&ytmp128array3[0]; @@ -8550,10 +8969,10 @@ void dft576(int16_t *x,int16_t *y,unsigned char scale_flag) // 192 x 3 } if (scale_flag==1) { - norm128 = _mm_set1_epi16(dft_norm_table[14]); + norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<576; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -8879,11 +9298,11 @@ static int16_t twa600[299*2*4] = {32765,-344,32765,-344,32765,-344,32765,-344, void dft600(int16_t *x,int16_t *y,unsigned char scale_flag) // 300 x 2 { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *tw128=(__m128i *)&twa600[0]; - __m128i x2128[600];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[600];//=&ytmp128array2[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *tw128=(simd_q15_t *)&twa600[0]; + simd_q15_t x2128[600];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[600];//=&ytmp128array2[0]; for (i=0,j=0; i<300; i++,j+=2) { @@ -8906,10 +9325,10 @@ void dft600(int16_t *x,int16_t *y,unsigned char scale_flag) // 300 x 2 } if (scale_flag==1) { - norm128 = _mm_set1_epi16(ONE_OVER_SQRT2_Q15); + norm128 = set1_int16(ONE_OVER_SQRT2_Q15); for (i=0; i<600; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -9380,12 +9799,12 @@ static int16_t twb648[215*2*4] = {32760,-636,32760,-636,32760,-636,32760,-636, void dft648(int16_t *x,int16_t *y,unsigned char scale_flag) // 216 x 3 { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa648[0]; - __m128i *twb128=(__m128i *)&twb648[0]; - __m128i x2128[648];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[648];//=&ytmp128array3[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa648[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb648[0]; + simd_q15_t x2128[648];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[648];//=&ytmp128array3[0]; @@ -9413,10 +9832,10 @@ void dft648(int16_t *x,int16_t *y,unsigned char scale_flag) // 216 x 3 } if (scale_flag==1) { - norm128 = _mm_set1_epi16(dft_norm_table[14]); + norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<648; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -10007,13 +10426,13 @@ static int16_t twc720[179*2*4] = 
{32755,-858,32755,-858,32755,-858,32755,-858, void dft720(int16_t *x,int16_t *y,unsigned char scale_flag) // 180 x 4 { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa720[0]; - __m128i *twb128=(__m128i *)&twb720[0]; - __m128i *twc128=(__m128i *)&twc720[0]; - __m128i x2128[720];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[720];//=&ytmp128array2[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa720[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb720[0]; + simd_q15_t *twc128=(simd_q15_t *)&twc720[0]; + simd_q15_t x2128[720];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[720];//=&ytmp128array2[0]; @@ -10046,10 +10465,10 @@ void dft720(int16_t *x,int16_t *y,unsigned char scale_flag) // 180 x 4 } if (scale_flag==1) { - norm128 = _mm_set1_epi16(16384);//dft_norm_table[13]); + norm128 = set1_int16(16384);//dft_norm_table[13]); for (i=0; i<720; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -10662,12 +11081,12 @@ static int16_t twb864[287*2*4] = {32763,-477,32763,-477,32763,-477,32763,-477, void dft864(int16_t *x,int16_t *y,unsigned char scale_flag) // 288 x 3 { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa864[0]; - __m128i *twb128=(__m128i *)&twb864[0]; - __m128i x2128[864];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[864];//=&ytmp128array3[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa864[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb864[0]; + simd_q15_t x2128[864];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[864];//=&ytmp128array3[0]; @@ -10695,10 +11114,10 @@ void dft864(int16_t *x,int16_t *y,unsigned char scale_flag) // 288 x 3 } if (scale_flag==1) { - norm128 = _mm_set1_epi16(dft_norm_table[14]); + norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<864; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -11335,12 +11754,12 @@ static int16_t twb900[299*2*4] = {32763,-458,32763,-458,32763,-458,32763,-458, void dft900(int16_t *x,int16_t *y,unsigned char scale_flag) // 300 x 3 { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa900[0]; - __m128i *twb128=(__m128i *)&twb900[0]; - __m128i x2128[900];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[900];//=&ytmp128array3[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa900[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb900[0]; + simd_q15_t x2128[900];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[900];//=&ytmp128array3[0]; @@ -11368,10 +11787,10 @@ void dft900(int16_t *x,int16_t *y,unsigned char scale_flag) // 300 x 3 } if (scale_flag==1) { - norm128 = _mm_set1_epi16(dft_norm_table[14]); + norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<900; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -12141,13 +12560,13 @@ static int16_t twc960[239*2*4] = {32760,-644,32760,-644,32760,-644,32760,-644, void dft960(int16_t *x,int16_t *y,unsigned char scale_flag) // 240 x 4 { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa960[0]; - __m128i *twb128=(__m128i *)&twb960[0]; - __m128i *twc128=(__m128i *)&twc960[0]; 
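
/*
 * Reviewer's sketch of the SIMD abstraction applied throughout the dftNNN
 * hunks above and below: __m128i becomes simd_q15_t, _mm_set1_epi16()
 * becomes set1_int16(), and the scaling idiom
 *     _mm_slli_epi16(_mm_mulhi_epi16(y, norm), 1)
 * collapses into mulhi_int16(y, norm), i.e. a Q15 fixed-point multiply
 * (y*norm)>>15. The shipped definitions presumably live in PHY/sse_intrin.h
 * (included unconditionally elsewhere in this patch); the bodies below are
 * an assumed minimal sketch, not the actual header.
 */
#include <stdint.h>
#if defined(__x86_64__) || defined(__i386__)
#include <emmintrin.h>
typedef __m128i simd_q15_t;                      /* 8 packed Q15 samples */
static inline simd_q15_t set1_int16(int16_t a)   { return _mm_set1_epi16(a); }
static inline simd_q15_t mulhi_int16(simd_q15_t a, simd_q15_t b)
{ return _mm_slli_epi16(_mm_mulhi_epi16(a, b), 1); }  /* (a*b)>>15, truncating */
#elif defined(__arm__)
#include <arm_neon.h>
typedef int16x8_t simd_q15_t;
static inline simd_q15_t set1_int16(int16_t a)   { return vdupq_n_s16(a); }
static inline simd_q15_t mulhi_int16(simd_q15_t a, simd_q15_t b)
{ return vqdmulhq_s16(a, b); }     /* saturating (2*a*b)>>16 == (a*b)>>15 */
#endif
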
- __m128i x2128[960];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[960];//=&ytmp128array2[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa960[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb960[0]; + simd_q15_t *twc128=(simd_q15_t *)&twc960[0]; + simd_q15_t x2128[960];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[960];//=&ytmp128array2[0]; @@ -12180,10 +12599,10 @@ void dft960(int16_t *x,int16_t *y,unsigned char scale_flag) // 240 x 4 } if (scale_flag==1) { - norm128 = _mm_set1_epi16(16384);//dft_norm_table[13]); + norm128 = set1_int16(16384);//dft_norm_table[13]); for (i=0; i<960; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -12868,12 +13287,12 @@ static int16_t twb972[323*2*4] = {32764,-424,32764,-424,32764,-424,32764,-424, void dft972(int16_t *x,int16_t *y,unsigned char scale_flag) // 324 x 3 { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa972[0]; - __m128i *twb128=(__m128i *)&twb972[0]; - __m128i x2128[972];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[972];//=&ytmp128array3[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa972[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb972[0]; + simd_q15_t x2128[972];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[972];//=&ytmp128array3[0]; @@ -12901,10 +13320,10 @@ void dft972(int16_t *x,int16_t *y,unsigned char scale_flag) // 324 x 3 } if (scale_flag==1) { - norm128 = _mm_set1_epi16(dft_norm_table[14]); + norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<972; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -13661,12 +14080,12 @@ static int16_t twb1080[359*2*4] = {32764,-382,32764,-382,32764,-382,32764,-382, void dft1080(int16_t *x,int16_t *y,unsigned char scale_flag) // 360 x 3 { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa1080[0]; - __m128i *twb128=(__m128i *)&twb1080[0]; - __m128i x2128[1080];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[1080];//=&ytmp128array3[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa1080[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb1080[0]; + simd_q15_t x2128[1080];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[1080];//=&ytmp128array3[0]; @@ -13694,10 +14113,10 @@ void dft1080(int16_t *x,int16_t *y,unsigned char scale_flag) // 360 x 3 } if (scale_flag==1) { - norm128 = _mm_set1_epi16(dft_norm_table[14]); + norm128 = set1_int16(dft_norm_table[14]); for (i=0; i<1080; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -14611,13 +15030,13 @@ void dft1152(int16_t *x,int16_t *y,unsigned char scale_flag) // 288 x 4 { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa1152[0]; - __m128i *twb128=(__m128i *)&twb1152[0]; - __m128i *twc128=(__m128i *)&twc1152[0]; - __m128i x2128[1152];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[1152];//=&ytmp128array2[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa1152[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb1152[0]; + simd_q15_t *twc128=(simd_q15_t *)&twc1152[0]; + simd_q15_t x2128[1152];// = (simd_q15_t 
*)&x2128array[0]; + simd_q15_t ytmp128[1152];//=&ytmp128array2[0]; @@ -14650,10 +15069,10 @@ void dft1152(int16_t *x,int16_t *y,unsigned char scale_flag) // 288 x 4 } if (scale_flag==1) { - norm128 = _mm_set1_epi16(16384);//dft_norm_table[13]); + norm128 = set1_int16(16384);//dft_norm_table[13]); for (i=0; i<1152; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -15569,13 +15988,13 @@ void dft1200(int16_t *x,int16_t *y,unsigned char scale_flag) { int i,j; - __m128i *x128=(__m128i *)x; - __m128i *y128=(__m128i *)y; - __m128i *twa128=(__m128i *)&twa1200[0]; - __m128i *twb128=(__m128i *)&twb1200[0]; - __m128i *twc128=(__m128i *)&twc1200[0]; - __m128i x2128[1200];// = (__m128i *)&x2128array[0]; - __m128i ytmp128[1200];//=&ytmp128array2[0]; + simd_q15_t *x128=(simd_q15_t *)x; + simd_q15_t *y128=(simd_q15_t *)y; + simd_q15_t *twa128=(simd_q15_t *)&twa1200[0]; + simd_q15_t *twb128=(simd_q15_t *)&twb1200[0]; + simd_q15_t *twc128=(simd_q15_t *)&twc1200[0]; + simd_q15_t x2128[1200];// = (simd_q15_t *)&x2128array[0]; + simd_q15_t ytmp128[1200];//=&ytmp128array2[0]; @@ -15608,10 +16027,10 @@ void dft1200(int16_t *x,int16_t *y,unsigned char scale_flag) } if (scale_flag==1) { - norm128 = _mm_set1_epi16(16384);//dft_norm_table[13]); + norm128 = set1_int16(16384);//dft_norm_table[13]); for (i=0; i<1200; i++) { - y128[i] = _mm_slli_epi16(_mm_mulhi_epi16(y128[i],norm128),1); + y128[i] = mulhi_int16(y128[i],norm128); } } @@ -15632,14 +16051,14 @@ int main(int argc, char**argv) time_stats_t ts; - __m128i x[2048],y[2048],tw0,tw1,tw2,tw3; + simd_q15_t x[2048],y[2048],tw0,tw1,tw2,tw3; int i; set_taus_seed(0); - /* + /* ((int16_t *)&tw0)[0] = 32767; ((int16_t *)&tw0)[1] = 0; ((int16_t *)&tw0)[2] = 32767; @@ -15677,8 +16096,13 @@ int main(int argc, char**argv) ((int16_t *)&tw3)[7] = 0; for (i=0;i<300;i++) { +#if defined(__x86_64__) || defined(__i386__) x[i] = _mm_set1_epi32(taus()); x[i] = _mm_srai_epi16(x[i],4); +#elif defined(__arm__) + x[i] = (int16x8_t)vdupq_n_s32(taus()); + x[i] = vshrq_n_s16(x[i],4); +#endif } bfly2_tw1(x,x+1,y,y+1); @@ -15693,7 +16117,6 @@ int main(int argc, char**argv) printf("3(%d,%d) (%d,%d) => (%d,%d) (%d,%d)\n",((int16_t*)&x[0])[0],((int16_t*)&x[0])[1],((int16_t*)&x[1])[0],((int16_t*)&x[1])[1],((int16_t*)&y[0])[6],((int16_t*)&y[0])[7],((int16_t*)&y[1])[6],((int16_t*)&y[1])[7]); bfly2(x,x+1,y,y+1, &tw0); - bfly3_tw1(x,x+1,x+2,y, y+1,y+2); printf("0(%d,%d) (%d,%d) (%d %d) => (%d,%d) (%d,%d) (%d %d)\n",((int16_t*)&x[0])[0],((int16_t*)&x[0])[1],((int16_t*)&x[1])[0],((int16_t*)&x[1])[1],((int16_t*)&x[2])[0],((int16_t*)&x[2])[1],((int16_t*)&y[0])[0],((int16_t*)&y[0])[1],((int16_t*)&y[1])[0],((int16_t*)&y[1])[1],((int16_t*)&y[2])[0],((int16_t*)&y[2])[1]); printf("1(%d,%d) (%d,%d) (%d %d) => (%d,%d) (%d,%d) (%d %d)\n",((int16_t*)&x[0])[0],((int16_t*)&x[0])[1],((int16_t*)&x[1])[0],((int16_t*)&x[1])[1],((int16_t*)&x[2])[0],((int16_t*)&x[2])[1],((int16_t*)&y[0])[2],((int16_t*)&y[0])[3],((int16_t*)&y[1])[2],((int16_t*)&y[1])[3],((int16_t*)&y[2])[2],((int16_t*)&y[2])[3]); @@ -15958,11 +16381,36 @@ int main(int argc, char**argv) printf("%d,%d,%d,%d,%d,%d,%d,%d,",((int16_t*)&y[i])[0],((int16_t *)&y[i])[1],((int16_t*)&y[i])[2],((int16_t *)&y[i])[3],((int16_t*)&y[i])[4],((int16_t *)&y[i])[5],((int16_t*)&y[i])[6],((int16_t *)&y[i])[7]); printf("\n"); */ - for (i=0; i<128; i++) { - ((int16_t*)x)[i] = (int16_t)((taus()&0xffff))>>5; + memset((void*)&x[0],0,64*4); +/* + for (i=0; i<64; i+=4) { + ((int16_t*)x)[i<<1] = 1024; + 
((int16_t*)x)[1+(i<<1)] = 0; + ((int16_t*)x)[2+(i<<1)] = 0; + ((int16_t*)x)[3+(i<<1)] = 1024; + ((int16_t*)x)[4+(i<<1)] = -1024; + ((int16_t*)x)[5+(i<<1)] = 0; + ((int16_t*)x)[6+(i<<1)] = 0; + ((int16_t*)x)[7+(i<<1)] = -1024; + } +*/ + for (i=0;i<64;i++) { + ((int16_t*)x)[i] = (int16_t)((taus()&0xffff))>>5; } - memset((void*)&y[0],0,64*4); +/* + dft16((int16_t *)x,(int16_t *)y); + printf("16-point\n"); + printf("X: "); + for (i=0;i<4;i++) + printf("%d,%d,%d,%d,%d,%d,%d,%d,",((int16_t*)&x[i])[0],((int16_t *)&x[i])[1],((int16_t*)&x[i])[2],((int16_t *)&x[i])[3],((int16_t*)&x[i])[4],((int16_t*)&x[i])[5],((int16_t*)&x[i])[6],((int16_t*)&x[i])[7]); + printf("\nY:"); + + for (i=0;i<4;i++) + printf("%d,%d,%d,%d,%d,%d,%d,%d,",((int16_t*)&y[i])[0],((int16_t *)&y[i])[1],((int16_t*)&y[i])[2],((int16_t *)&y[i])[3],((int16_t*)&y[i])[4],((int16_t *)&y[i])[5],((int16_t*)&y[i])[6],((int16_t *)&y[i])[7]); + printf("\n"); + exit(-1); +*/ dft64((int16_t *)x,(int16_t *)y,1); dft64((int16_t *)x,(int16_t *)y,1); dft64((int16_t *)x,(int16_t *)y,1); @@ -15976,8 +16424,11 @@ int main(int argc, char**argv) } printf("\n\n64-point (%f cycles)\n",(double)ts.diff/(double)ts.trials); + write_output("x64.m","x64",x,64,1,1); + write_output("y64.m","y64",y,64,1,1); - /*printf("X: "); +/* + printf("X: "); for (i=0;i<16;i++) printf("%d,%d,%d,%d,%d,%d,%d,%d,",((int16_t*)&x[i])[0],((int16_t *)&x[i])[1],((int16_t*)&x[i])[2],((int16_t *)&x[i])[3],((int16_t*)&x[i])[4],((int16_t*)&x[i])[5],((int16_t*)&x[i])[6],((int16_t*)&x[i])[7]); printf("\nY:"); @@ -15985,11 +16436,17 @@ int main(int argc, char**argv) for (i=0;i<16;i++) printf("%d,%d,%d,%d,%d,%d,%d,%d,",((int16_t*)&y[i])[0],((int16_t *)&y[i])[1],((int16_t*)&y[i])[2],((int16_t *)&y[i])[3],((int16_t*)&y[i])[4],((int16_t *)&y[i])[5],((int16_t*)&y[i])[6],((int16_t *)&y[i])[7]); printf("\n"); - */ + + idft64((int16_t*)y,(int16_t*)x,1); + printf("X: "); + for (i=0;i<16;i++) + printf("%d,%d,%d,%d,%d,%d,%d,%d,",((int16_t*)&x[i])[0],((int16_t *)&x[i])[1],((int16_t*)&x[i])[2],((int16_t *)&x[i])[3],((int16_t*)&x[i])[4],((int16_t*)&x[i])[5],((int16_t*)&x[i])[6],((int16_t*)&x[i])[7]); + for (i=0; i<256; i++) { ((int16_t*)x)[i] = (int16_t)((taus()&0xffff))>>5; } - +*/ + memset((void*)&y[0],0,128*4); reset_meas(&ts); diff --git a/openair1/PHY/TOOLS/signal_energy.c b/openair1/PHY/TOOLS/signal_energy.c index 41fd6ca87b..39926a8315 100755 --- a/openair1/PHY/TOOLS/signal_energy.c +++ b/openair1/PHY/TOOLS/signal_energy.c @@ -28,19 +28,16 @@ *******************************************************************************/ #include "defs.h" -#ifndef EXPRESSMIMO_TARGET #include "PHY/sse_intrin.h" -#endif //EXPRESSMIMO_TARGET // Compute Energy of a complex signal vector, removing the DC component! 
// input : points to vector // length : length of vector in complex samples #define shift 4 -#define shift_DC 0 +//#define shift_DC 0 - -#ifndef EXPRESSMIMO_TARGET +#if defined(__x86_64__) || defined(__i386__) #ifdef LOCALIZATION int32_t subcarrier_energy(int32_t *input,uint32_t length, int32_t *subcarrier_energy, uint16_t rx_power_correction) { @@ -73,6 +70,7 @@ int32_t subcarrier_energy(int32_t *input,uint32_t length, int32_t *subcarrier_en return i; } #endif + int32_t signal_energy(int32_t *input,uint32_t length) { @@ -81,9 +79,6 @@ int32_t signal_energy(int32_t *input,uint32_t length) register __m64 mm0,mm1,mm2,mm3; __m64 *in = (__m64 *)input; -#ifdef MAIN - int16_t *printb; -#endif mm0 = _mm_setzero_si64();//pxor(mm0,mm0); mm3 = _mm_setzero_si64();//pxor(mm3,mm3); @@ -95,35 +90,14 @@ int32_t signal_energy(int32_t *input,uint32_t length) mm1 = _m_pmaddwd(mm1,mm1); mm1 = _m_psradi(mm1,shift);// shift any 32 bits blocs of the word by the value shift mm0 = _m_paddd(mm0,mm1);// add the two 64 bits words 4 bytes by 4 bytes - // temp2 = mm0; - // printf("%d %d\n",((int *)&temp2)[0],((int *)&temp2)[1]); - - - // printb = (int16_t *)&mm2; - // printf("mm2 %d : %d %d %d %d\n",i,printb[0],printb[1],printb[2],printb[3]); - - mm2 = _m_psrawi(mm2,shift_DC); + // mm2 = _m_psrawi(mm2,shift_DC); mm3 = _m_paddw(mm3,mm2);// add the two 64 bits words 2 bytes by 2 bytes - - // printb = (int16_t *)&mm3; - // printf("mm3 %d : %d %d %d %d\n",i,printb[0],printb[1],printb[2],printb[3]); - } - /* - #ifdef MAIN - printb = (int16_t *)&mm3; - printf("%d %d %d %d\n",printb[0],printb[1],printb[2],printb[3]); - #endif - */ mm1 = mm0; - mm0 = _m_psrlqi(mm0,32); - mm0 = _m_paddd(mm0,mm1); - temp = _m_to_int(mm0); - temp/=length; temp<<=shift; // this is the average of x^2 @@ -132,25 +106,11 @@ int32_t signal_energy(int32_t *input,uint32_t length) mm2 = _m_psrlqi(mm3,32); mm2 = _m_paddw(mm2,mm3); - mm2 = _m_pmaddwd(mm2,mm2); - temp2 = _m_to_int(mm2); - temp2/=(length*length); - - temp2<<=(2*shift_DC); -#ifdef MAIN - printf("E x^2 = %d\n",temp); -#endif + // temp2<<=(2*shift_DC); temp -= temp2; -#ifdef MAIN - printf("(E x)^2=%d\n",temp2); -#endif - _mm_empty(); - _m_empty(); - - return((temp>0)?temp:1); } @@ -214,6 +174,81 @@ int32_t signal_energy_nodc(int32_t *input,uint32_t length) return((temp>0)?temp:1); } +#elif defined(__arm__) + +int32_t signal_energy(int32_t *input,uint32_t length) +{ + + int32_t i; + int32_t temp,temp2; + register int32x4_t tmpE,tmpDC; + int32x2_t tmpE2,tmpDC2; + int16x4_t *in = (int16x4_t *)input; + + tmpE = vdupq_n_s32(0); + tmpDC = vdupq_n_s32(0); + + for (i=0; i<length>>1; i++) { + + tmpE = vqaddq_s32(tmpE,vshrq_n_s32(vmull_s16(*in,*in),shift)); + tmpDC = vaddw_s16(tmpDC,vshr_n_s16(*in++,shift_DC)); + + } + + tmpE2 = vpadd_s32(vget_low_s32(tmpE),vget_high_s32(tmpE)); + + temp=(vget_lane_s32(tmpE2,0)+vget_lane_s32(tmpE2,1))/length; + temp<<=shift; // this is the average of x^2 + + // now remove the DC component + + + tmpDC2 = vpadd_s32(vget_low_s32(tmpDC),vget_high_s32(tmpDC)); + + temp2=(vget_lane_s32(tmpDC2,0)+vget_lane_s32(tmpDC2,1))/(length*length); + + // temp2<<=(2*shift_DC); +#ifdef MAIN + printf("E x^2 = %d\n",temp); +#endif + temp -= temp2; +#ifdef MAIN + printf("(E x)^2=%d\n",temp2); +#endif + + return((temp>0)?temp:1); +} + +int32_t signal_energy_nodc(int32_t *input,uint32_t length) +{ + + int32_t i; + int32_t temp; + register int32x4_t tmpE; + int32x2_t tmpE2; + int16x4_t *in = (int16x4_t *)input; + + tmpE = vdupq_n_s32(0); + + for (i=0; i<length>>1; i++) { + + tmpE = 
vqaddq_s32(tmpE,vshrq_n_s32(vmull_s16(*in,*in),shift)); + + } + + tmpE2 = vpadd_s32(vget_low_s32(tmpE),vget_high_s32(tmpE)); + + temp=(vget_lane_s32(tmpE2,0)+vget_lane_s32(tmpE2,1))/length; + temp<<=shift; // this is the average of x^2 + +#ifdef MAIN + printf("E x^2 = %d\n",temp); +#endif + + return((temp>0)?temp:1); +} + +#endif double signal_energy_fp(double **s_re,double **s_im,uint32_t nb_antennas,uint32_t length,uint32_t offset) { @@ -243,13 +278,6 @@ double signal_energy_fp2(struct complex *s,uint32_t length) return(V/length); } -#else - -int32_t signal_energy(int32_t *input,uint32_t length) -{ -} - -#endif #ifdef MAIN #define LENGTH 256 diff --git a/openair1/PHY/TOOLS/time_meas.c b/openair1/PHY/TOOLS/time_meas.c index 615b437deb..c734db09af 100644 --- a/openair1/PHY/TOOLS/time_meas.c +++ b/openair1/PHY/TOOLS/time_meas.c @@ -33,16 +33,7 @@ // global var for openair performance profiler int opp_enabled = 0; -/* - double get_cpu_freq_GHz(void) { - time_stats_t ts; - reset_meas(&ts); - start_meas(&ts); - sleep(1); - stop_meas(&ts); - return (double)ts.diff/1000000000; - }*/ double get_cpu_freq_GHz(void) { diff --git a/openair1/PHY/TOOLS/time_meas.h b/openair1/PHY/TOOLS/time_meas.h index a72754c77c..2bd6dc489d 100644 --- a/openair1/PHY/TOOLS/time_meas.h +++ b/openair1/PHY/TOOLS/time_meas.h @@ -26,15 +26,19 @@ Address : Eurecom, Campus SophiaTech, 450 Route des Chappes, CS 50193 - 06904 Biot Sophia Antipolis cedex, FRANCE *******************************************************************************/ -#ifdef OMP -#include <omp.h> -#endif #include <unistd.h> #include <math.h> +#include <stdint.h> +#include <time.h> +#include <errno.h> +#include <stdio.h> +#include <pthread.h> +#include <linux/kernel.h> +#include <linux/types.h> // global var to enable openair performance profiler extern int opp_enabled; - double cpu_freq_GHz; +#if defined(__x86_64__) || defined(__i386__) typedef struct { @@ -46,7 +50,18 @@ typedef struct { long long max; int trials; } time_stats_t; +#elif defined(__arm__) +typedef struct { + uint32_t in; + uint32_t diff_now; + uint32_t diff; + uint32_t p_time; /*!< \brief absolute process duration */ + uint32_t diff_square; /*!< \brief process duration square */ + uint32_t max; + int trials; +} time_stats_t; +#endif static inline void start_meas(time_stats_t *ts) __attribute__((always_inline)); static inline void stop_meas(time_stats_t *ts) __attribute__((always_inline)); @@ -74,12 +89,12 @@ static inline unsigned long long rdtsc_oai(void) } #elif defined(__arm__) -static inline unsigned long long rdtsc_oai(void) __attribute__((always_inline)); -static inline unsigned long long rdtsc_oai(void) +static inline uint32_t rdtsc_oai(void) __attribute__((always_inline)); +static inline uint32_t rdtsc_oai(void) { uint32_t r = 0; asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(r) ); - return (unsigned long long)r; + return r; } #endif @@ -88,17 +103,8 @@ static inline void start_meas(time_stats_t *ts) if (opp_enabled) { -#ifdef OMP - int tid; - - tid = omp_get_thread_num(); - - if (tid==0) -#endif - { ts->trials++; ts->in = rdtsc_oai(); - } } } @@ -108,24 +114,16 @@ static inline void stop_meas(time_stats_t *ts) if (opp_enabled) { long long out = rdtsc_oai(); -#ifdef OMP - int tid; - tid = omp_get_thread_num(); - - if (tid==0) -#endif - { ts->diff_now = (out-ts->in); ts->diff += (out-ts->in); /// process duration is the difference between two clock points ts->p_time = (out-ts->in); - ts->diff_square += pow((out-ts->in),2); + ts->diff_square += (out-ts->in)*(out-ts->in); if 
((out-ts->in) > ts->max) ts->max = out-ts->in; - } } } @@ -159,11 +157,3 @@ static inline void copy_meas(time_stats_t *dst_ts,time_stats_t *src_ts) dst_ts->max=src_ts->max; } } - -/*static inline double get_mean_meas_us(time_stats_t *ts, double cpu_freq_GHz) { - - return (double) ts->diff/ts->trials/cpu_freq_GHz/1000.0; - - } -*/ - diff --git a/openair1/PHY/TOOLS/vars.h b/openair1/PHY/TOOLS/vars.h index 7a5b283a8e..f6edc6bb34 100644 --- a/openair1/PHY/TOOLS/vars.h +++ b/openair1/PHY/TOOLS/vars.h @@ -25,4 +25,4 @@ Address : Eurecom, Campus SophiaTech, 450 Route des Chappes, CS 50193 - 06904 Biot Sophia Antipolis cedex, FRANCE - *******************************************************************************/ \ No newline at end of file + *******************************************************************************/ diff --git a/openair1/PHY/defs.h b/openair1/PHY/defs.h index 89fe4c2af2..e90f6ba435 100755 --- a/openair1/PHY/defs.h +++ b/openair1/PHY/defs.h @@ -65,28 +65,13 @@ //use msg in the real-time thread context #define msg_nrt printf //use msg_nrt in the non real-time context (for initialization, ...) -#ifdef EXPRESSMIMO_TARGET -#define malloc16(x) malloc(x) -#else //EXPRESSMIMO_TARGET #define malloc16(x) memalign(16,x) -#endif //EXPRESSMIMO_TARGET #define free16(y,x) free(y) #define bigmalloc malloc #define bigmalloc16 malloc16 #define openair_free(y,x) free((y)) #define PAGE_SIZE 4096 -#ifdef EXPRESSMIMO_TARGET -//! \brief Allocate \c size bytes of memory on the heap and zero it afterwards. -//! If no more memory is available, this function will terminate the program with an assertion error. -static inline void* malloc16_clear( size_t size ) -{ - void* ptr = malloc(size); - DevAssert(ptr); - memset( ptr, 0, size ); - return ptr; -} -#else //EXPRESSMIMO_TARGET //! \brief Allocate \c size bytes of memory on the heap with alignment 16 and zero it afterwards. //! If no more memory is available, this function will terminate the program with an assertion error. 
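
/*
 * Note on the signal_energy() NEON code earlier in this patch: the x86 path
 * comments out shift_DC together with its #define, but the new ARM path
 * still evaluates vshr_n_s16(*in++, shift_DC) with the macro no longer
 * defined, so it cannot compile as posted. NEON also restricts the immediate
 * of vshr_n_s16() to the range 1..16, so restoring "#define shift_DC 0"
 * would not help. A minimal repair, assuming the intended DC pre-shift stays
 * 0, folds the shift away (sketch; the helper name is illustrative, not
 * shipped code):
 */
#include <arm_neon.h>
static inline int32x4_t acc_dc_step(int32x4_t tmpDC, int16x4_t s)
{
  return vaddw_s16(tmpDC, s);   /* widening add of four Q15 samples, no shift */
}
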
static inline void* malloc16_clear( size_t size ) @@ -96,7 +81,7 @@ static inline void* malloc16_clear( size_t size ) memset( ptr, 0, size ); return ptr; } -#endif //EXPRESSMIMO_TARGET + #define PAGE_MASK 0xfffff000 @@ -119,10 +104,6 @@ static inline void* malloc16_clear( size_t size ) /// suppress compiler warning for unused arguments #define UNUSED(x) (void)x; -#ifdef EXPRESSMIMO_TARGET -#define Zero_Buffer(x,y) Zero_Buffer_nommx(x,y) -#endif //EXPRESSMiMO_TARGET - #include "spec_defs_top.h" #include "impl_defs_top.h" diff --git a/openair1/SIMULATION/LTE_PHY/dlsim.c b/openair1/SIMULATION/LTE_PHY/dlsim.c index 5e6929e480..a9fb0fe94c 100644 --- a/openair1/SIMULATION/LTE_PHY/dlsim.c +++ b/openair1/SIMULATION/LTE_PHY/dlsim.c @@ -233,7 +233,7 @@ void do_OFDM_mod_l(mod_sym_t **txdataF, int32_t **txdata, uint16_t next_slot, LT int main(int argc, char **argv) { - char c; + int c; int k,i,aa,aarx,aatx; int s,Kr,Kr_bytes; @@ -347,12 +347,24 @@ int main(int argc, char **argv) LTE_DL_UE_HARQ_t *dlsch0_ue_harq; LTE_DL_eNB_HARQ_t *dlsch0_eNB_harq; uint8_t Kmimo; - + FILE *proc_fd = NULL; + char buf[64]; opp_enabled=1; // to enable the time meas - cpu_freq_GHz = (double)get_cpu_freq_GHz(); - +#if defined(__arm__) + proc_fd = fopen("/sys/devices/system/cpu/cpu4/cpufreq/cpuinfo_cur_freq", "r"); + if(!proc_fd) + printf("cannot open /sys/devices/system/cpu/cpu4/cpufreq/cpuinfo_cur_freq"); + else { + while(fgets(buf, 63, proc_fd)) + printf("%s", buf); + } + fclose(proc_fd); + cpu_freq_GHz = ((double)atof(buf))/1e6; +#else + cpu_freq_GHz = get_cpu_freq_GHz(); +#endif printf("Detected cpu_freq %f GHz\n",cpu_freq_GHz); //signal(SIGSEGV, handler); @@ -1989,7 +2001,8 @@ int main(int argc, char **argv) if (input_trch_file==0) { for (i=0; i<input_buffer_length0; i++) { - input_buffer0[k][i]= (unsigned char)(taus()&0xff); + //input_buffer0[k][i] = (unsigned char)(i&0xff); + input_buffer0[k][i] = (unsigned char)(taus()&0xff); } for (i=0; i<input_buffer_length1; i++) { @@ -2690,7 +2703,6 @@ PMI_FEEDBACK: write_output("txsigF1.m","txsF1", &PHY_vars_eNB->lte_eNB_common_vars.txdataF[eNB_id][1][subframe*nsymb*PHY_vars_eNB->lte_frame_parms.ofdm_symbol_size], nsymb*PHY_vars_eNB->lte_frame_parms.ofdm_symbol_size,1,1); } - tx_lev = 0; for (aa=0; aa<PHY_vars_eNB->lte_frame_parms.nb_antennas_tx; aa++) { diff --git a/openair1/SIMULATION/TOOLS/multipath_channel.c b/openair1/SIMULATION/TOOLS/multipath_channel.c index 65c8737c81..7a21db2a57 100644 --- a/openair1/SIMULATION/TOOLS/multipath_channel.c +++ b/openair1/SIMULATION/TOOLS/multipath_channel.c @@ -1,222 +1,222 @@ -/******************************************************************************* - OpenAirInterface - Copyright(c) 1999 - 2014 Eurecom - - OpenAirInterface is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - - OpenAirInterface is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with OpenAirInterface.The full GNU General Public License is - included in this distribution in the file called "COPYING". If not, - see <http://www.gnu.org/licenses/>. 
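
/*
 * Note on the dlsim.c hunk above: the ARM branch derives cpu_freq_GHz from
 * /sys/devices/system/cpu/cpu4/cpufreq/cpuinfo_cur_freq, but fclose(proc_fd)
 * is reached even when fopen() failed, and buf is then passed uninitialized
 * to atof(). A guarded variant (sketch; the helper name is hypothetical):
 */
#include <stdio.h>
#include <stdlib.h>
static double arm_cpu_freq_GHz(void)
{
  char buf[64] = "0";
  FILE *fd = fopen("/sys/devices/system/cpu/cpu4/cpufreq/cpuinfo_cur_freq", "r");
  if (fd == NULL) {
    printf("cannot open cpuinfo_cur_freq\n");
    return 0.0;
  }
  while (fgets(buf, sizeof(buf), fd) != NULL)
    printf("%s", buf);
  fclose(fd);               /* closed only when the open succeeded */
  return atof(buf) / 1e6;   /* sysfs reports kHz; kHz / 1e6 = GHz */
}
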
- - Contact Information - OpenAirInterface Admin: openair_admin@eurecom.fr - OpenAirInterface Tech : openair_tech@eurecom.fr - OpenAirInterface Dev : openair4g-devel@eurecom.fr - - Address : Eurecom, Campus SophiaTech, 450 Route des Chappes, CS 50193 - 06904 Biot Sophia Antipolis cedex, FRANCE - - *******************************************************************************/ -#include <stdio.h> -#include <stdlib.h> -#include <math.h> -#include <string.h> -#include "defs.h" -#include "SIMULATION/RF/defs.h" - -//#define DEBUG_CH -uint8_t multipath_channel_nosigconv(channel_desc_t *desc) -{ - - random_channel(desc,0); - return(1); -} - -#define CHANNEL_SSE -#ifdef CHANNEL_SSE -void multipath_channel(channel_desc_t *desc, - double **tx_sig_re, - double **tx_sig_im, - double **rx_sig_re, - double **rx_sig_im, - uint32_t length, - uint8_t keep_channel) -{ - - int i,ii,j,l; - int length1, length2, tail; - __m128d rx_tmp128_re_f,rx_tmp128_im_f,rx_tmp128_re,rx_tmp128_im, rx_tmp128_1,rx_tmp128_2,rx_tmp128_3,rx_tmp128_4,tx128_re,tx128_im,ch128_x,ch128_y,pathloss128; - - double path_loss = pow(10,desc->path_loss_dB/20); - int dd = abs(desc->channel_offset); - - pathloss128 = _mm_set1_pd(path_loss); - -#ifdef DEBUG_CH - printf("[CHANNEL] keep = %d : path_loss = %g (%f), nb_rx %d, nb_tx %d, dd %d, len %d \n",keep_channel,path_loss,desc->path_loss_dB,desc->nb_rx,desc->nb_tx,dd,desc->channel_length); -#endif - - if (keep_channel) { - // do nothing - keep channel - } else { - random_channel(desc,0); - } - - start_meas(&desc->convolution); - -#ifdef DEBUG_CH - - for (l = 0; l<(int)desc->channel_length; l++) { - printf("%p (%f,%f) ",desc->ch[0],desc->ch[0][l].x,desc->ch[0][l].y); - } - - printf("\n"); -#endif - - tail = ((int)length-dd)%2; - - if(tail) - length1 = ((int)length-dd)-1; - else - length1 = ((int)length-dd); - - length2 = length1/2; - - for (i=0; i<length2; i++) { // - for (ii=0; ii<desc->nb_rx; ii++) { - // rx_tmp.x = 0; - // rx_tmp.y = 0; - rx_tmp128_re_f = _mm_setzero_pd(); - rx_tmp128_im_f = _mm_setzero_pd(); - - for (j=0; j<desc->nb_tx; j++) { - for (l = 0; l<(int)desc->channel_length; l++) { - if ((i>=0) && (i-l)>=0) { //SIMD correct only if length1 > 2*channel_length...which is almost always satisfied - // tx.x = tx_sig_re[j][i-l]; - // tx.y = tx_sig_im[j][i-l]; - tx128_re = _mm_loadu_pd(&tx_sig_re[j][2*i-l]); // tx_sig_re[j][i-l+1], tx_sig_re[j][i-l] - tx128_im = _mm_loadu_pd(&tx_sig_im[j][2*i-l]); - } else { - //tx.x =0; - //tx.y =0; - tx128_re = _mm_setzero_pd(); - tx128_im = _mm_setzero_pd(); - } - - ch128_x = _mm_set1_pd(desc->ch[ii+(j*desc->nb_rx)][l].x); - ch128_y = _mm_set1_pd(desc->ch[ii+(j*desc->nb_rx)][l].y); - // rx_tmp.x += (tx.x * desc->ch[ii+(j*desc->nb_rx)][l].x) - (tx.y * desc->ch[ii+(j*desc->nb_rx)][l].y); - // rx_tmp.y += (tx.y * desc->ch[ii+(j*desc->nb_rx)][l].x) + (tx.x * desc->ch[ii+(j*desc->nb_rx)][l].y); - rx_tmp128_1 = _mm_mul_pd(tx128_re,ch128_x); - rx_tmp128_2 = _mm_mul_pd(tx128_re,ch128_y); - rx_tmp128_3 = _mm_mul_pd(tx128_im,ch128_x); - rx_tmp128_4 = _mm_mul_pd(tx128_im,ch128_y); - rx_tmp128_re = _mm_sub_pd(rx_tmp128_1,rx_tmp128_4); - rx_tmp128_im = _mm_add_pd(rx_tmp128_2,rx_tmp128_3); - rx_tmp128_re_f = _mm_add_pd(rx_tmp128_re_f,rx_tmp128_re); - rx_tmp128_im_f = _mm_add_pd(rx_tmp128_im_f,rx_tmp128_im); - } //l - } // j - - //rx_sig_re[ii][i+dd] = rx_tmp.x*path_loss; - //rx_sig_im[ii][i+dd] = rx_tmp.y*path_loss; - rx_tmp128_re_f = _mm_mul_pd(rx_tmp128_re_f,pathloss128); - rx_tmp128_im_f = _mm_mul_pd(rx_tmp128_im_f,pathloss128); - 
_mm_storeu_pd(&rx_sig_re[ii][2*i+dd],rx_tmp128_re_f); // max index: length-dd -1 + dd = length -1 - _mm_storeu_pd(&rx_sig_im[ii][2*i+dd],rx_tmp128_im_f); - /* - if ((ii==0)&&((i%32)==0)) { - printf("%p %p %f,%f => %e,%e\n",rx_sig_re[ii],rx_sig_im[ii],rx_tmp.x,rx_tmp.y,rx_sig_re[ii][i-dd],rx_sig_im[ii][i-dd]); - } - */ - //rx_sig_re[ii][i] = sqrt(.5)*(tx_sig_re[0][i] + tx_sig_re[1][i]); - //rx_sig_im[ii][i] = sqrt(.5)*(tx_sig_im[0][i] + tx_sig_im[1][i]); - - } // ii - } // i - - stop_meas(&desc->convolution); - -} - -#else -void multipath_channel(channel_desc_t *desc, - double **tx_sig_re, - double **tx_sig_im, - double **rx_sig_re, - double **rx_sig_im, - uint32_t length, - uint8_t keep_channel) -{ - - int i,ii,j,l; - struct complex rx_tmp,tx; - - double path_loss = pow(10,desc->path_loss_dB/20); - int dd; - dd = abs(desc->channel_offset); - -#ifdef DEBUG_CH - printf("[CHANNEL] keep = %d : path_loss = %g (%f), nb_rx %d, nb_tx %d, dd %d, len %d \n",keep_channel,path_loss,desc->path_loss_dB,desc->nb_rx,desc->nb_tx,dd,desc->channel_length); -#endif - - if (keep_channel) { - // do nothing - keep channel - } else { - random_channel(desc,0); - } - -#ifdef DEBUG_CH - - for (l = 0; l<(int)desc->channel_length; l++) { - printf("%p (%f,%f) ",desc->ch[0],desc->ch[0][l].x,desc->ch[0][l].y); - } - - printf("\n"); -#endif - - for (i=0; i<((int)length-dd); i++) { - for (ii=0; ii<desc->nb_rx; ii++) { - rx_tmp.x = 0; - rx_tmp.y = 0; - - for (j=0; j<desc->nb_tx; j++) { - for (l = 0; l<(int)desc->channel_length; l++) { - if ((i>=0) && (i-l)>=0) { - tx.x = tx_sig_re[j][i-l]; - tx.y = tx_sig_im[j][i-l]; - } else { - tx.x =0; - tx.y =0; - } - - rx_tmp.x += (tx.x * desc->ch[ii+(j*desc->nb_rx)][l].x) - (tx.y * desc->ch[ii+(j*desc->nb_rx)][l].y); - rx_tmp.y += (tx.y * desc->ch[ii+(j*desc->nb_rx)][l].x) + (tx.x * desc->ch[ii+(j*desc->nb_rx)][l].y); - } //l - } // j - - rx_sig_re[ii][i+dd] = rx_tmp.x*path_loss; - rx_sig_im[ii][i+dd] = rx_tmp.y*path_loss; - /* - if ((ii==0)&&((i%32)==0)) { - printf("%p %p %f,%f => %e,%e\n",rx_sig_re[ii],rx_sig_im[ii],rx_tmp.x,rx_tmp.y,rx_sig_re[ii][i-dd],rx_sig_im[ii][i-dd]); - } - */ - //rx_sig_re[ii][i] = sqrt(.5)*(tx_sig_re[0][i] + tx_sig_re[1][i]); - //rx_sig_im[ii][i] = sqrt(.5)*(tx_sig_im[0][i] + tx_sig_im[1][i]); - - } // ii - } // i -} -#endif - - +/******************************************************************************* + OpenAirInterface + Copyright(c) 1999 - 2014 Eurecom + + OpenAirInterface is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + + OpenAirInterface is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OpenAirInterface.The full GNU General Public License is + included in this distribution in the file called "COPYING". If not, + see <http://www.gnu.org/licenses/>. 
+ + Contact Information + OpenAirInterface Admin: openair_admin@eurecom.fr + OpenAirInterface Tech : openair_tech@eurecom.fr + OpenAirInterface Dev : openair4g-devel@eurecom.fr + + Address : Eurecom, Campus SophiaTech, 450 Route des Chappes, CS 50193 - 06904 Biot Sophia Antipolis cedex, FRANCE + + *******************************************************************************/ +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <string.h> +#include "defs.h" +#include "SIMULATION/RF/defs.h" + +//#define DEBUG_CH +uint8_t multipath_channel_nosigconv(channel_desc_t *desc) +{ + + random_channel(desc,0); + return(1); +} + +//#define CHANNEL_SSE +#ifdef CHANNEL_SSE +void multipath_channel(channel_desc_t *desc, + double **tx_sig_re, + double **tx_sig_im, + double **rx_sig_re, + double **rx_sig_im, + uint32_t length, + uint8_t keep_channel) +{ + + int i,ii,j,l; + int length1, length2, tail; + __m128d rx_tmp128_re_f,rx_tmp128_im_f,rx_tmp128_re,rx_tmp128_im, rx_tmp128_1,rx_tmp128_2,rx_tmp128_3,rx_tmp128_4,tx128_re,tx128_im,ch128_x,ch128_y,pathloss128; + + double path_loss = pow(10,desc->path_loss_dB/20); + int dd = abs(desc->channel_offset); + + pathloss128 = _mm_set1_pd(path_loss); + +#ifdef DEBUG_CH + printf("[CHANNEL] keep = %d : path_loss = %g (%f), nb_rx %d, nb_tx %d, dd %d, len %d \n",keep_channel,path_loss,desc->path_loss_dB,desc->nb_rx,desc->nb_tx,dd,desc->channel_length); +#endif + + if (keep_channel) { + // do nothing - keep channel + } else { + random_channel(desc,0); + } + + start_meas(&desc->convolution); + +#ifdef DEBUG_CH + + for (l = 0; l<(int)desc->channel_length; l++) { + printf("%p (%f,%f) ",desc->ch[0],desc->ch[0][l].x,desc->ch[0][l].y); + } + + printf("\n"); +#endif + + tail = ((int)length-dd)%2; + + if(tail) + length1 = ((int)length-dd)-1; + else + length1 = ((int)length-dd); + + length2 = length1/2; + + for (i=0; i<length2; i++) { // + for (ii=0; ii<desc->nb_rx; ii++) { + // rx_tmp.x = 0; + // rx_tmp.y = 0; + rx_tmp128_re_f = _mm_setzero_pd(); + rx_tmp128_im_f = _mm_setzero_pd(); + + for (j=0; j<desc->nb_tx; j++) { + for (l = 0; l<(int)desc->channel_length; l++) { + if ((i>=0) && (i-l)>=0) { //SIMD correct only if length1 > 2*channel_length...which is almost always satisfied + // tx.x = tx_sig_re[j][i-l]; + // tx.y = tx_sig_im[j][i-l]; + tx128_re = _mm_loadu_pd(&tx_sig_re[j][2*i-l]); // tx_sig_re[j][i-l+1], tx_sig_re[j][i-l] + tx128_im = _mm_loadu_pd(&tx_sig_im[j][2*i-l]); + } else { + //tx.x =0; + //tx.y =0; + tx128_re = _mm_setzero_pd(); + tx128_im = _mm_setzero_pd(); + } + + ch128_x = _mm_set1_pd(desc->ch[ii+(j*desc->nb_rx)][l].x); + ch128_y = _mm_set1_pd(desc->ch[ii+(j*desc->nb_rx)][l].y); + // rx_tmp.x += (tx.x * desc->ch[ii+(j*desc->nb_rx)][l].x) - (tx.y * desc->ch[ii+(j*desc->nb_rx)][l].y); + // rx_tmp.y += (tx.y * desc->ch[ii+(j*desc->nb_rx)][l].x) + (tx.x * desc->ch[ii+(j*desc->nb_rx)][l].y); + rx_tmp128_1 = _mm_mul_pd(tx128_re,ch128_x); + rx_tmp128_2 = _mm_mul_pd(tx128_re,ch128_y); + rx_tmp128_3 = _mm_mul_pd(tx128_im,ch128_x); + rx_tmp128_4 = _mm_mul_pd(tx128_im,ch128_y); + rx_tmp128_re = _mm_sub_pd(rx_tmp128_1,rx_tmp128_4); + rx_tmp128_im = _mm_add_pd(rx_tmp128_2,rx_tmp128_3); + rx_tmp128_re_f = _mm_add_pd(rx_tmp128_re_f,rx_tmp128_re); + rx_tmp128_im_f = _mm_add_pd(rx_tmp128_im_f,rx_tmp128_im); + } //l + } // j + + //rx_sig_re[ii][i+dd] = rx_tmp.x*path_loss; + //rx_sig_im[ii][i+dd] = rx_tmp.y*path_loss; + rx_tmp128_re_f = _mm_mul_pd(rx_tmp128_re_f,pathloss128); + rx_tmp128_im_f = _mm_mul_pd(rx_tmp128_im_f,pathloss128); + 
_mm_storeu_pd(&rx_sig_re[ii][2*i+dd],rx_tmp128_re_f); // max index: length-dd -1 + dd = length -1 + _mm_storeu_pd(&rx_sig_im[ii][2*i+dd],rx_tmp128_im_f); + /* + if ((ii==0)&&((i%32)==0)) { + printf("%p %p %f,%f => %e,%e\n",rx_sig_re[ii],rx_sig_im[ii],rx_tmp.x,rx_tmp.y,rx_sig_re[ii][i-dd],rx_sig_im[ii][i-dd]); + } + */ + //rx_sig_re[ii][i] = sqrt(.5)*(tx_sig_re[0][i] + tx_sig_re[1][i]); + //rx_sig_im[ii][i] = sqrt(.5)*(tx_sig_im[0][i] + tx_sig_im[1][i]); + + } // ii + } // i + + stop_meas(&desc->convolution); + +} + +#else +void multipath_channel(channel_desc_t *desc, + double **tx_sig_re, + double **tx_sig_im, + double **rx_sig_re, + double **rx_sig_im, + uint32_t length, + uint8_t keep_channel) +{ + + int i,ii,j,l; + struct complex rx_tmp,tx; + + double path_loss = pow(10,desc->path_loss_dB/20); + int dd; + dd = abs(desc->channel_offset); + +#ifdef DEBUG_CH + printf("[CHANNEL] keep = %d : path_loss = %g (%f), nb_rx %d, nb_tx %d, dd %d, len %d \n",keep_channel,path_loss,desc->path_loss_dB,desc->nb_rx,desc->nb_tx,dd,desc->channel_length); +#endif + + if (keep_channel) { + // do nothing - keep channel + } else { + random_channel(desc,0); + } + +#ifdef DEBUG_CH + + for (l = 0; l<(int)desc->channel_length; l++) { + printf("%p (%f,%f) ",desc->ch[0],desc->ch[0][l].x,desc->ch[0][l].y); + } + + printf("\n"); +#endif + + for (i=0; i<((int)length-dd); i++) { + for (ii=0; ii<desc->nb_rx; ii++) { + rx_tmp.x = 0; + rx_tmp.y = 0; + + for (j=0; j<desc->nb_tx; j++) { + for (l = 0; l<(int)desc->channel_length; l++) { + if ((i>=0) && (i-l)>=0) { + tx.x = tx_sig_re[j][i-l]; + tx.y = tx_sig_im[j][i-l]; + } else { + tx.x =0; + tx.y =0; + } + + rx_tmp.x += (tx.x * desc->ch[ii+(j*desc->nb_rx)][l].x) - (tx.y * desc->ch[ii+(j*desc->nb_rx)][l].y); + rx_tmp.y += (tx.y * desc->ch[ii+(j*desc->nb_rx)][l].x) + (tx.x * desc->ch[ii+(j*desc->nb_rx)][l].y); + } //l + } // j + + rx_sig_re[ii][i+dd] = rx_tmp.x*path_loss; + rx_sig_im[ii][i+dd] = rx_tmp.y*path_loss; + /* + if ((ii==0)&&((i%32)==0)) { + printf("%p %p %f,%f => %e,%e\n",rx_sig_re[ii],rx_sig_im[ii],rx_tmp.x,rx_tmp.y,rx_sig_re[ii][i-dd],rx_sig_im[ii][i-dd]); + } + */ + //rx_sig_re[ii][i] = sqrt(.5)*(tx_sig_re[0][i] + tx_sig_re[1][i]); + //rx_sig_im[ii][i] = sqrt(.5)*(tx_sig_im[0][i] + tx_sig_im[1][i]); + + } // ii + } // i +} +#endif + + -- GitLab
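
/*
 * Closing note: the multipath_channel.c diff above deletes and re-adds the
 * whole file with essentially identical content (apparently a
 * whitespace/line-ending rewrite); the single functional change is
 *     -#define CHANNEL_SSE
 *     +//#define CHANNEL_SSE
 * which selects the portable scalar convolution instead of the SSE2 __m128d
 * version, since this patch provides no double-precision NEON port. The
 * scalar path's inner step is the complex multiply-accumulate sketched below
 * (illustrative only; the real loop lives in multipath_channel() above, and
 * cplx_d merely mirrors the code's "struct complex" with .x/.y members):
 */
typedef struct { double x, y; } cplx_d;
static cplx_d cmac(cplx_d acc, cplx_d tx, cplx_d ch)
{
  /* acc += tx * ch, expanded over real/imaginary parts */
  acc.x += tx.x * ch.x - tx.y * ch.y;
  acc.y += tx.y * ch.x + tx.x * ch.y;
  return acc;
}
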