diff --git a/cmake_targets/CMakeLists.txt b/cmake_targets/CMakeLists.txt
index e4d3acf4b1c976540a95e6cbd033bfa26b9d9ecd..0bf63cfb1ac626240f429314bf0ca312c72e56b2 100644
--- a/cmake_targets/CMakeLists.txt
+++ b/cmake_targets/CMakeLists.txt
@@ -1034,6 +1034,7 @@ set(PHY_SRC
   ${OPENAIR1_DIR}/PHY/LTE_TRANSPORT/dlsch_modulation.c
   ${OPENAIR1_DIR}/PHY/LTE_TRANSPORT/dlsch_demodulation.c
   ${OPENAIR1_DIR}/PHY/LTE_TRANSPORT/dlsch_llr_computation.c
+  ${OPENAIR1_DIR}/PHY/LTE_TRANSPORT/dlsch_llr_computation_avx2.c
   ${OPENAIR1_DIR}/PHY/LTE_TRANSPORT/power_control.c
   ${OPENAIR1_DIR}/PHY/LTE_TRANSPORT/dlsch_decoding.c
   ${OPENAIR1_DIR}/PHY/LTE_TRANSPORT/dlsch_scrambling.c
diff --git a/openair1/PHY/INIT/lte_init.c b/openair1/PHY/INIT/lte_init.c
index d0556a0235c042a9805cd595af26a51a2ae1cbc2..de83506bb31aed698f0c41fa95ff49468c204955 100755
--- a/openair1/PHY/INIT/lte_init.c
+++ b/openair1/PHY/INIT/lte_init.c
@@ -948,6 +948,11 @@ void phy_config_dedicated_ue(uint8_t Mod_id,int CC_id,uint8_t eNB_id,
   // fill cqi parameters for periodic CQI reporting
   get_cqipmiri_params(phy_vars_ue,eNB_id);
 
+  // disable MIB/SIB decoding once we are in connected mode
+  LOG_I(PHY,"Disabling SIB MIB decoding \n");
+  phy_vars_ue->decode_SIB = 0;
+  phy_vars_ue->decode_MIB = 0;
+
 }
 
 void  phy_config_cba_rnti (module_id_t Mod_id,int CC_id,eNB_flag_t eNB_flag, uint8_t index, rnti_t cba_rnti, uint8_t cba_group_id, uint8_t num_active_cba_groups)
@@ -1306,6 +1311,10 @@ int phy_init_lte_ue(PHY_VARS_UE *ue,
   ue->high_speed_flag = 1;
   ue->ch_est_alpha    = 24576;
 
+  // enable MIB/SIB decoding by default
+  ue->decode_MIB = 1;
+  ue->decode_SIB = 1;
+
   init_prach_tables(839);
 
 
diff --git a/openair1/PHY/LTE_ESTIMATION/lte_dl_bf_channel_estimation.c b/openair1/PHY/LTE_ESTIMATION/lte_dl_bf_channel_estimation.c
index 02b5102ae544a16b04e527ac9aed30fc1fc84009..7e112342ce15980c0e199cc930c6428ea8347db7 100644
--- a/openair1/PHY/LTE_ESTIMATION/lte_dl_bf_channel_estimation.c
+++ b/openair1/PHY/LTE_ESTIMATION/lte_dl_bf_channel_estimation.c
@@ -51,10 +51,10 @@ int lte_dl_bf_channel_estimation(PHY_VARS_UE *phy_vars_ue,
   int uespec_pilot[300];
 
   LTE_DL_FRAME_PARMS *frame_parms = &phy_vars_ue->frame_parms;
-  LTE_UE_DLSCH_t **dlsch_ue       = phy_vars_ue->dlsch[eNB_id];
+  LTE_UE_DLSCH_t **dlsch_ue       = phy_vars_ue->dlsch[(Ns>>1)&0x1][eNB_id];
   LTE_DL_UE_HARQ_t *dlsch0_harq; 
 
-  harq_pid    = dlsch_ue[0]->current_harq_pid; 
+  harq_pid    = dlsch_ue[0]->current_harq_pid;
   dlsch0_harq = dlsch_ue[0]->harq_processes[harq_pid];
 
   if (((frame_parms->Ncp == NORMAL) && (symbol>=7)) ||
diff --git a/openair1/PHY/LTE_TRANSPORT/dci.c b/openair1/PHY/LTE_TRANSPORT/dci.c
index 49fe90bf15c0f58668d8e12f010aa4b46a2fe842..af25e33ff74dbc70ea7d4d46a6cdfcbf41fad1a1 100644
--- a/openair1/PHY/LTE_TRANSPORT/dci.c
+++ b/openair1/PHY/LTE_TRANSPORT/dci.c
@@ -2893,15 +2893,15 @@ void dci_decoding_procedure0(LTE_UE_PDCCH **pdcch_vars,
           break;
 
         case 2:
-          *CCEmap|=(0x03<<(CCEind&0x1f));
+          *CCEmap|=(1<<(CCEind&0x1f));
           break;
 
         case 4:
-          *CCEmap|=(0x0f<<(CCEind&0x1f));
+          *CCEmap|=(1<<(CCEind&0x1f));
           break;
 
         case 8:
-          *CCEmap|=(0xff<<(CCEind&0x1f));
+          *CCEmap|=(1<<(CCEind&0x1f));
           break;
         }
 
@@ -3149,7 +3149,7 @@ uint16_t dci_decoding_procedure(PHY_VARS_UE *ue,
                             eNB_id,
                             frame_parms,
                             mi,
-                            SI_RNTI,
+                            ((ue->decode_SIB == 1) ? SI_RNTI : 0) ,
                             ra_rnti,
 			    P_RNTI,
                             2,
@@ -3177,7 +3177,7 @@ uint16_t dci_decoding_procedure(PHY_VARS_UE *ue,
                             eNB_id,
                             frame_parms,
                             mi,
-                            SI_RNTI,
+                            ((ue->decode_SIB == 1) ? SI_RNTI : 0),
                             ra_rnti,
 			    P_RNTI,
                             2,
@@ -3209,7 +3209,7 @@ uint16_t dci_decoding_procedure(PHY_VARS_UE *ue,
                             eNB_id,
                             frame_parms,
                             mi,
-                            SI_RNTI,
+                            ((ue->decode_SIB == 1) ? SI_RNTI : 0),
 			    P_RNTI,
                             ra_rnti,
                             3,
@@ -3237,7 +3237,7 @@ uint16_t dci_decoding_procedure(PHY_VARS_UE *ue,
                             eNB_id,
                             frame_parms,
                             mi,
-                            SI_RNTI,
+                            ((ue->decode_SIB == 1) ? SI_RNTI : 0),
                             ra_rnti,
 			    P_RNTI,
 			    3,
@@ -3271,7 +3271,7 @@ uint16_t dci_decoding_procedure(PHY_VARS_UE *ue,
                           eNB_id,
                           frame_parms,
                           mi,
-                          SI_RNTI,
+                          ((ue->decode_SIB == 1) ? SI_RNTI : 0),
                           ra_rnti,
 			  P_RNTI,
 			  0,
@@ -3300,7 +3300,7 @@ uint16_t dci_decoding_procedure(PHY_VARS_UE *ue,
                           eNB_id,
                           frame_parms,
                           mi,
-                          SI_RNTI,
+                          ((ue->decode_SIB == 1) ? SI_RNTI : 0),
                           ra_rnti,
 			  P_RNTI,
 			  1,
@@ -3333,7 +3333,7 @@ uint16_t dci_decoding_procedure(PHY_VARS_UE *ue,
                           eNB_id,
                           frame_parms,
                           mi,
-                          SI_RNTI,
+                          ((ue->decode_SIB == 1) ? SI_RNTI : 0),
                           ra_rnti,
 			  P_RNTI,
                           2,
@@ -3362,7 +3362,7 @@ uint16_t dci_decoding_procedure(PHY_VARS_UE *ue,
                           eNB_id,
                           frame_parms,
                           mi,
-                          SI_RNTI,
+                          ((ue->decode_SIB == 1) ? SI_RNTI : 0),
                           ra_rnti,
 			  P_RNTI,
                           3,
@@ -3395,7 +3395,7 @@ uint16_t dci_decoding_procedure(PHY_VARS_UE *ue,
                             eNB_id,
                             frame_parms,
                             mi,
-                            SI_RNTI,
+                            ((ue->decode_SIB == 1) ? SI_RNTI : 0),
                             ra_rnti,
 			    P_RNTI,
                             0,
@@ -3426,7 +3426,7 @@ uint16_t dci_decoding_procedure(PHY_VARS_UE *ue,
                             eNB_id,
                             frame_parms,
                             mi,
-                            SI_RNTI,
+                            ((ue->decode_SIB == 1) ? SI_RNTI : 0),
                             ra_rnti,
 			    P_RNTI,
                             1,
@@ -3458,7 +3458,7 @@ uint16_t dci_decoding_procedure(PHY_VARS_UE *ue,
                             eNB_id,
                             frame_parms,
                             mi,
-                            SI_RNTI,
+                            ((ue->decode_SIB == 1) ? SI_RNTI : 0),
                             ra_rnti,
 			    P_RNTI,
 			    2,
@@ -3490,7 +3490,7 @@ uint16_t dci_decoding_procedure(PHY_VARS_UE *ue,
                             eNB_id,
                             frame_parms,
                             mi,
-                            SI_RNTI,
+                            ((ue->decode_SIB == 1) ? SI_RNTI : 0),
                             ra_rnti,
 			    P_RNTI,
                             3,
@@ -3518,16 +3518,18 @@ uint16_t dci_decoding_procedure(PHY_VARS_UE *ue,
   } else if (tmode == 3) {
 
 
+    LOG_D(PHY," Now check UE_SPEC format 2A_2A search aggregation 1\n");
     // Now check UE_SPEC format 2A_2A search spaces at aggregation 1
+    old_dci_cnt=dci_cnt;
     dci_decoding_procedure0(pdcch_vars,0,mode,
                             subframe,
                             dci_alloc,
                             eNB_id,
                             frame_parms,
                             mi,
-                            SI_RNTI,
+                            ((ue->decode_SIB == 1) ? SI_RNTI : 0),
                             ra_rnti,
-			    P_RNTI,
+                            P_RNTI,
                             0,
                             format1A,
                             format1A,
@@ -3542,21 +3544,25 @@ uint16_t dci_decoding_procedure(PHY_VARS_UE *ue,
                             &CCEmap1,
                             &CCEmap2);
 
+    LOG_D(PHY," format 2A_2A search CCEmap0 %x, format0_found %d, format_c_found %d \n", CCEmap0, format0_found, format_c_found);
     if ((CCEmap0==0xffff)||
         ((format0_found==1)&&(format_c_found==1)))
       return(dci_cnt);
 
+    LOG_D(PHY," format 2A_2A search dci_cnt %d, old_dci_cn t%d \n", dci_cnt, old_dci_cnt);
     if (dci_cnt>old_dci_cnt)
       return(dci_cnt);
 
     // Now check UE_SPEC format 2 search spaces at aggregation 2
+    LOG_D(PHY," Now check UE_SPEC format 2A_2A search aggregation 2\n");
+    old_dci_cnt=dci_cnt;
     dci_decoding_procedure0(pdcch_vars,0,mode,
                             subframe,
                             dci_alloc,
                             eNB_id,
                             frame_parms,
                             mi,
-                            SI_RNTI,
+                            ((ue->decode_SIB == 1) ? SI_RNTI : 0),
                             ra_rnti,
 			    P_RNTI,
                             1,
@@ -3577,19 +3583,22 @@ uint16_t dci_decoding_procedure(PHY_VARS_UE *ue,
         ((format0_found==1)&&(format_c_found==1)))
       return(dci_cnt);
 
+    LOG_D(PHY," format 2A_2A search dci_cnt %d, old_dci_cn t%d \n", dci_cnt, old_dci_cnt);
     if (dci_cnt>old_dci_cnt)
       return(dci_cnt);
 
     // Now check UE_SPEC format 2_2A search spaces at aggregation 4
+    LOG_D(PHY," Now check UE_SPEC format 2_2A search spaces at aggregation 4 \n");
+    old_dci_cnt=dci_cnt;
     dci_decoding_procedure0(pdcch_vars,0,mode,
                             subframe,
                             dci_alloc,
                             eNB_id,
                             frame_parms,
                             mi,
-                            SI_RNTI,
+                            ((ue->decode_SIB == 1) ? SI_RNTI : 0),
                             ra_rnti,
-			    P_RNTI,
+                            P_RNTI,
                             2,
                             format1A,
                             format1A,
@@ -3608,18 +3617,21 @@ uint16_t dci_decoding_procedure(PHY_VARS_UE *ue,
         ((format0_found==1)&&(format_c_found==1)))
       return(dci_cnt);
 
+    LOG_D(PHY," format 2A_2A search dci_cnt %d, old_dci_cn t%d \n", dci_cnt, old_dci_cnt);
     if (dci_cnt>old_dci_cnt)
       return(dci_cnt);
 
     //#ifdef ALL_AGGREGATION
     // Now check UE_SPEC format 2_2A search spaces at aggregation 8
+    LOG_D(PHY," Now check UE_SPEC format 2_2A search spaces at aggregation 8 \n");
+    old_dci_cnt=dci_cnt;
     dci_decoding_procedure0(pdcch_vars,0,mode,
                             subframe,
                             dci_alloc,
                             eNB_id,
                             frame_parms,
                             mi,
-                            SI_RNTI,
+                            ((ue->decode_SIB == 1) ? SI_RNTI : 0),
                             ra_rnti,
 			    P_RNTI,
                             3,
@@ -3636,16 +3648,24 @@ uint16_t dci_decoding_procedure(PHY_VARS_UE *ue,
                             &CCEmap1,
                             &CCEmap2);
     //#endif
+    if ((CCEmap0==0xffff)||
+        ((format0_found==1)&&(format_c_found==1)))
+      return(dci_cnt);
+
+    LOG_D(PHY," format 2A_2A search dci_cnt %d, old_dci_cn t%d \n", dci_cnt, old_dci_cnt);
+    if (dci_cnt>old_dci_cnt)
+      return(dci_cnt);
   } else if (tmode == 4) {
 
     // Now check UE_SPEC format 2_2A search spaces at aggregation 1
+    old_dci_cnt=dci_cnt;
     dci_decoding_procedure0(pdcch_vars,0,mode,
                             subframe,
                             dci_alloc,
                             eNB_id,
                             frame_parms,
                             mi,
-                            SI_RNTI,
+                            ((ue->decode_SIB == 1) ? SI_RNTI : 0),
                             ra_rnti,
 			    P_RNTI,
                             0,
@@ -3670,13 +3690,14 @@ uint16_t dci_decoding_procedure(PHY_VARS_UE *ue,
       return(dci_cnt);
 
     // Now check UE_SPEC format 2 search spaces at aggregation 2
+    old_dci_cnt=dci_cnt;
     dci_decoding_procedure0(pdcch_vars,0,mode,
                             subframe,
                             dci_alloc,
                             eNB_id,
                             frame_parms,
                             mi,
-                            SI_RNTI,
+                            ((ue->decode_SIB == 1) ? SI_RNTI : 0),
                             ra_rnti,
 			    P_RNTI,
                             1,
@@ -3701,13 +3722,14 @@ uint16_t dci_decoding_procedure(PHY_VARS_UE *ue,
       return(dci_cnt);
 
     // Now check UE_SPEC format 2_2A search spaces at aggregation 4
+    old_dci_cnt=dci_cnt;
     dci_decoding_procedure0(pdcch_vars,0,mode,
                             subframe,
                             dci_alloc,
                             eNB_id,
                             frame_parms,
                             mi,
-                            SI_RNTI,
+                            ((ue->decode_SIB == 1) ? SI_RNTI : 0),
                             ra_rnti,
 			    P_RNTI,
                             2,
@@ -3733,13 +3755,14 @@ uint16_t dci_decoding_procedure(PHY_VARS_UE *ue,
 
     //#ifdef ALL_AGGREGATION
     // Now check UE_SPEC format 2_2A search spaces at aggregation 8
+    old_dci_cnt=dci_cnt;
     dci_decoding_procedure0(pdcch_vars,0,mode,
                             subframe,
                             dci_alloc,
                             eNB_id,
                             frame_parms,
                             mi,
-                            SI_RNTI,
+                            ((ue->decode_SIB == 1) ? SI_RNTI : 0),
                             ra_rnti,
 			    P_RNTI,
                             3,
@@ -3762,13 +3785,14 @@ uint16_t dci_decoding_procedure(PHY_VARS_UE *ue,
 #ifdef DEBUG_DCI_DECODING
     LOG_I(PHY," MU-MIMO check UE_SPEC format 1E_2A_M10PRB\n");
 #endif
+    old_dci_cnt=dci_cnt;
     dci_decoding_procedure0(pdcch_vars,0,mode,
                             subframe,
                             dci_alloc,
                             eNB_id,
                             frame_parms,
                             mi,
-                            SI_RNTI,
+                            ((ue->decode_SIB == 1) ? SI_RNTI : 0),
                             ra_rnti,
 			    P_RNTI,
                             0,
@@ -3794,13 +3818,14 @@ uint16_t dci_decoding_procedure(PHY_VARS_UE *ue,
       return(dci_cnt);
 
     // Now check UE_SPEC format 1E_2A_M10PRB search spaces aggregation 2
+    old_dci_cnt=dci_cnt;
     dci_decoding_procedure0(pdcch_vars,0,mode,
                             subframe,
                             dci_alloc,
                             eNB_id,
                             frame_parms,
                             mi,
-                            SI_RNTI,
+                            ((ue->decode_SIB == 1) ? SI_RNTI : 0),
                             ra_rnti,
 			    P_RNTI,
                             1,
@@ -3825,13 +3850,14 @@ uint16_t dci_decoding_procedure(PHY_VARS_UE *ue,
       return(dci_cnt);
 
     // Now check UE_SPEC format 1E_2A_M10PRB search spaces aggregation 4
+    old_dci_cnt=dci_cnt;
     dci_decoding_procedure0(pdcch_vars,0,mode,
                             subframe,
                             dci_alloc,
                             eNB_id,
                             frame_parms,
                             mi,
-                            SI_RNTI,
+                            ((ue->decode_SIB == 1) ? SI_RNTI : 0),
                             ra_rnti,
 			    P_RNTI,
                             2,
@@ -3858,13 +3884,14 @@ uint16_t dci_decoding_procedure(PHY_VARS_UE *ue,
     //#ifdef ALL_AGGREGATION
 
     // Now check UE_SPEC format 1E_2A_M10PRB search spaces at aggregation 8
+    old_dci_cnt=dci_cnt;
     dci_decoding_procedure0(pdcch_vars,0,mode,
                             subframe,
                             dci_alloc,
                             eNB_id,
                             frame_parms,
                             mi,
-                            SI_RNTI,
+                            ((ue->decode_SIB == 1) ? SI_RNTI : 0),
                             ra_rnti,
 			    P_RNTI,
                             3,
diff --git a/openair1/PHY/LTE_TRANSPORT/dci_tools.c b/openair1/PHY/LTE_TRANSPORT/dci_tools.c
index 9f4749cdc60a69f8bf3d7281a4a533891d49dcae..c4e379b45ab1b42c1db7b9622f67402be0f0b737 100644
--- a/openair1/PHY/LTE_TRANSPORT/dci_tools.c
+++ b/openair1/PHY/LTE_TRANSPORT/dci_tools.c
@@ -4763,22 +4763,26 @@ int check_dci_format1_1a_coherency(DCI_format_t dci_format,
         uint16_t si_rnti,
         uint16_t ra_rnti,
         uint16_t p_rnti,
+        uint32_t frame,
+        uint8_t  subframe,
         DCI_INFO_EXTRACTED_t *pdci_info_extarcted,
         LTE_DL_UE_HARQ_t *pdlsch0_harq)
 {
     uint8_t  harq_pid  = pdci_info_extarcted->harq_pid;
     uint32_t rballoc   = pdci_info_extarcted->rballoc;
     uint8_t  mcs1      = pdci_info_extarcted->mcs1;
-    uint8_t  rv1       = pdci_info_extarcted->rv1;
-    uint8_t  ndi1      = pdci_info_extarcted->ndi1;
     uint8_t  TPC       = pdci_info_extarcted->TPC;
     uint8_t  rah       = pdci_info_extarcted->rah;
+#ifdef DEBUG_DCI
+    uint8_t  rv1       = pdci_info_extarcted->rv1;
+    uint8_t  ndi1      = pdci_info_extarcted->ndi1;
+#endif
 
     uint8_t  NPRB    = 0;
     long long int RIV_max = 0;
 
 #ifdef DEBUG_DCI
-    LOG_I(PHY,"[DCI-FORMAT-1-1A] dci_format %d\n", dci_format);
+    LOG_I(PHY,"[DCI-FORMAT-1-1A] AbsSubframe %d.%d dci_format %d\n", frame, subframe, dci_format);
     LOG_I(PHY,"[DCI-FORMAT-1-1A] rnti       %x\n",  rnti);
     LOG_I(PHY,"[DCI-FORMAT-1-1A] harq_pid   %d\n", harq_pid);
     LOG_I(PHY,"[DCI-FORMAT-1-1A] rah        %d\n", rah);
@@ -4983,24 +4987,30 @@ int check_dci_format2_2a_coherency(DCI_format_t dci_format,
     uint8_t  rv2  = pdci_info_extarcted->rv2;
     uint8_t  harq_pid = pdci_info_extarcted->harq_pid;
     uint32_t rballoc  = pdci_info_extarcted->rballoc;
+
+#ifdef DEBUG_DCI
     uint8_t  ndi1     = pdci_info_extarcted->ndi1;
     uint8_t  ndi2     = pdci_info_extarcted->ndi2;
+#endif
 
     uint8_t  NPRB    = 0;
     long long RIV_max = 0;
 
+#ifdef DEBUG_DCI
     LOG_I(PHY, "extarcted dci - dci_format %d \n", dci_format);
+    LOG_I(PHY, "extarcted dci - rnti       %d \n", rnti);
     LOG_I(PHY, "extarcted dci - rah        %d \n", rah);
     LOG_I(PHY, "extarcted dci - mcs1       %d \n", mcs1);
     LOG_I(PHY, "extarcted dci - mcs2       %d \n", mcs2);
     LOG_I(PHY, "extarcted dci - rv1        %d \n", rv1);
     LOG_I(PHY, "extarcted dci - rv2        %d \n", rv2);
-    LOG_I(PHY, "extarcted dci - ndi1       %d \n", ndi1);
-    LOG_I(PHY, "extarcted dci - ndi2       %d \n", ndi2);
+    //LOG_I(PHY, "extarcted dci - ndi1       %d \n", ndi1);
+    //LOG_I(PHY, "extarcted dci - ndi2       %d \n", ndi2);
     LOG_I(PHY, "extarcted dci - rballoc    %x \n", rballoc);
-    LOG_I(PHY, "extarcted dci - harq pif   %d \n", harq_pid);
+    LOG_I(PHY, "extarcted dci - harq pid   %d \n", harq_pid);
     LOG_I(PHY, "extarcted dci - round0     %d \n", pdlsch0_harq->round);
     LOG_I(PHY, "extarcted dci - round1     %d \n", pdlsch1_harq->round);
+#endif
 
     // I- check dci content minimum coherency
     if(harq_pid >8)
@@ -5043,6 +5053,21 @@ int check_dci_format2_2a_coherency(DCI_format_t dci_format,
     }*/
 
 
+    if((pdlsch0_harq->round == 0) && (rv1 > 0))
+    {
+      // DCI false detection
+        LOG_I(PHY,"bad rv1\n");
+      return(0);
+    }
+
+    if((pdlsch1_harq->round == 0) && (rv2 > 0))
+    {
+      // DCI false detection
+        LOG_I(PHY,"bad rv2\n");
+      return(0);
+    }
+
+
     switch (N_RB_DL) {
     case 6:
         if (rah == 0)
@@ -5606,6 +5631,7 @@ void compute_precoding_info_format2A(uint8_t tpmi,
 void prepare_dl_decoding_format2_2A(DCI_format_t dci_format,
                                     DCI_INFO_EXTRACTED_t *pdci_info_extarcted,
                                     LTE_DL_FRAME_PARMS *frame_parms,
+                                    uint16_t rnti,
                                     uint8_t subframe,
                                     LTE_DL_UE_HARQ_t *dlsch0_harq,
                                     LTE_DL_UE_HARQ_t *dlsch1_harq,
@@ -5663,15 +5689,17 @@ void prepare_dl_decoding_format2_2A(DCI_format_t dci_format,
         dlsch1_harq->dl_power_off = 1;
 
         pdlsch0->current_harq_pid = harq_pid;
-        pdlsch0->harq_ack[subframe].harq_id = harq_pid;
+        pdlsch0->harq_ack[subframe].harq_id     = harq_pid;
         pdlsch1->current_harq_pid = harq_pid;
-        pdlsch1->harq_ack[subframe].harq_id = harq_pid;
+        pdlsch1->harq_ack[subframe].harq_id     = harq_pid;
 
         // assume two CW are active
         dlsch0_harq->status   = ACTIVE;
         dlsch1_harq->status   = ACTIVE;
         pdlsch0->active = 1;
         pdlsch1->active = 1;
+        pdlsch0->rnti = rnti;
+        pdlsch1->rnti = rnti;
 
 
       if (TB0_active && TB1_active && tbswap==1) {
@@ -5682,20 +5710,16 @@ void prepare_dl_decoding_format2_2A(DCI_format_t dci_format,
       if (TB0_active==0) {
         dlsch0_harq->status = SCH_IDLE;
         pdlsch0->active     = 0;
-#ifdef DEBUG_HARQ
+  #ifdef DEBUG_HARQ
         printf("[DCI UE]: TB0 is deactivated, retransmit TB1 transmit in TM6\n");
-#endif
+  #endif
       }
 
       if (TB1_active==0) {
         dlsch1_harq->status = SCH_IDLE;
         pdlsch1->active     = 0;
-#ifdef DEBUG_HARQ
-        printf("[DCI UE]: TB1 is deactivated, retransmit TB0 transmit in TM6\n");
-#endif
       }
 
-
 #ifdef DEBUG_HARQ
       printf("[DCI UE]: dlsch0_harq status %d , dlsch1_harq status %d\n", dlsch0_harq->status, dlsch1_harq->status);
 #endif
@@ -5727,6 +5751,9 @@ void prepare_dl_decoding_format2_2A(DCI_format_t dci_format,
           dlsch1_harq->rb_alloc_odd[3] = dlsch0_harq->rb_alloc_odd[3];
 
           dlsch1_harq->nb_rb = dlsch0_harq->nb_rb;
+
+          //dlsch0_harq->Nl       = 1;
+          //dlsch1_harq->Nl       = 1;
         }
       } else if ((TB0_active == 0) && (TB1_active == 1)){
 
@@ -5774,38 +5801,57 @@ void prepare_dl_decoding_format2_2A(DCI_format_t dci_format,
         if ((ndi1!=dlsch0_harq->DCINdi) || (dlsch0_harq->first_tx==1))  {
           dlsch0_harq->round = 0;
 
+          //LOG_I(PHY,"[UE] DLSCH: New Data Indicator CW0 subframe %d (pid %d, round %d)\n",
+          //           subframe,harq_pid,dlsch0_harq->round);
           if ( dlsch0_harq->first_tx==1) {
             LOG_D(PHY,"Format 2 DCI First TX0: Clearing flag\n");
             dlsch0_harq->first_tx = 0;
           }
         }else{
          if(dlsch0_harq->round == 0) {
+#if 0
             // skip pdsch decoding and report ack
             dlsch0_harq->status   = SCH_IDLE;
             pdlsch0->active       = 0;
             pdlsch0->harq_ack[subframe].ack = 1;
             pdlsch0->harq_ack[subframe].harq_id = harq_pid;
             pdlsch0->harq_ack[subframe].send_harq_status = 1;
+#endif
          }
         }
 
-          dlsch0_harq->TBS = TBStable[get_I_TBS(dlsch0_harq->mcs)][dlsch0_harq->nb_rb-1];
-          if(dlsch0_harq->Nl == 2)
-            dlsch0_harq->TBS = TBStable[get_I_TBS(dlsch0_harq->mcs)][(dlsch0_harq->nb_rb<<1)-1];
-          if (mcs1 <= 28)
+        // If Imcs is in [29..31], the TBS is assumed to be the one determined from the DCI transported in the latest
+        // PDCCH for the same transport block using Imcs in [0..28]
+        if(dlsch0_harq->mcs <= 28)
+        {
+            dlsch0_harq->TBS = TBStable[get_I_TBS(dlsch0_harq->mcs)][dlsch0_harq->nb_rb-1];
+            LOG_D(PHY,"[UE] DLSCH: New TBS CW0 subframe %d (pid %d, round %d) TBS %d \n",
+                       subframe,harq_pid,dlsch0_harq->round, dlsch0_harq->TBS);
+        }
+        else
+        {
+            LOG_D(PHY,"[UE] DLSCH: Keep the same TBS CW0 subframe %d (pid %d, round %d) TBS %d \n",
+                       subframe,harq_pid,dlsch0_harq->round, dlsch0_harq->TBS);
+        }
+        //if(dlsch0_harq->Nl == 2)
+        //dlsch0_harq->TBS = TBStable[get_I_TBS(dlsch0_harq->mcs)][(dlsch0_harq->nb_rb<<1)-1];
+        if (mcs1 <= 28)
             dlsch0_harq->Qm = get_Qm(mcs1);
-          else if (mcs1<=31)
+        else if (mcs1<=31)
             dlsch0_harq->Qm = (mcs1-28)<<1;
       }
 
       if (TB1_active) {
         if ((ndi2!=dlsch1_harq->DCINdi) || (dlsch1_harq->first_tx==1)) {
           dlsch1_harq->round = 0;
+          //LOG_I(PHY,"[UE] DLSCH: New Data Indicator CW1 subframe %d (pid %d, round %d)\n",
+          //           subframe,harq_pid,dlsch1_harq->round);
           if (dlsch1_harq->first_tx==1) {
             LOG_D(PHY,"Format 2 DCI First TX1: Clearing flag\n");
             dlsch1_harq->first_tx = 0;
           }
         }else{
+#if 0
          if(dlsch1_harq->round == 0) {
             // skip pdsch decoding and report ack
             dlsch1_harq->status   = SCH_IDLE;
@@ -5814,15 +5860,25 @@ void prepare_dl_decoding_format2_2A(DCI_format_t dci_format,
             pdlsch1->harq_ack[subframe].harq_id = harq_pid;
             pdlsch1->harq_ack[subframe].send_harq_status = 1;
          }
+#endif
         }
 
-          dlsch1_harq->TBS = TBStable[get_I_TBS(dlsch1_harq->mcs)][dlsch1_harq->nb_rb-1];
-          if(dlsch0_harq->Nl == 2)
-            dlsch0_harq->TBS = TBStable[get_I_TBS(dlsch0_harq->mcs)][(dlsch0_harq->nb_rb<<1)-1];
-
-          if (mcs2 <= 28)
+        // If Imcs is in [29..31], the TBS is assumed to be the one determined from the DCI transported in the latest
+        // PDCCH for the same transport block using Imcs in [0..28]
+        if(dlsch1_harq->mcs <= 28)
+        {
+            dlsch1_harq->TBS = TBStable[get_I_TBS(dlsch1_harq->mcs)][dlsch1_harq->nb_rb-1];
+            LOG_D(PHY,"[UE] DLSCH: New TBS CW1 subframe %d (pid %d, round %d) TBS %d \n",
+                       subframe,harq_pid,dlsch1_harq->round, dlsch1_harq->TBS);
+        }
+        else
+        {
+            LOG_D(PHY,"[UE] DLSCH: Keep the same TBS CW1 subframe %d (pid %d, round %d) TBS %d \n",
+                       subframe,harq_pid,dlsch1_harq->round, dlsch1_harq->TBS);
+        }
+        if (mcs2 <= 28)
             dlsch1_harq->Qm = get_Qm(mcs2);
-          else if (mcs1<=31)
+        else if (mcs1<=31)
             dlsch1_harq->Qm = (mcs2-28)<<1;
       }
 
@@ -5920,7 +5976,7 @@ int generate_ue_dlsch_params_from_dci(int frame,
                                               tc_rnti,
                                               si_rnti,
                                               ra_rnti,
-                                              p_rnti,
+                                              p_rnti,frame,subframe,
                                               &dci_info_extarcted,
                                               dlsch0_harq);
       if(status == 0)
@@ -6020,7 +6076,7 @@ int generate_ue_dlsch_params_from_dci(int frame,
                                               tc_rnti,
                                               si_rnti,
                                               ra_rnti,
-                                              p_rnti,
+                                              p_rnti,frame,subframe,
                                               &dci_info_extarcted,
                                               dlsch0_harq);
       if(status == 0)
@@ -6047,7 +6103,7 @@ int generate_ue_dlsch_params_from_dci(int frame,
     case format2:
     {
         // extract dci infomation
-        LOG_I(PHY,"[DCI-format2] extract dci infomation \n");
+        //LOG_I(PHY,"[DCI-format2] AbsSubframe %d.%d extract dci infomation \n", frame, subframe);
         extract_dci2_info(frame_parms->N_RB_DL,
                 frame_type,
                 frame_parms->nb_antenna_ports_eNB,
@@ -6069,7 +6125,7 @@ int generate_ue_dlsch_params_from_dci(int frame,
         dlsch0_harq = dlsch0->harq_processes[harq_pid];
         dlsch1_harq = dlsch1->harq_processes[harq_pid];
 
-        LOG_I(PHY,"[DCI-format2] check dci content \n");
+        //LOG_I(PHY,"[DCI-format2] check dci content \n");
         status = check_dci_format2_2a_coherency(format2,
                 frame_parms->N_RB_DL,
                 &dci_info_extarcted,
@@ -6083,10 +6139,11 @@ int generate_ue_dlsch_params_from_dci(int frame,
             return(-1);
 
         // dci is correct ==> update internal structure and prepare dl decoding
-        LOG_I(PHY,"[DCI-format2] update internal structure and prepare dl decoding \n");
+        //LOG_I(PHY,"[DCI-format2] update internal structure and prepare dl decoding \n");
         prepare_dl_decoding_format2_2A(format2,
                 &dci_info_extarcted,
                 frame_parms,
+                rnti,
                 subframe,
                 dlsch0_harq,
                 dlsch1_harq,
@@ -6099,7 +6156,7 @@ int generate_ue_dlsch_params_from_dci(int frame,
     case format2A:
     {
     // extract dci infomation
-    LOG_I(PHY,"[DCI-format2A] extract dci infomation \n");
+    //LOG_I(PHY,"[DCI-format2A] AbsSubframe %d.%d extract dci infomation \n", frame%1024, subframe);
     extract_dci2A_info(frame_parms->N_RB_DL,
                        frame_type,
                        frame_parms->nb_antenna_ports_eNB,
@@ -6107,10 +6164,10 @@ int generate_ue_dlsch_params_from_dci(int frame,
                        &dci_info_extarcted);
 
     // check dci content
-    LOG_I(PHY,"[DCI-format2A] check dci content \n");
-    LOG_I(PHY,"[DCI-format2A] tb_swap %d harq_pid %d\n", dci_info_extarcted.tb_swap, dci_info_extarcted.harq_pid);
-      dlsch[0]->active = 0;
-      dlsch[1]->active = 0;
+    //LOG_I(PHY,"[DCI-format2A] check dci content \n");
+    //LOG_I(PHY,"[DCI-format2A] tb_swap %d harq_pid %d\n", dci_info_extarcted.tb_swap, dci_info_extarcted.harq_pid);
+      //dlsch[0]->active = 0;
+      //dlsch[1]->active = 0;
 
     if (dci_info_extarcted.tb_swap == 0) {
       dlsch0 = dlsch[0];
@@ -6122,7 +6179,7 @@ int generate_ue_dlsch_params_from_dci(int frame,
     dlsch0_harq = dlsch0->harq_processes[dci_info_extarcted.harq_pid];
     dlsch1_harq = dlsch1->harq_processes[dci_info_extarcted.harq_pid];
 
-    LOG_I(PHY,"[DCI-format2A] check dci content \n");
+    //LOG_I(PHY,"[DCI-format2A] check dci content \n");
     status = check_dci_format2_2a_coherency(format2A,
                                               frame_parms->N_RB_DL,
                                               &dci_info_extarcted,
@@ -6136,10 +6193,11 @@ int generate_ue_dlsch_params_from_dci(int frame,
       return(-1);
 
     // dci is correct ==> update internal structure and prepare dl decoding
-    LOG_I(PHY,"[DCI-format2A] update internal structure and prepare dl decoding \n");
+    //LOG_I(PHY,"[DCI-format2A] update internal structure and prepare dl decoding \n");
     prepare_dl_decoding_format2_2A(format2A,
                                    &dci_info_extarcted,
                                    frame_parms,
+                                   rnti,
                                    subframe,
                                    dlsch0_harq,
                                    dlsch1_harq,
@@ -7142,7 +7200,7 @@ int generate_ue_ulsch_params_from_dci(void *dci_pdu,
   uint8_t transmission_mode = ue->transmission_mode[eNB_id];
   ANFBmode_t AckNackFBMode;
   LTE_UE_ULSCH_t *ulsch = ue->ulsch[eNB_id];
-  LTE_UE_DLSCH_t **dlsch = ue->dlsch[0];
+  LTE_UE_DLSCH_t **dlsch = ue->dlsch[subframe&0x1][0];
   PHY_MEASUREMENTS *meas = &ue->measurements;
   LTE_DL_FRAME_PARMS *frame_parms = &ue->frame_parms;
   //  uint32_t current_dlsch_cqi = ue->current_dlsch_cqi[eNB_id];
@@ -7947,7 +8005,7 @@ int generate_ue_ulsch_params_from_dci(void *dci_pdu,
     if (frame_parms->frame_type == FDD) {
       int dl_subframe = (subframe<4) ? (subframe+6) : (subframe-4);
 
-      if (ue->dlsch[eNB_id][0]->harq_ack[dl_subframe].send_harq_status>0) { // we have downlink transmission
+      if (ue->dlsch[dl_subframe&0x1][eNB_id][0]->harq_ack[dl_subframe].send_harq_status>0) { // we have downlink transmission
         ulsch->harq_processes[harq_pid]->O_ACK = 1;
       } else {
         ulsch->harq_processes[harq_pid]->O_ACK = 0;
diff --git a/openair1/PHY/LTE_TRANSPORT/defs.h b/openair1/PHY/LTE_TRANSPORT/defs.h
index 3b13c01c52ccf0b2b02e69772aa38ec61487d3e5..f6293bf1b2d73689d6769a1270bfe0ad53fc418b 100755
--- a/openair1/PHY/LTE_TRANSPORT/defs.h
+++ b/openair1/PHY/LTE_TRANSPORT/defs.h
@@ -708,7 +708,7 @@ typedef struct {
   int16_t sqrt_rho_a;
   /// amplitude of PDSCH (compared to RS) in symbols containing pilots
   int16_t sqrt_rho_b;
-  /// Current HARQ process id
+  /// Current HARQ process id (DLSCH structures are kept per RX thread: odd and even subframes)
   uint8_t current_harq_pid;
   /// Current subband antenna selection
   uint32_t antenna_alloc;
diff --git a/openair1/PHY/LTE_TRANSPORT/dlsch_decoding.c b/openair1/PHY/LTE_TRANSPORT/dlsch_decoding.c
index 04794e3ffcba4fcf6540f31cffe594416774b23f..d683decc9f57230760206b91795621eaa585de8e 100644
--- a/openair1/PHY/LTE_TRANSPORT/dlsch_decoding.c
+++ b/openair1/PHY/LTE_TRANSPORT/dlsch_decoding.c
@@ -38,6 +38,7 @@
 #include "SIMULATION/TOOLS/defs.h"
 //#define DEBUG_DLSCH_DECODING
 
+extern double cpuf;
 
 void free_ue_dlsch(LTE_UE_DLSCH_t *dlsch)
 {
@@ -270,7 +271,7 @@ uint32_t  dlsch_decoding(PHY_VARS_UE *phy_vars_ue,
     return(max_turbo_iterations);
     }*/
 
-  /*harq_pid = dlsch->current_harq_pid;
+  /*harq_pid = dlsch->current_harq_pid[subframe&0x1];
   if (harq_pid >= 8) {
     printf("dlsch_decoding.c: Illegal harq_pid %d\n",harq_pid);
     return(max_turbo_iterations);
@@ -341,6 +342,8 @@ uint32_t  dlsch_decoding(PHY_VARS_UE *phy_vars_ue,
   printf("Segmentation: C %d, Cminus %d, Kminus %d, Kplus %d\n",harq_process->C,harq_process->Cminus,harq_process->Kminus,harq_process->Kplus);
 #endif
 
+  opp_enabled=1;
+
   for (r=0; r<harq_process->C; r++) {
 
 
@@ -376,7 +379,7 @@ uint32_t  dlsch_decoding(PHY_VARS_UE *phy_vars_ue,
                                             (r==0) ? harq_process->F : 0);
 
 #ifdef DEBUG_DLSCH_DECODING
-    LOG_I(PHY,"HARQ_PID %d Rate Matching Segment %d (coded bits %d,unpunctured/repeated bits %d, TBS %d, mod_order %d, nb_rb %d, Nl %d, rv %d, round %d)...\n",
+    LOG_D(PHY,"HARQ_PID %d Rate Matching Segment %d (coded bits %d,unpunctured/repeated bits %d, TBS %d, mod_order %d, nb_rb %d, Nl %d, rv %d, round %d)...\n",
           harq_pid,r, G,
           Kr*3,
           harq_process->TBS,
@@ -459,6 +462,10 @@ uint32_t  dlsch_decoding(PHY_VARS_UE *phy_vars_ue,
 #if 1
     if (err_flag == 0) {
 
+        LOG_D(PHY, "turbo algo Kr=%d cb_cnt=%d C=%d nbRB=%d TBSInput=%d TBSHarq=%d TBSplus24=%d mcs=%d Qm=%d RIV=%d round=%d maxIter %d\n",
+                            Kr,r,harq_process->C,harq_process->nb_rb,A,harq_process->TBS,
+                            harq_process->B,harq_process->mcs,harq_process->Qm,harq_process->rvidx,harq_process->round,dlsch->max_turbo_iterations);
+
     	if (llr8_flag) {
     		AssertFatal (Kr >= 256, "turbo algo issue Kr=%d cb_cnt=%d C=%d nbRB=%d TBSInput=%d TBSHarq=%d TBSplus24=%d mcs=%d Qm=%d RIV=%d round=%d\n",
     				Kr,r,harq_process->C,harq_process->nb_rb,A,harq_process->TBS,harq_process->B,harq_process->mcs,harq_process->Qm,harq_process->rvidx,harq_process->round);
@@ -605,6 +612,13 @@ uint32_t  dlsch_decoding(PHY_VARS_UE *phy_vars_ue,
              &phy_vars_ue->dlsch_tc_intl2_stats); //(is_crnti==0)?harq_pid:harq_pid+1);
 	  stop_meas(dlsch_turbo_decoding_stats);
 
+	  /*printf("Segmentation: C %d r %d, dlsch_rate_unmatching_stats %5.3f dlsch_deinterleaving_stats %5.3f  dlsch_turbo_decoding_stats %5.3f \n",
+              harq_process->C,
+              r,
+              dlsch_rate_unmatching_stats->p_time/(cpuf*1000.0),
+              dlsch_deinterleaving_stats->p_time/(cpuf*1000.0),
+              dlsch_turbo_decoding_stats->p_time/(cpuf*1000.0));*/
+
 	}
       }
     }
@@ -636,25 +650,28 @@ uint32_t  dlsch_decoding(PHY_VARS_UE *phy_vars_ue,
     harq_process->round++;
 
 
-    if(is_crnti)
-    {
-    LOG_D(PHY,"[UE %d] DLSCH: Setting NACK for subframe %d (pid %d, round %d, TBS %d)\n",phy_vars_ue->Mod_id,subframe,harq_pid,harq_process->round,harq_process->TBS);
-    }
     //    printf("Rate: [UE %d] DLSCH: Setting NACK for subframe %d (pid %d, round %d)\n",phy_vars_ue->Mod_id,subframe,harq_pid,harq_process->round);
     if (harq_process->round >= dlsch->Mdlharq) {
       harq_process->status = SCH_IDLE;
+      harq_process->round  = 0;
+    }
+    if(is_crnti)
+    {
+    LOG_D(PHY,"[UE %d] DLSCH: Setting NACK for subframe %d (pid %d, pid status %d, round %d/Max %d, TBS %d)\n",
+               phy_vars_ue->Mod_id,subframe,harq_pid,harq_process->status,harq_process->round,dlsch->Mdlharq,harq_process->TBS);
     }
 
     return((1+dlsch->max_turbo_iterations));
   } else {
-    LOG_D(PHY,"[UE %d] DLSCH: Setting ACK for SFN/SF %d/%d (pid %d, round %d, subframe %d)\n",
-        phy_vars_ue->Mod_id, frame_rx_prev, subframe_rx_prev, harq_pid, harq_process->round, subframe);
 
     harq_process->status = SCH_IDLE;
     harq_process->round  = 0;
     dlsch->harq_ack[subframe].ack = 1;
     dlsch->harq_ack[subframe].harq_id = harq_pid;
     dlsch->harq_ack[subframe].send_harq_status = 1;
+    LOG_D(PHY,"[UE %d] DLSCH: Setting ACK for SFN/SF %d/%d (pid %d, pid status %d, round %d, subframe %d)\n",
+        phy_vars_ue->Mod_id, frame_rx_prev, subframe_rx_prev, harq_pid, harq_process->status, harq_process->round, subframe);
+
     if(is_crnti)
     {
     LOG_D(PHY,"[UE %d] DLSCH: Setting ACK for subframe %d (pid %d, round %d, TBS %d)\n",phy_vars_ue->Mod_id,subframe,harq_pid,harq_process->round,harq_process->TBS);
@@ -908,7 +925,7 @@ uint32_t dlsch_decoding_emul(PHY_VARS_UE *phy_vars_ue,
     break;
 
   case PDSCH: // TB0
-    dlsch_ue  = phy_vars_ue->dlsch[eNB_id][0];
+    dlsch_ue  = phy_vars_ue->dlsch[subframe&0x1][eNB_id][0];
     harq_pid = dlsch_ue->current_harq_pid;
     ue_id= (uint32_t)find_ue((int16_t)phy_vars_ue->pdcch_vars[(uint32_t)eNB_id]->crnti,PHY_vars_eNB_g[eNB_id2][CC_id]);
     DevAssert( ue_id != (uint32_t)-1 );
@@ -954,7 +971,7 @@ uint32_t dlsch_decoding_emul(PHY_VARS_UE *phy_vars_ue,
     break;
 
   case PDSCH1: { // TB1
-    dlsch_ue = phy_vars_ue->dlsch[eNB_id][1];
+    dlsch_ue = phy_vars_ue->dlsch[subframe&0x1][eNB_id][1];
     harq_pid = dlsch_ue->current_harq_pid;
     int8_t UE_id = find_ue( phy_vars_ue->pdcch_vars[eNB_id]->crnti, PHY_vars_eNB_g[eNB_id2][CC_id] );
     DevAssert( UE_id != -1 );
@@ -1008,7 +1025,7 @@ uint32_t dlsch_decoding_emul(PHY_VARS_UE *phy_vars_ue,
     break;
 
   default:
-    dlsch_ue = phy_vars_ue->dlsch[eNB_id][0];
+    dlsch_ue = phy_vars_ue->dlsch[subframe&0x1][eNB_id][0];
     LOG_E(PHY,"dlsch_decoding_emul: FATAL, unknown DLSCH_id %d\n",dlsch_id);
     dlsch_ue->last_iteration_cnt = 1+dlsch_ue->max_turbo_iterations;
     return(1+dlsch_ue->max_turbo_iterations);
diff --git a/openair1/PHY/LTE_TRANSPORT/dlsch_demodulation.c b/openair1/PHY/LTE_TRANSPORT/dlsch_demodulation.c
index 601eacc6205143468931e31b6526a58dfba962c9..72b486c04202891f9d600fad11a185d918dfcb43 100644
--- a/openair1/PHY/LTE_TRANSPORT/dlsch_demodulation.c
+++ b/openair1/PHY/LTE_TRANSPORT/dlsch_demodulation.c
@@ -42,6 +42,7 @@
 #define NOCYGWIN_STATIC
 #endif
 
+extern int16_t dlsch_demod_shift;
 //#define DEBUG_HARQ
 
 //#undef LOG_D
@@ -138,16 +139,18 @@ int rx_pdsch(PHY_VARS_UE *ue,
 
   case PDSCH:
     pdsch_vars = &ue->pdsch_vars[subframe&0x1][eNB_id];
-    dlsch = ue->dlsch[eNB_id];
+    dlsch = ue->dlsch[subframe&0x1][eNB_id];
+    LOG_D(PHY,"AbsSubframe %d.%d / Sym %d harq_pid %d,  harq status %d.%d \n",
+                   frame,subframe,symbol,harq_pid,
+                   dlsch[0]->harq_processes[harq_pid]->status,
+                   dlsch[1]->harq_processes[harq_pid]->status);
+
     if ((dlsch[0]->harq_processes[harq_pid]->status == ACTIVE) &&
         (dlsch[1]->harq_processes[harq_pid]->status == ACTIVE)){
       codeword_TB0 = dlsch[0]->harq_processes[harq_pid]->codeword;
       codeword_TB1 = dlsch[1]->harq_processes[harq_pid]->codeword;
       dlsch0_harq = dlsch[codeword_TB0]->harq_processes[harq_pid];
       dlsch1_harq = dlsch[codeword_TB1]->harq_processes[harq_pid];
-#ifdef DEBUG_HARQ
-      printf("I am assuming both CW active\n");
-#endif
     }
      else if ((dlsch[0]->harq_processes[harq_pid]->status == ACTIVE) &&
               (dlsch[1]->harq_processes[harq_pid]->status != ACTIVE) ) {
@@ -159,8 +162,8 @@ int rx_pdsch(PHY_VARS_UE *ue,
      else if ((dlsch[0]->harq_processes[harq_pid]->status != ACTIVE) &&
               (dlsch[1]->harq_processes[harq_pid]->status == ACTIVE) ){
       codeword_TB1 = dlsch[1]->harq_processes[harq_pid]->codeword;
-      dlsch0_harq = dlsch[1]->harq_processes[harq_pid];
-      dlsch1_harq = NULL;
+      dlsch0_harq  = dlsch[1]->harq_processes[harq_pid];
+      dlsch1_harq  = NULL;
       codeword_TB0 = -1;
     }
     else {
@@ -336,7 +339,7 @@ int rx_pdsch(PHY_VARS_UE *ue,
     LOG_W(PHY,"dlsch_demodulation: beamforming mode not supported yet.\n");
   }
 
-  //  printf("nb_rb = %d, eNB_id %d\n",nb_rb,eNB_id);
+  //printf("nb_rb = %d, eNB_id %d\n",nb_rb,eNB_id);
   if (nb_rb==0) {
     LOG_D(PHY,"dlsch_demodulation.c: nb_rb=0\n");
     return(-1);
@@ -357,12 +360,18 @@ int rx_pdsch(PHY_VARS_UE *ue,
                       symbol,
                       nb_rb);
 
-  if ((dlsch0_harq->mimo_mode<DUALSTREAM_UNIFORM_PRECODING1) && (rx_type==rx_IC_single_stream) && (eNB_id_i==ue->n_connected_eNB) && (dlsch0_harq->dl_power_off==0))  // TM5 two-user
+  if ((dlsch0_harq->mimo_mode<DUALSTREAM_UNIFORM_PRECODING1) &&
+      (rx_type==rx_IC_single_stream) &&
+      (eNB_id_i==ue->n_connected_eNB) &&
+      (dlsch0_harq->dl_power_off==0)
+     )  // TM5 two-user
+  {
     dlsch_scale_channel(pdsch_vars[eNB_id_i]->dl_ch_estimates_ext,
                         frame_parms,
                         dlsch,
                         symbol,
                         nb_rb);
+  }
 
   if (first_symbol_flag==1) {
     if (beamforming_mode==0){
@@ -392,21 +401,23 @@ int rx_pdsch(PHY_VARS_UE *ue,
                                  nb_rb,
                                  dlsch0_harq->mimo_mode);
 
+      LOG_D(PHY,"Channel Level TM34  avg_0 %d, avg_1 %d, rx_type %d, rx_standard %d, interf_unaw_shift %d \n", avg_0[0],
+              avg_1[0], rx_type, rx_standard, interf_unaw_shift);
         if (rx_type>rx_standard) {
-          avg_0[0] = (log2_approx(avg_0[0])/2) -13 + interf_unaw_shift;
-          avg_1[0] = (log2_approx(avg_1[0])/2) -13 + interf_unaw_shift;
+          avg_0[0] = (log2_approx(avg_0[0])/2) + dlsch_demod_shift;// + 2 ;//+ 4;
+          avg_1[0] = (log2_approx(avg_1[0])/2) + dlsch_demod_shift;// + 2 ;//+ 4;
           pdsch_vars[eNB_id]->log2_maxh0 = cmax(avg_0[0],0);
           pdsch_vars[eNB_id]->log2_maxh1 = cmax(avg_1[0],0);
-          //printf("TM4 I-A log2_maxh0 = %d\n", lte_ue_pdsch_vars[eNB_id]->log2_maxh0);
-          //printf("TM4 I-A log2_maxh1 = %d\n", lte_ue_pdsch_vars[eNB_id]->log2_maxh1);
+          //printf("TM4 I-A log2_maxh0 = %d\n", pdsch_vars[eNB_id]->log2_maxh0);
+          //printf("TM4 I-A log2_maxh1 = %d\n", pdsch_vars[eNB_id]->log2_maxh1);
          }
           else {
           avg_0[0] = (log2_approx(avg_0[0])/2) - 13 + interf_unaw_shift;
           avg_1[0] = (log2_approx(avg_1[0])/2) - 13 + interf_unaw_shift;
           pdsch_vars[eNB_id]->log2_maxh0 = cmax(avg_0[0],0);
           pdsch_vars[eNB_id]->log2_maxh1 = cmax(avg_1[0],0);
-          //printf("TM4 I-UA log2_maxh0 = %d\n", lte_ue_pdsch_vars[eNB_id]->log2_maxh0);
-          //printf("TM4 I-UA log2_maxh1 = %d\n", lte_ue_pdsch_vars[eNB_id]->log2_maxh1);
+          //printf("TM4 I-UA log2_maxh0 = %d\n", pdsch_vars[eNB_id]->log2_maxh0);
+          //printf("TM4 I-UA log2_maxh1 = %d\n", pdsch_vars[eNB_id]->log2_maxh1);
         }
       }
       else if (dlsch0_harq->mimo_mode<DUALSTREAM_UNIFORM_PRECODING1) {// single-layer precoding (TM5, TM6)
@@ -448,8 +459,11 @@ int rx_pdsch(PHY_VARS_UE *ue,
                               symbol,
                               nb_rb);
 #ifdef DEBUG_PHY
-    LOG_D(PHY,"[DLSCH] log2_maxh = %d (%d,%d)\n",pdsch_vars[eNB_id]->log2_maxh,avg[0],avgs);
-    LOG_D(PHY,"[DLSCH] mimo_mode = %d\n", dlsch0_harq->mimo_mode);
+    LOG_I(PHY,"[DLSCH] log2_maxh = %d [log2_maxh0 %d log2_maxh1 %d] (%d,%d)\n",pdsch_vars[eNB_id]->log2_maxh,
+                                                 pdsch_vars[eNB_id]->log2_maxh0,
+                                                 pdsch_vars[eNB_id]->log2_maxh1,
+                                                 avg[0],avgs);
+    LOG_I(PHY,"[DLSCH] mimo_mode = %d\n", dlsch0_harq->mimo_mode);
 #endif
   }
 
@@ -1023,6 +1037,44 @@ int rx_pdsch(PHY_VARS_UE *ue,
   }
   }
 
+// Please keep it: useful for debugging
+#if 0
+  if( (symbol == 13) && (dlsch0_harq->mimo_mode == 2) )
+  {
+      LOG_E(PHY,"Dump Phy Chan Est \n");
+      if(subframe&0x1)
+      {
+#if 1
+      //write_output("rxdataF0.m"    , "rxdataF0",             &common_vars->common_vars_rx_data_per_thread[subframe&0x1].rxdataF[0][0],14*frame_parms->ofdm_symbol_size,1,1);
+      //write_output("rxdataF1.m"    , "rxdataF1",             &common_vars->common_vars_rx_data_per_thread[subframe&0x1].rxdataF[0][0],14*frame_parms->ofdm_symbol_size,1,1);
+      //write_output("dl_ch_estimates00.m", "dl_ch_estimates00",   &common_vars->common_vars_rx_data_per_thread[subframe&0x1].dl_ch_estimates[eNB_id][0][0],14*frame_parms->ofdm_symbol_size,1,1);
+      //write_output("dl_ch_estimates01.m", "dl_ch_estimates01",   &common_vars->common_vars_rx_data_per_thread[subframe&0x1].dl_ch_estimates[eNB_id][1][0],14*frame_parms->ofdm_symbol_size,1,1);
+      //write_output("dl_ch_estimates10.m", "dl_ch_estimates10",   &common_vars->common_vars_rx_data_per_thread[subframe&0x1].dl_ch_estimates[eNB_id][2][0],14*frame_parms->ofdm_symbol_size,1,1);
+      //write_output("dl_ch_estimates11.m", "dl_ch_estimates11",   &common_vars->common_vars_rx_data_per_thread[subframe&0x1].dl_ch_estimates[eNB_id][3][0],14*frame_parms->ofdm_symbol_size,1,1);
+
+
+      //write_output("rxdataF_ext00.m"    , "rxdataF_ext00",       &pdsch_vars[eNB_id]->rxdataF_ext[0][0],14*frame_parms->N_RB_DL*12,1,1);
+      //write_output("rxdataF_ext01.m"    , "rxdataF_ext01",       &pdsch_vars[eNB_id]->rxdataF_ext[1][0],14*frame_parms->N_RB_DL*12,1,1);
+      //write_output("rxdataF_ext10.m"    , "rxdataF_ext10",       &pdsch_vars[eNB_id]->rxdataF_ext[2][0],14*frame_parms->N_RB_DL*12,1,1);
+      //write_output("rxdataF_ext11.m"    , "rxdataF_ext11",       &pdsch_vars[eNB_id]->rxdataF_ext[3][0],14*frame_parms->N_RB_DL*12,1,1);
+      write_output("dl_ch_estimates_ext00.m", "dl_ch_estimates_ext00", &pdsch_vars[eNB_id]->dl_ch_estimates_ext[0][0],14*frame_parms->N_RB_DL*12,1,1);
+      write_output("dl_ch_estimates_ext01.m", "dl_ch_estimates_ext01", &pdsch_vars[eNB_id]->dl_ch_estimates_ext[1][0],14*frame_parms->N_RB_DL*12,1,1);
+      write_output("dl_ch_estimates_ext10.m", "dl_ch_estimates_ext10", &pdsch_vars[eNB_id]->dl_ch_estimates_ext[2][0],14*frame_parms->N_RB_DL*12,1,1);
+      write_output("dl_ch_estimates_ext11.m", "dl_ch_estimates_ext11", &pdsch_vars[eNB_id]->dl_ch_estimates_ext[3][0],14*frame_parms->N_RB_DL*12,1,1);
+      write_output("rxdataF_comp00.m","rxdataF_comp00",              &pdsch_vars[eNB_id]->rxdataF_comp0[0][0],14*frame_parms->N_RB_DL*12,1,1);
+      write_output("rxdataF_comp01.m","rxdataF_comp01",              &pdsch_vars[eNB_id]->rxdataF_comp0[1][0],14*frame_parms->N_RB_DL*12,1,1);
+      write_output("rxdataF_comp10.m","rxdataF_comp10",              &pdsch_vars[eNB_id]->rxdataF_comp1[harq_pid][round][0][0],14*frame_parms->N_RB_DL*12,1,1);
+      write_output("rxdataF_comp11.m","rxdataF_comp11",              &pdsch_vars[eNB_id]->rxdataF_comp1[harq_pid][round][1][0],14*frame_parms->N_RB_DL*12,1,1);
+#endif
+      write_output("llr0.m","llr0",  &pdsch_vars[eNB_id]->llr[0][0],(14*nb_rb*12*dlsch1_harq->Qm) - 4*(nb_rb*4*dlsch1_harq->Qm),1,0);
+      write_output("llr1.m","llr1",  &pdsch_vars[eNB_id]->llr[1][0],(14*nb_rb*12*dlsch1_harq->Qm) - 4*(nb_rb*4*dlsch1_harq->Qm),1,0);
+
+
+      AssertFatal(0," ");
+      }
+
+  }
+#endif
 
 #if T_TRACER
   T(T_UE_PHY_PDSCH_IQ, T_INT(eNB_id), T_INT(ue->Mod_id), T_INT(frame%1024),
@@ -1609,9 +1661,7 @@ void prec2A_TM3_128(__m128i *ch0,__m128i *ch1) {
 
   __m128i tmp0,tmp1;
 
-  // sqrt(2) is already taken into account in computation sqrt_rho_a, sqrt_rho_b,
-  //so divide by 2 is replaced by divide by sqrt(2).
-
+  //_mm_mulhi_epi16
   //  print_shorts("prec2A_TM3 ch0 (before):",ch0);
   //  print_shorts("prec2A_TM3 ch1 (before):",ch1);
 
@@ -1622,17 +1672,22 @@ void prec2A_TM3_128(__m128i *ch0,__m128i *ch1) {
   ch0[0] = _mm_adds_epi16(ch0[0],tmp1);
   ch1[0] = _mm_subs_epi16(tmp0,tmp1);
 
-
-  //  print_shorts("prec2A_TM3 ch0 (mid):",&tmp0);
-  //  print_shorts("prec2A_TM3 ch1 (mid):",ch1);
-
   ch0[0] = _mm_mulhi_epi16(ch0[0],amp);
   ch0[0] = _mm_slli_epi16(ch0[0],1);
+
   ch1[0] = _mm_mulhi_epi16(ch1[0],amp);
   ch1[0] = _mm_slli_epi16(ch1[0],1);
 
-  // ch0[0] = _mm_srai_epi16(ch0[0],1);
-  // ch1[0] = _mm_srai_epi16(ch1[0],1);
+  //  print_shorts("prec2A_TM3 ch0 (mid):",&tmp0);
+  //  print_shorts("prec2A_TM3 ch1 (mid):",ch1);
+
+  //ch0[0] = _mm_mulhi_epi16(ch0[0],amp);
+  //ch0[0] = _mm_slli_epi16(ch0[0],1);
+  //ch1[0] = _mm_mulhi_epi16(ch1[0],amp);
+  //ch1[0] = _mm_slli_epi16(ch1[0],1);
+
+  //ch0[0] = _mm_srai_epi16(ch0[0],1);
+  //ch1[0] = _mm_srai_epi16(ch1[0],1);
 
   //  print_shorts("prec2A_TM3 ch0 (after):",ch0);
   //  print_shorts("prec2A_TM3 ch1 (after):",ch1);
diff --git a/openair1/PHY/LTE_TRANSPORT/dlsch_llr_computation.c b/openair1/PHY/LTE_TRANSPORT/dlsch_llr_computation.c
index 0b73e2b02390a060e3774838e42fa4ab7cfa1533..b08836762cdbbf4111cc9f32b1460c891718d752 100644
--- a/openair1/PHY/LTE_TRANSPORT/dlsch_llr_computation.c
+++ b/openair1/PHY/LTE_TRANSPORT/dlsch_llr_computation.c
@@ -8831,6 +8831,37 @@ int dlsch_64qam_64qam_llr(LTE_DL_FRAME_PARMS *frame_parms,
     len = (nb_rb*12) - pbch_pss_sss_adjust;
   }
 
+#ifdef __AVX2__
+
+  // AVX2 path: work on scratch copies whose length is padded to a multiple of 16 32-bit words (always > len)
+  uint32_t len256i = ((len+16)>>4)*16;
+  int32_t *rxF_256i      = (int32_t*) malloc16_clear(len256i*4);
+  int32_t *rxF_i_256i    = (int32_t*) malloc16_clear(len256i*4);
+  int32_t *ch_mag_256i   = (int32_t*) malloc16_clear(len256i*4);
+  int32_t *ch_mag_i_256i = (int32_t*) malloc16_clear(len256i*4);
+  int32_t *rho_256i      = (int32_t*) malloc16_clear(len256i*4);
+  // NOTE(review): allocations are not NULL-checked; only the first len words are copied, the padding stays as allocated (presumably zeroed by malloc16_clear - confirm)
+  memcpy(rxF_256i, rxF, len*4);
+  memcpy(rxF_i_256i, rxF_i, len*4);
+  memcpy(ch_mag_256i, ch_mag, len*4);
+  memcpy(ch_mag_i_256i, ch_mag_i, len*4);
+  memcpy(rho_256i, rho, len*4);
+  // the kernel receives the unpadded len and writes its output directly to llr16
+  qam64_qam64_avx2((int32_t *)rxF_256i,
+                   (int32_t *)rxF_i_256i,
+                   (int32_t *)ch_mag_256i,
+                   (int32_t *)ch_mag_i_256i,
+                   (int16_t *)llr16,
+                   (int32_t *) rho_256i,
+                   len);
+  // release the scratch buffers, passing the allocated byte count (sizeof on the pointer variable would only give the pointer size)
+  free16(rxF_256i, len256i*4);
+  free16(rxF_i_256i, len256i*4);
+  free16(ch_mag_256i, len256i*4);
+  free16(ch_mag_i_256i, len256i*4);
+  free16(rho_256i, len256i*4);
+
+#else
   qam64_qam64((short *)rxF,
               (short *)rxF_i,
               (short *)ch_mag,
@@ -8838,6 +8869,7 @@ int dlsch_64qam_64qam_llr(LTE_DL_FRAME_PARMS *frame_parms,
               (short *)llr16,
               (short *)rho,
               len);
+#endif
 
   llr16 += (6*len);
   *llr16p = (short *)llr16;
diff --git a/openair1/PHY/LTE_TRANSPORT/dlsch_llr_computation_avx2.c b/openair1/PHY/LTE_TRANSPORT/dlsch_llr_computation_avx2.c
new file mode 100644
index 0000000000000000000000000000000000000000..cda5ad0f55ee8afbe37bf4d3536cea17fb3ed575
--- /dev/null
+++ b/openair1/PHY/LTE_TRANSPORT/dlsch_llr_computation_avx2.c
@@ -0,0 +1,4034 @@
+ /*
+ * Licensed to the OpenAirInterface (OAI) Software Alliance under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The OpenAirInterface Software Alliance licenses this file to You under
+ * the OAI Public License, Version 1.0  (the "License"); you may not use this file
+ * except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.openairinterface.org/?page_id=698
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *-------------------------------------------------------------------------------
+ * For more information about the OpenAirInterface (OAI) Software Alliance:
+ *      contact@openairinterface.org
+ */
+
+/*! \file PHY/LTE_TRANSPORT/dlsch_llr_computation_avx2.c
+ * \brief AVX2 routines for LLR computation of the PDSCH physical channel from 36-211, V8.6 2009-03
+ * \author R. Knopp, F. Kaltenberger,A. Bhamri, S. Aubert, S. Wagner, X Jiang
+ * \date 2011
+ * \version 0.1
+ * \company Eurecom
+ * \email: knopp@eurecom.fr,florian.kaltenberger@eurecom.fr,ankit.bhamri@eurecom.fr,sebastien.aubert@eurecom.fr, sebastian.wagner@eurecom.fr
+ * \note
+ * \warning
+ */
+
+#include "PHY/defs.h"
+#include "PHY/TOOLS/defs.h"
+#include "PHY/extern.h"
+#include "defs.h"
+#include "extern.h"
+#include "PHY/sse_intrin.h"
+
+int16_t ones256[16] __attribute__ ((aligned(32))) = {0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff};
+
+static __m256i rho_rpi __attribute__ ((aligned(32)));
+static __m256i rho_rmi __attribute__ ((aligned(32)));
+static __m256i rho_rpi_1_1 __attribute__ ((aligned(32)));
+static __m256i rho_rpi_1_3 __attribute__ ((aligned(32)));
+static __m256i rho_rpi_1_5 __attribute__ ((aligned(32)));
+static __m256i rho_rpi_1_7 __attribute__ ((aligned(32)));
+static __m256i rho_rpi_3_1 __attribute__ ((aligned(32)));
+static __m256i rho_rpi_3_3 __attribute__ ((aligned(32)));
+static __m256i rho_rpi_3_5 __attribute__ ((aligned(32)));
+static __m256i rho_rpi_3_7 __attribute__ ((aligned(32)));
+static __m256i rho_rpi_5_1 __attribute__ ((aligned(32)));
+static __m256i rho_rpi_5_3 __attribute__ ((aligned(32)));
+static __m256i rho_rpi_5_5 __attribute__ ((aligned(32)));
+static __m256i rho_rpi_5_7 __attribute__ ((aligned(32)));
+static __m256i rho_rpi_7_1 __attribute__ ((aligned(32)));
+static __m256i rho_rpi_7_3 __attribute__ ((aligned(32)));
+static __m256i rho_rpi_7_5 __attribute__ ((aligned(32)));
+static __m256i rho_rpi_7_7 __attribute__ ((aligned(32)));
+static __m256i rho_rmi_1_1 __attribute__ ((aligned(32)));
+static __m256i rho_rmi_1_3 __attribute__ ((aligned(32)));
+static __m256i rho_rmi_1_5 __attribute__ ((aligned(32)));
+static __m256i rho_rmi_1_7 __attribute__ ((aligned(32)));
+static __m256i rho_rmi_3_1 __attribute__ ((aligned(32)));
+static __m256i rho_rmi_3_3 __attribute__ ((aligned(32)));
+static __m256i rho_rmi_3_5 __attribute__ ((aligned(32)));
+static __m256i rho_rmi_3_7 __attribute__ ((aligned(32)));
+static __m256i rho_rmi_5_1 __attribute__ ((aligned(32)));
+static __m256i rho_rmi_5_3 __attribute__ ((aligned(32)));
+static __m256i rho_rmi_5_5 __attribute__ ((aligned(32)));
+static __m256i rho_rmi_5_7 __attribute__ ((aligned(32)));
+static __m256i rho_rmi_7_1 __attribute__ ((aligned(32)));
+static __m256i rho_rmi_7_3 __attribute__ ((aligned(32)));
+static __m256i rho_rmi_7_5 __attribute__ ((aligned(32)));
+static __m256i rho_rmi_7_7 __attribute__ ((aligned(32)));
+
+static __m256i psi_r_m7_m7 __attribute__ ((aligned(32)));
+static __m256i psi_r_m7_m5 __attribute__ ((aligned(32)));
+static __m256i psi_r_m7_m3 __attribute__ ((aligned(32)));
+static __m256i psi_r_m7_m1 __attribute__ ((aligned(32)));
+static __m256i psi_r_m7_p1 __attribute__ ((aligned(32)));
+static __m256i psi_r_m7_p3 __attribute__ ((aligned(32)));
+static __m256i psi_r_m7_p5 __attribute__ ((aligned(32)));
+static __m256i psi_r_m7_p7 __attribute__ ((aligned(32)));
+static __m256i psi_r_m5_m7 __attribute__ ((aligned(32)));
+static __m256i psi_r_m5_m5 __attribute__ ((aligned(32)));
+static __m256i psi_r_m5_m3 __attribute__ ((aligned(32)));
+static __m256i psi_r_m5_m1 __attribute__ ((aligned(32)));
+static __m256i psi_r_m5_p1 __attribute__ ((aligned(32)));
+static __m256i psi_r_m5_p3 __attribute__ ((aligned(32)));
+static __m256i psi_r_m5_p5 __attribute__ ((aligned(32)));
+static __m256i psi_r_m5_p7 __attribute__ ((aligned(32)));
+static __m256i psi_r_m3_m7 __attribute__ ((aligned(32)));
+static __m256i psi_r_m3_m5 __attribute__ ((aligned(32)));
+static __m256i psi_r_m3_m3 __attribute__ ((aligned(32)));
+static __m256i psi_r_m3_m1 __attribute__ ((aligned(32)));
+static __m256i psi_r_m3_p1 __attribute__ ((aligned(32)));
+static __m256i psi_r_m3_p3 __attribute__ ((aligned(32)));
+static __m256i psi_r_m3_p5 __attribute__ ((aligned(32)));
+static __m256i psi_r_m3_p7 __attribute__ ((aligned(32)));
+static __m256i psi_r_m1_m7 __attribute__ ((aligned(32)));
+static __m256i psi_r_m1_m5 __attribute__ ((aligned(32)));
+static __m256i psi_r_m1_m3 __attribute__ ((aligned(32)));
+static __m256i psi_r_m1_m1 __attribute__ ((aligned(32)));
+static __m256i psi_r_m1_p1 __attribute__ ((aligned(32)));
+static __m256i psi_r_m1_p3 __attribute__ ((aligned(32)));
+static __m256i psi_r_m1_p5 __attribute__ ((aligned(32)));
+static __m256i psi_r_m1_p7 __attribute__ ((aligned(32)));
+static __m256i psi_r_p1_m7 __attribute__ ((aligned(32)));
+static __m256i psi_r_p1_m5 __attribute__ ((aligned(32)));
+static __m256i psi_r_p1_m3 __attribute__ ((aligned(32)));
+static __m256i psi_r_p1_m1 __attribute__ ((aligned(32)));
+static __m256i psi_r_p1_p1 __attribute__ ((aligned(32)));
+static __m256i psi_r_p1_p3 __attribute__ ((aligned(32)));
+static __m256i psi_r_p1_p5 __attribute__ ((aligned(32)));
+static __m256i psi_r_p1_p7 __attribute__ ((aligned(32)));
+static __m256i psi_r_p3_m7 __attribute__ ((aligned(32)));
+static __m256i psi_r_p3_m5 __attribute__ ((aligned(32)));
+static __m256i psi_r_p3_m3 __attribute__ ((aligned(32)));
+static __m256i psi_r_p3_m1 __attribute__ ((aligned(32)));
+static __m256i psi_r_p3_p1 __attribute__ ((aligned(32)));
+static __m256i psi_r_p3_p3 __attribute__ ((aligned(32)));
+static __m256i psi_r_p3_p5 __attribute__ ((aligned(32)));
+static __m256i psi_r_p3_p7 __attribute__ ((aligned(32)));
+static __m256i psi_r_p5_m7 __attribute__ ((aligned(32)));
+static __m256i psi_r_p5_m5 __attribute__ ((aligned(32)));
+static __m256i psi_r_p5_m3 __attribute__ ((aligned(32)));
+static __m256i psi_r_p5_m1 __attribute__ ((aligned(32)));
+static __m256i psi_r_p5_p1 __attribute__ ((aligned(32)));
+static __m256i psi_r_p5_p3 __attribute__ ((aligned(32)));
+static __m256i psi_r_p5_p5 __attribute__ ((aligned(32)));
+static __m256i psi_r_p5_p7 __attribute__ ((aligned(32)));
+static __m256i psi_r_p7_m7 __attribute__ ((aligned(32)));
+static __m256i psi_r_p7_m5 __attribute__ ((aligned(32)));
+static __m256i psi_r_p7_m3 __attribute__ ((aligned(32)));
+static __m256i psi_r_p7_m1 __attribute__ ((aligned(32)));
+static __m256i psi_r_p7_p1 __attribute__ ((aligned(32)));
+static __m256i psi_r_p7_p3 __attribute__ ((aligned(32)));
+static __m256i psi_r_p7_p5 __attribute__ ((aligned(32)));
+static __m256i psi_r_p7_p7 __attribute__ ((aligned(32)));
+
+static __m256i psi_i_m7_m7 __attribute__ ((aligned(32)));
+static __m256i psi_i_m7_m5 __attribute__ ((aligned(32)));
+static __m256i psi_i_m7_m3 __attribute__ ((aligned(32)));
+static __m256i psi_i_m7_m1 __attribute__ ((aligned(32)));
+static __m256i psi_i_m7_p1 __attribute__ ((aligned(32)));
+static __m256i psi_i_m7_p3 __attribute__ ((aligned(32)));
+static __m256i psi_i_m7_p5 __attribute__ ((aligned(32)));
+static __m256i psi_i_m7_p7 __attribute__ ((aligned(32)));
+static __m256i psi_i_m5_m7 __attribute__ ((aligned(32)));
+static __m256i psi_i_m5_m5 __attribute__ ((aligned(32)));
+static __m256i psi_i_m5_m3 __attribute__ ((aligned(32)));
+static __m256i psi_i_m5_m1 __attribute__ ((aligned(32)));
+static __m256i psi_i_m5_p1 __attribute__ ((aligned(32)));
+static __m256i psi_i_m5_p3 __attribute__ ((aligned(32)));
+static __m256i psi_i_m5_p5 __attribute__ ((aligned(32)));
+static __m256i psi_i_m5_p7 __attribute__ ((aligned(32)));
+static __m256i psi_i_m3_m7 __attribute__ ((aligned(32)));
+static __m256i psi_i_m3_m5 __attribute__ ((aligned(32)));
+static __m256i psi_i_m3_m3 __attribute__ ((aligned(32)));
+static __m256i psi_i_m3_m1 __attribute__ ((aligned(32)));
+static __m256i psi_i_m3_p1 __attribute__ ((aligned(32)));
+static __m256i psi_i_m3_p3 __attribute__ ((aligned(32)));
+static __m256i psi_i_m3_p5 __attribute__ ((aligned(32)));
+static __m256i psi_i_m3_p7 __attribute__ ((aligned(32)));
+static __m256i psi_i_m1_m7 __attribute__ ((aligned(32)));
+static __m256i psi_i_m1_m5 __attribute__ ((aligned(32)));
+static __m256i psi_i_m1_m3 __attribute__ ((aligned(32)));
+static __m256i psi_i_m1_m1 __attribute__ ((aligned(32)));
+static __m256i psi_i_m1_p1 __attribute__ ((aligned(32)));
+static __m256i psi_i_m1_p3 __attribute__ ((aligned(32))); // psi_i_<x>_<y> scratch vectors (run continues from above this view); NOTE(review): presumably m/p in the suffixes mean minus/plus - use sites are not visible here, confirm
+static __m256i psi_i_m1_p5 __attribute__ ((aligned(32)));
+static __m256i psi_i_m1_p7 __attribute__ ((aligned(32)));
+static __m256i psi_i_p1_m7 __attribute__ ((aligned(32)));
+static __m256i psi_i_p1_m5 __attribute__ ((aligned(32)));
+static __m256i psi_i_p1_m3 __attribute__ ((aligned(32)));
+static __m256i psi_i_p1_m1 __attribute__ ((aligned(32)));
+static __m256i psi_i_p1_p1 __attribute__ ((aligned(32)));
+static __m256i psi_i_p1_p3 __attribute__ ((aligned(32)));
+static __m256i psi_i_p1_p5 __attribute__ ((aligned(32)));
+static __m256i psi_i_p1_p7 __attribute__ ((aligned(32)));
+static __m256i psi_i_p3_m7 __attribute__ ((aligned(32)));
+static __m256i psi_i_p3_m5 __attribute__ ((aligned(32)));
+static __m256i psi_i_p3_m3 __attribute__ ((aligned(32)));
+static __m256i psi_i_p3_m1 __attribute__ ((aligned(32)));
+static __m256i psi_i_p3_p1 __attribute__ ((aligned(32)));
+static __m256i psi_i_p3_p3 __attribute__ ((aligned(32)));
+static __m256i psi_i_p3_p5 __attribute__ ((aligned(32)));
+static __m256i psi_i_p3_p7 __attribute__ ((aligned(32)));
+static __m256i psi_i_p5_m7 __attribute__ ((aligned(32)));
+static __m256i psi_i_p5_m5 __attribute__ ((aligned(32)));
+static __m256i psi_i_p5_m3 __attribute__ ((aligned(32)));
+static __m256i psi_i_p5_m1 __attribute__ ((aligned(32)));
+static __m256i psi_i_p5_p1 __attribute__ ((aligned(32)));
+static __m256i psi_i_p5_p3 __attribute__ ((aligned(32)));
+static __m256i psi_i_p5_p5 __attribute__ ((aligned(32)));
+static __m256i psi_i_p5_p7 __attribute__ ((aligned(32)));
+static __m256i psi_i_p7_m7 __attribute__ ((aligned(32)));
+static __m256i psi_i_p7_m5 __attribute__ ((aligned(32)));
+static __m256i psi_i_p7_m3 __attribute__ ((aligned(32)));
+static __m256i psi_i_p7_m1 __attribute__ ((aligned(32)));
+static __m256i psi_i_p7_p1 __attribute__ ((aligned(32)));
+static __m256i psi_i_p7_p3 __attribute__ ((aligned(32)));
+static __m256i psi_i_p7_p5 __attribute__ ((aligned(32)));
+static __m256i psi_i_p7_p7 __attribute__ ((aligned(32)));
+// a_r_<x>_<y>: one 32-byte-aligned 256-bit vector per suffix pair over {m7,m5,m3,m1,p1,p3,p5,p7}. NOTE(review): presumably the a_r operands of prodsum_psi_a_epi16 / square_a_*_epi16 below - use sites are outside this view, confirm
+static __m256i a_r_m7_m7 __attribute__ ((aligned(32)));
+static __m256i a_r_m7_m5 __attribute__ ((aligned(32)));
+static __m256i a_r_m7_m3 __attribute__ ((aligned(32)));
+static __m256i a_r_m7_m1 __attribute__ ((aligned(32)));
+static __m256i a_r_m7_p1 __attribute__ ((aligned(32)));
+static __m256i a_r_m7_p3 __attribute__ ((aligned(32)));
+static __m256i a_r_m7_p5 __attribute__ ((aligned(32)));
+static __m256i a_r_m7_p7 __attribute__ ((aligned(32)));
+static __m256i a_r_m5_m7 __attribute__ ((aligned(32)));
+static __m256i a_r_m5_m5 __attribute__ ((aligned(32)));
+static __m256i a_r_m5_m3 __attribute__ ((aligned(32)));
+static __m256i a_r_m5_m1 __attribute__ ((aligned(32)));
+static __m256i a_r_m5_p1 __attribute__ ((aligned(32)));
+static __m256i a_r_m5_p3 __attribute__ ((aligned(32)));
+static __m256i a_r_m5_p5 __attribute__ ((aligned(32)));
+static __m256i a_r_m5_p7 __attribute__ ((aligned(32)));
+static __m256i a_r_m3_m7 __attribute__ ((aligned(32)));
+static __m256i a_r_m3_m5 __attribute__ ((aligned(32)));
+static __m256i a_r_m3_m3 __attribute__ ((aligned(32)));
+static __m256i a_r_m3_m1 __attribute__ ((aligned(32)));
+static __m256i a_r_m3_p1 __attribute__ ((aligned(32)));
+static __m256i a_r_m3_p3 __attribute__ ((aligned(32)));
+static __m256i a_r_m3_p5 __attribute__ ((aligned(32)));
+static __m256i a_r_m3_p7 __attribute__ ((aligned(32)));
+static __m256i a_r_m1_m7 __attribute__ ((aligned(32)));
+static __m256i a_r_m1_m5 __attribute__ ((aligned(32)));
+static __m256i a_r_m1_m3 __attribute__ ((aligned(32)));
+static __m256i a_r_m1_m1 __attribute__ ((aligned(32)));
+static __m256i a_r_m1_p1 __attribute__ ((aligned(32)));
+static __m256i a_r_m1_p3 __attribute__ ((aligned(32)));
+static __m256i a_r_m1_p5 __attribute__ ((aligned(32)));
+static __m256i a_r_m1_p7 __attribute__ ((aligned(32)));
+static __m256i a_r_p1_m7 __attribute__ ((aligned(32)));
+static __m256i a_r_p1_m5 __attribute__ ((aligned(32)));
+static __m256i a_r_p1_m3 __attribute__ ((aligned(32)));
+static __m256i a_r_p1_m1 __attribute__ ((aligned(32)));
+static __m256i a_r_p1_p1 __attribute__ ((aligned(32)));
+static __m256i a_r_p1_p3 __attribute__ ((aligned(32)));
+static __m256i a_r_p1_p5 __attribute__ ((aligned(32)));
+static __m256i a_r_p1_p7 __attribute__ ((aligned(32)));
+static __m256i a_r_p3_m7 __attribute__ ((aligned(32)));
+static __m256i a_r_p3_m5 __attribute__ ((aligned(32)));
+static __m256i a_r_p3_m3 __attribute__ ((aligned(32)));
+static __m256i a_r_p3_m1 __attribute__ ((aligned(32)));
+static __m256i a_r_p3_p1 __attribute__ ((aligned(32)));
+static __m256i a_r_p3_p3 __attribute__ ((aligned(32)));
+static __m256i a_r_p3_p5 __attribute__ ((aligned(32)));
+static __m256i a_r_p3_p7 __attribute__ ((aligned(32)));
+static __m256i a_r_p5_m7 __attribute__ ((aligned(32)));
+static __m256i a_r_p5_m5 __attribute__ ((aligned(32)));
+static __m256i a_r_p5_m3 __attribute__ ((aligned(32)));
+static __m256i a_r_p5_m1 __attribute__ ((aligned(32)));
+static __m256i a_r_p5_p1 __attribute__ ((aligned(32)));
+static __m256i a_r_p5_p3 __attribute__ ((aligned(32)));
+static __m256i a_r_p5_p5 __attribute__ ((aligned(32)));
+static __m256i a_r_p5_p7 __attribute__ ((aligned(32)));
+static __m256i a_r_p7_m7 __attribute__ ((aligned(32)));
+static __m256i a_r_p7_m5 __attribute__ ((aligned(32)));
+static __m256i a_r_p7_m3 __attribute__ ((aligned(32)));
+static __m256i a_r_p7_m1 __attribute__ ((aligned(32)));
+static __m256i a_r_p7_p1 __attribute__ ((aligned(32)));
+static __m256i a_r_p7_p3 __attribute__ ((aligned(32)));
+static __m256i a_r_p7_p5 __attribute__ ((aligned(32)));
+static __m256i a_r_p7_p7 __attribute__ ((aligned(32)));
+// a_i_<x>_<y>: one 32-byte-aligned 256-bit vector per suffix pair over {m7,m5,m3,m1,p1,p3,p5,p7}. NOTE(review): presumably the a_i operands of prodsum_psi_a_epi16 / square_a_*_epi16 below - use sites are outside this view, confirm
+static __m256i a_i_m7_m7 __attribute__ ((aligned(32)));
+static __m256i a_i_m7_m5 __attribute__ ((aligned(32)));
+static __m256i a_i_m7_m3 __attribute__ ((aligned(32)));
+static __m256i a_i_m7_m1 __attribute__ ((aligned(32)));
+static __m256i a_i_m7_p1 __attribute__ ((aligned(32)));
+static __m256i a_i_m7_p3 __attribute__ ((aligned(32)));
+static __m256i a_i_m7_p5 __attribute__ ((aligned(32)));
+static __m256i a_i_m7_p7 __attribute__ ((aligned(32)));
+static __m256i a_i_m5_m7 __attribute__ ((aligned(32)));
+static __m256i a_i_m5_m5 __attribute__ ((aligned(32)));
+static __m256i a_i_m5_m3 __attribute__ ((aligned(32)));
+static __m256i a_i_m5_m1 __attribute__ ((aligned(32)));
+static __m256i a_i_m5_p1 __attribute__ ((aligned(32)));
+static __m256i a_i_m5_p3 __attribute__ ((aligned(32)));
+static __m256i a_i_m5_p5 __attribute__ ((aligned(32)));
+static __m256i a_i_m5_p7 __attribute__ ((aligned(32)));
+static __m256i a_i_m3_m7 __attribute__ ((aligned(32)));
+static __m256i a_i_m3_m5 __attribute__ ((aligned(32)));
+static __m256i a_i_m3_m3 __attribute__ ((aligned(32)));
+static __m256i a_i_m3_m1 __attribute__ ((aligned(32)));
+static __m256i a_i_m3_p1 __attribute__ ((aligned(32)));
+static __m256i a_i_m3_p3 __attribute__ ((aligned(32)));
+static __m256i a_i_m3_p5 __attribute__ ((aligned(32)));
+static __m256i a_i_m3_p7 __attribute__ ((aligned(32)));
+static __m256i a_i_m1_m7 __attribute__ ((aligned(32)));
+static __m256i a_i_m1_m5 __attribute__ ((aligned(32)));
+static __m256i a_i_m1_m3 __attribute__ ((aligned(32)));
+static __m256i a_i_m1_m1 __attribute__ ((aligned(32)));
+static __m256i a_i_m1_p1 __attribute__ ((aligned(32)));
+static __m256i a_i_m1_p3 __attribute__ ((aligned(32)));
+static __m256i a_i_m1_p5 __attribute__ ((aligned(32)));
+static __m256i a_i_m1_p7 __attribute__ ((aligned(32)));
+static __m256i a_i_p1_m7 __attribute__ ((aligned(32)));
+static __m256i a_i_p1_m5 __attribute__ ((aligned(32)));
+static __m256i a_i_p1_m3 __attribute__ ((aligned(32)));
+static __m256i a_i_p1_m1 __attribute__ ((aligned(32)));
+static __m256i a_i_p1_p1 __attribute__ ((aligned(32)));
+static __m256i a_i_p1_p3 __attribute__ ((aligned(32)));
+static __m256i a_i_p1_p5 __attribute__ ((aligned(32)));
+static __m256i a_i_p1_p7 __attribute__ ((aligned(32)));
+static __m256i a_i_p3_m7 __attribute__ ((aligned(32)));
+static __m256i a_i_p3_m5 __attribute__ ((aligned(32)));
+static __m256i a_i_p3_m3 __attribute__ ((aligned(32)));
+static __m256i a_i_p3_m1 __attribute__ ((aligned(32)));
+static __m256i a_i_p3_p1 __attribute__ ((aligned(32)));
+static __m256i a_i_p3_p3 __attribute__ ((aligned(32)));
+static __m256i a_i_p3_p5 __attribute__ ((aligned(32)));
+static __m256i a_i_p3_p7 __attribute__ ((aligned(32)));
+static __m256i a_i_p5_m7 __attribute__ ((aligned(32)));
+static __m256i a_i_p5_m5 __attribute__ ((aligned(32)));
+static __m256i a_i_p5_m3 __attribute__ ((aligned(32)));
+static __m256i a_i_p5_m1 __attribute__ ((aligned(32)));
+static __m256i a_i_p5_p1 __attribute__ ((aligned(32)));
+static __m256i a_i_p5_p3 __attribute__ ((aligned(32)));
+static __m256i a_i_p5_p5 __attribute__ ((aligned(32)));
+static __m256i a_i_p5_p7 __attribute__ ((aligned(32)));
+static __m256i a_i_p7_m7 __attribute__ ((aligned(32)));
+static __m256i a_i_p7_m5 __attribute__ ((aligned(32)));
+static __m256i a_i_p7_m3 __attribute__ ((aligned(32)));
+static __m256i a_i_p7_m1 __attribute__ ((aligned(32)));
+static __m256i a_i_p7_p1 __attribute__ ((aligned(32)));
+static __m256i a_i_p7_p3 __attribute__ ((aligned(32)));
+static __m256i a_i_p7_p5 __attribute__ ((aligned(32)));
+static __m256i a_i_p7_p7 __attribute__ ((aligned(32)));
+// psi_a_<x>_<y>: one 32-byte-aligned 256-bit vector per suffix pair over {m7,m5,m3,m1,p1,p3,p5,p7}. NOTE(review): presumably the psi_a results of prodsum_psi_a_epi16 below - use sites are outside this view, confirm
+static __m256i psi_a_m7_m7 __attribute__ ((aligned(32)));
+static __m256i psi_a_m7_m5 __attribute__ ((aligned(32)));
+static __m256i psi_a_m7_m3 __attribute__ ((aligned(32)));
+static __m256i psi_a_m7_m1 __attribute__ ((aligned(32)));
+static __m256i psi_a_m7_p1 __attribute__ ((aligned(32)));
+static __m256i psi_a_m7_p3 __attribute__ ((aligned(32)));
+static __m256i psi_a_m7_p5 __attribute__ ((aligned(32)));
+static __m256i psi_a_m7_p7 __attribute__ ((aligned(32)));
+static __m256i psi_a_m5_m7 __attribute__ ((aligned(32)));
+static __m256i psi_a_m5_m5 __attribute__ ((aligned(32)));
+static __m256i psi_a_m5_m3 __attribute__ ((aligned(32)));
+static __m256i psi_a_m5_m1 __attribute__ ((aligned(32)));
+static __m256i psi_a_m5_p1 __attribute__ ((aligned(32)));
+static __m256i psi_a_m5_p3 __attribute__ ((aligned(32)));
+static __m256i psi_a_m5_p5 __attribute__ ((aligned(32)));
+static __m256i psi_a_m5_p7 __attribute__ ((aligned(32)));
+static __m256i psi_a_m3_m7 __attribute__ ((aligned(32)));
+static __m256i psi_a_m3_m5 __attribute__ ((aligned(32)));
+static __m256i psi_a_m3_m3 __attribute__ ((aligned(32)));
+static __m256i psi_a_m3_m1 __attribute__ ((aligned(32)));
+static __m256i psi_a_m3_p1 __attribute__ ((aligned(32)));
+static __m256i psi_a_m3_p3 __attribute__ ((aligned(32)));
+static __m256i psi_a_m3_p5 __attribute__ ((aligned(32)));
+static __m256i psi_a_m3_p7 __attribute__ ((aligned(32)));
+static __m256i psi_a_m1_m7 __attribute__ ((aligned(32)));
+static __m256i psi_a_m1_m5 __attribute__ ((aligned(32)));
+static __m256i psi_a_m1_m3 __attribute__ ((aligned(32)));
+static __m256i psi_a_m1_m1 __attribute__ ((aligned(32)));
+static __m256i psi_a_m1_p1 __attribute__ ((aligned(32)));
+static __m256i psi_a_m1_p3 __attribute__ ((aligned(32)));
+static __m256i psi_a_m1_p5 __attribute__ ((aligned(32)));
+static __m256i psi_a_m1_p7 __attribute__ ((aligned(32)));
+static __m256i psi_a_p1_m7 __attribute__ ((aligned(32)));
+static __m256i psi_a_p1_m5 __attribute__ ((aligned(32)));
+static __m256i psi_a_p1_m3 __attribute__ ((aligned(32)));
+static __m256i psi_a_p1_m1 __attribute__ ((aligned(32)));
+static __m256i psi_a_p1_p1 __attribute__ ((aligned(32)));
+static __m256i psi_a_p1_p3 __attribute__ ((aligned(32)));
+static __m256i psi_a_p1_p5 __attribute__ ((aligned(32)));
+static __m256i psi_a_p1_p7 __attribute__ ((aligned(32)));
+static __m256i psi_a_p3_m7 __attribute__ ((aligned(32)));
+static __m256i psi_a_p3_m5 __attribute__ ((aligned(32)));
+static __m256i psi_a_p3_m3 __attribute__ ((aligned(32)));
+static __m256i psi_a_p3_m1 __attribute__ ((aligned(32)));
+static __m256i psi_a_p3_p1 __attribute__ ((aligned(32)));
+static __m256i psi_a_p3_p3 __attribute__ ((aligned(32)));
+static __m256i psi_a_p3_p5 __attribute__ ((aligned(32)));
+static __m256i psi_a_p3_p7 __attribute__ ((aligned(32)));
+static __m256i psi_a_p5_m7 __attribute__ ((aligned(32)));
+static __m256i psi_a_p5_m5 __attribute__ ((aligned(32)));
+static __m256i psi_a_p5_m3 __attribute__ ((aligned(32)));
+static __m256i psi_a_p5_m1 __attribute__ ((aligned(32)));
+static __m256i psi_a_p5_p1 __attribute__ ((aligned(32)));
+static __m256i psi_a_p5_p3 __attribute__ ((aligned(32)));
+static __m256i psi_a_p5_p5 __attribute__ ((aligned(32)));
+static __m256i psi_a_p5_p7 __attribute__ ((aligned(32)));
+static __m256i psi_a_p7_m7 __attribute__ ((aligned(32)));
+static __m256i psi_a_p7_m5 __attribute__ ((aligned(32)));
+static __m256i psi_a_p7_m3 __attribute__ ((aligned(32)));
+static __m256i psi_a_p7_m1 __attribute__ ((aligned(32)));
+static __m256i psi_a_p7_p1 __attribute__ ((aligned(32)));
+static __m256i psi_a_p7_p3 __attribute__ ((aligned(32)));
+static __m256i psi_a_p7_p5 __attribute__ ((aligned(32)));
+static __m256i psi_a_p7_p7 __attribute__ ((aligned(32)));
+// a_sq_<x>_<y>: one 32-byte-aligned 256-bit vector per suffix pair over {m7,m5,m3,m1,p1,p3,p5,p7}. NOTE(review): presumably the a_sq results of square_a_epi16 / square_a_64qam_epi16 below - use sites are outside this view, confirm
+static __m256i a_sq_m7_m7 __attribute__ ((aligned(32)));
+static __m256i a_sq_m7_m5 __attribute__ ((aligned(32)));
+static __m256i a_sq_m7_m3 __attribute__ ((aligned(32)));
+static __m256i a_sq_m7_m1 __attribute__ ((aligned(32)));
+static __m256i a_sq_m7_p1 __attribute__ ((aligned(32)));
+static __m256i a_sq_m7_p3 __attribute__ ((aligned(32)));
+static __m256i a_sq_m7_p5 __attribute__ ((aligned(32)));
+static __m256i a_sq_m7_p7 __attribute__ ((aligned(32)));
+static __m256i a_sq_m5_m7 __attribute__ ((aligned(32)));
+static __m256i a_sq_m5_m5 __attribute__ ((aligned(32)));
+static __m256i a_sq_m5_m3 __attribute__ ((aligned(32)));
+static __m256i a_sq_m5_m1 __attribute__ ((aligned(32)));
+static __m256i a_sq_m5_p1 __attribute__ ((aligned(32)));
+static __m256i a_sq_m5_p3 __attribute__ ((aligned(32)));
+static __m256i a_sq_m5_p5 __attribute__ ((aligned(32)));
+static __m256i a_sq_m5_p7 __attribute__ ((aligned(32)));
+static __m256i a_sq_m3_m7 __attribute__ ((aligned(32)));
+static __m256i a_sq_m3_m5 __attribute__ ((aligned(32)));
+static __m256i a_sq_m3_m3 __attribute__ ((aligned(32)));
+static __m256i a_sq_m3_m1 __attribute__ ((aligned(32)));
+static __m256i a_sq_m3_p1 __attribute__ ((aligned(32)));
+static __m256i a_sq_m3_p3 __attribute__ ((aligned(32)));
+static __m256i a_sq_m3_p5 __attribute__ ((aligned(32)));
+static __m256i a_sq_m3_p7 __attribute__ ((aligned(32)));
+static __m256i a_sq_m1_m7 __attribute__ ((aligned(32)));
+static __m256i a_sq_m1_m5 __attribute__ ((aligned(32)));
+static __m256i a_sq_m1_m3 __attribute__ ((aligned(32)));
+static __m256i a_sq_m1_m1 __attribute__ ((aligned(32)));
+static __m256i a_sq_m1_p1 __attribute__ ((aligned(32)));
+static __m256i a_sq_m1_p3 __attribute__ ((aligned(32)));
+static __m256i a_sq_m1_p5 __attribute__ ((aligned(32)));
+static __m256i a_sq_m1_p7 __attribute__ ((aligned(32)));
+static __m256i a_sq_p1_m7 __attribute__ ((aligned(32)));
+static __m256i a_sq_p1_m5 __attribute__ ((aligned(32)));
+static __m256i a_sq_p1_m3 __attribute__ ((aligned(32)));
+static __m256i a_sq_p1_m1 __attribute__ ((aligned(32)));
+static __m256i a_sq_p1_p1 __attribute__ ((aligned(32)));
+static __m256i a_sq_p1_p3 __attribute__ ((aligned(32)));
+static __m256i a_sq_p1_p5 __attribute__ ((aligned(32)));
+static __m256i a_sq_p1_p7 __attribute__ ((aligned(32)));
+static __m256i a_sq_p3_m7 __attribute__ ((aligned(32)));
+static __m256i a_sq_p3_m5 __attribute__ ((aligned(32)));
+static __m256i a_sq_p3_m3 __attribute__ ((aligned(32)));
+static __m256i a_sq_p3_m1 __attribute__ ((aligned(32)));
+static __m256i a_sq_p3_p1 __attribute__ ((aligned(32)));
+static __m256i a_sq_p3_p3 __attribute__ ((aligned(32)));
+static __m256i a_sq_p3_p5 __attribute__ ((aligned(32)));
+static __m256i a_sq_p3_p7 __attribute__ ((aligned(32)));
+static __m256i a_sq_p5_m7 __attribute__ ((aligned(32)));
+static __m256i a_sq_p5_m5 __attribute__ ((aligned(32)));
+static __m256i a_sq_p5_m3 __attribute__ ((aligned(32)));
+static __m256i a_sq_p5_m1 __attribute__ ((aligned(32)));
+static __m256i a_sq_p5_p1 __attribute__ ((aligned(32)));
+static __m256i a_sq_p5_p3 __attribute__ ((aligned(32)));
+static __m256i a_sq_p5_p5 __attribute__ ((aligned(32)));
+static __m256i a_sq_p5_p7 __attribute__ ((aligned(32)));
+static __m256i a_sq_p7_m7 __attribute__ ((aligned(32)));
+static __m256i a_sq_p7_m5 __attribute__ ((aligned(32)));
+static __m256i a_sq_p7_m3 __attribute__ ((aligned(32)));
+static __m256i a_sq_p7_m1 __attribute__ ((aligned(32)));
+static __m256i a_sq_p7_p1 __attribute__ ((aligned(32)));
+static __m256i a_sq_p7_p3 __attribute__ ((aligned(32)));
+static __m256i a_sq_p7_p5 __attribute__ ((aligned(32)));
+static __m256i a_sq_p7_p7 __attribute__ ((aligned(32)));
+// bit_met_<x>_<y>: one 32-byte-aligned 256-bit vector per suffix pair over {m7,m5,m3,m1,p1,p3,p5,p7}. NOTE(review): presumably per-candidate bit metrics - they are not written or read within this view, confirm against the LLR routines
+static __m256i bit_met_m7_m7 __attribute__ ((aligned(32)));
+static __m256i bit_met_m7_m5 __attribute__ ((aligned(32)));
+static __m256i bit_met_m7_m3 __attribute__ ((aligned(32)));
+static __m256i bit_met_m7_m1 __attribute__ ((aligned(32)));
+static __m256i bit_met_m7_p1 __attribute__ ((aligned(32)));
+static __m256i bit_met_m7_p3 __attribute__ ((aligned(32)));
+static __m256i bit_met_m7_p5 __attribute__ ((aligned(32)));
+static __m256i bit_met_m7_p7 __attribute__ ((aligned(32)));
+static __m256i bit_met_m5_m7 __attribute__ ((aligned(32)));
+static __m256i bit_met_m5_m5 __attribute__ ((aligned(32)));
+static __m256i bit_met_m5_m3 __attribute__ ((aligned(32)));
+static __m256i bit_met_m5_m1 __attribute__ ((aligned(32)));
+static __m256i bit_met_m5_p1 __attribute__ ((aligned(32)));
+static __m256i bit_met_m5_p3 __attribute__ ((aligned(32)));
+static __m256i bit_met_m5_p5 __attribute__ ((aligned(32)));
+static __m256i bit_met_m5_p7 __attribute__ ((aligned(32)));
+static __m256i bit_met_m3_m7 __attribute__ ((aligned(32)));
+static __m256i bit_met_m3_m5 __attribute__ ((aligned(32)));
+static __m256i bit_met_m3_m3 __attribute__ ((aligned(32)));
+static __m256i bit_met_m3_m1 __attribute__ ((aligned(32)));
+static __m256i bit_met_m3_p1 __attribute__ ((aligned(32)));
+static __m256i bit_met_m3_p3 __attribute__ ((aligned(32)));
+static __m256i bit_met_m3_p5 __attribute__ ((aligned(32)));
+static __m256i bit_met_m3_p7 __attribute__ ((aligned(32)));
+static __m256i bit_met_m1_m7 __attribute__ ((aligned(32)));
+static __m256i bit_met_m1_m5 __attribute__ ((aligned(32)));
+static __m256i bit_met_m1_m3 __attribute__ ((aligned(32)));
+static __m256i bit_met_m1_m1 __attribute__ ((aligned(32)));
+static __m256i bit_met_m1_p1 __attribute__ ((aligned(32)));
+static __m256i bit_met_m1_p3 __attribute__ ((aligned(32)));
+static __m256i bit_met_m1_p5 __attribute__ ((aligned(32)));
+static __m256i bit_met_m1_p7 __attribute__ ((aligned(32)));
+static __m256i bit_met_p1_m7 __attribute__ ((aligned(32)));
+static __m256i bit_met_p1_m5 __attribute__ ((aligned(32)));
+static __m256i bit_met_p1_m3 __attribute__ ((aligned(32)));
+static __m256i bit_met_p1_m1 __attribute__ ((aligned(32)));
+static __m256i bit_met_p1_p1 __attribute__ ((aligned(32)));
+static __m256i bit_met_p1_p3 __attribute__ ((aligned(32)));
+static __m256i bit_met_p1_p5 __attribute__ ((aligned(32)));
+static __m256i bit_met_p1_p7 __attribute__ ((aligned(32)));
+static __m256i bit_met_p3_m7 __attribute__ ((aligned(32)));
+static __m256i bit_met_p3_m5 __attribute__ ((aligned(32)));
+static __m256i bit_met_p3_m3 __attribute__ ((aligned(32)));
+static __m256i bit_met_p3_m1 __attribute__ ((aligned(32)));
+static __m256i bit_met_p3_p1 __attribute__ ((aligned(32)));
+static __m256i bit_met_p3_p3 __attribute__ ((aligned(32)));
+static __m256i bit_met_p3_p5 __attribute__ ((aligned(32)));
+static __m256i bit_met_p3_p7 __attribute__ ((aligned(32)));
+static __m256i bit_met_p5_m7 __attribute__ ((aligned(32)));
+static __m256i bit_met_p5_m5 __attribute__ ((aligned(32)));
+static __m256i bit_met_p5_m3 __attribute__ ((aligned(32)));
+static __m256i bit_met_p5_m1 __attribute__ ((aligned(32)));
+static __m256i bit_met_p5_p1 __attribute__ ((aligned(32)));
+static __m256i bit_met_p5_p3 __attribute__ ((aligned(32)));
+static __m256i bit_met_p5_p5 __attribute__ ((aligned(32)));
+static __m256i bit_met_p5_p7 __attribute__ ((aligned(32)));
+static __m256i bit_met_p7_m7 __attribute__ ((aligned(32)));
+static __m256i bit_met_p7_m5 __attribute__ ((aligned(32)));
+static __m256i bit_met_p7_m3 __attribute__ ((aligned(32)));
+static __m256i bit_met_p7_m1 __attribute__ ((aligned(32)));
+static __m256i bit_met_p7_p1 __attribute__ ((aligned(32)));
+static __m256i bit_met_p7_p3 __attribute__ ((aligned(32)));
+static __m256i bit_met_p7_p5 __attribute__ ((aligned(32)));
+static __m256i bit_met_p7_p7 __attribute__ ((aligned(32)));
+// y0_p_<a>_<b> / y0_m_<a>_<b>: one 256-bit vector per pair a,b in {1,3,5,7}. NOTE(review): presumably "plus"/"minus" combinations involving y0 - they are not written or read within this view, confirm
+static __m256i  y0_p_1_1 __attribute__ ((aligned(32)));
+static __m256i  y0_p_1_3 __attribute__ ((aligned(32)));
+static __m256i  y0_p_1_5 __attribute__ ((aligned(32)));
+static __m256i  y0_p_1_7 __attribute__ ((aligned(32)));
+static __m256i  y0_p_3_1 __attribute__ ((aligned(32)));
+static __m256i  y0_p_3_3 __attribute__ ((aligned(32)));
+static __m256i  y0_p_3_5 __attribute__ ((aligned(32)));
+static __m256i  y0_p_3_7 __attribute__ ((aligned(32)));
+static __m256i  y0_p_5_1 __attribute__ ((aligned(32)));
+static __m256i  y0_p_5_3 __attribute__ ((aligned(32)));
+static __m256i  y0_p_5_5 __attribute__ ((aligned(32)));
+static __m256i  y0_p_5_7 __attribute__ ((aligned(32)));
+static __m256i  y0_p_7_1 __attribute__ ((aligned(32)));
+static __m256i  y0_p_7_3 __attribute__ ((aligned(32)));
+static __m256i  y0_p_7_5 __attribute__ ((aligned(32)));
+static __m256i  y0_p_7_7 __attribute__ ((aligned(32)));
+static __m256i  y0_m_1_1 __attribute__ ((aligned(32)));
+static __m256i  y0_m_1_3 __attribute__ ((aligned(32)));
+static __m256i  y0_m_1_5 __attribute__ ((aligned(32)));
+static __m256i  y0_m_1_7 __attribute__ ((aligned(32)));
+static __m256i  y0_m_3_1 __attribute__ ((aligned(32)));
+static __m256i  y0_m_3_3 __attribute__ ((aligned(32)));
+static __m256i  y0_m_3_5 __attribute__ ((aligned(32)));
+static __m256i  y0_m_3_7 __attribute__ ((aligned(32)));
+static __m256i  y0_m_5_1 __attribute__ ((aligned(32)));
+static __m256i  y0_m_5_3 __attribute__ ((aligned(32)));
+static __m256i  y0_m_5_5 __attribute__ ((aligned(32)));
+static __m256i  y0_m_5_7 __attribute__ ((aligned(32)));
+static __m256i  y0_m_7_1 __attribute__ ((aligned(32)));
+static __m256i  y0_m_7_3 __attribute__ ((aligned(32)));
+static __m256i  y0_m_7_5 __attribute__ ((aligned(32)));
+static __m256i  y0_m_7_7 __attribute__ ((aligned(32)));
+// General-purpose scratch vectors; these are 256-bit __m256i values despite the SSE-style "xmm" names. qam64_qam16_avx2 uses xmm2/xmm3 for Re/Im of rho and xmm4..xmm8 for scaled products.
+static __m256i  xmm0 __attribute__ ((aligned(32)));
+static __m256i  xmm1 __attribute__ ((aligned(32)));
+static __m256i  xmm2 __attribute__ ((aligned(32)));
+static __m256i  xmm3 __attribute__ ((aligned(32)));
+static __m256i  xmm4 __attribute__ ((aligned(32)));
+static __m256i  xmm5 __attribute__ ((aligned(32)));
+static __m256i  xmm6 __attribute__ ((aligned(32)));
+static __m256i  xmm7 __attribute__ ((aligned(32)));
+static __m256i  xmm8 __attribute__ ((aligned(32)));
+// NOTE(review): presumably real (r) / imaginary (i) parts of y0, y1 and y2 - not written or read within this view, confirm against the LLR routines
+static __m256i  y0r __attribute__ ((aligned(32)));
+static __m256i  y0i __attribute__ ((aligned(32)));
+static __m256i  y1r __attribute__ ((aligned(32)));
+static __m256i  y1i __attribute__ ((aligned(32)));
+static __m256i  y2r __attribute__ ((aligned(32)));
+static __m256i  y2i __attribute__ ((aligned(32)));
+// NOTE(review): presumably numerator/denominator terms of a log-max metric - not written or read within this view, confirm
+static __m256i  logmax_num_re0 __attribute__ ((aligned(32)));
+static __m256i  logmax_den_re0 __attribute__ ((aligned(32)));
+// Scratch vectors overwritten by the helper macros below (interference_abs_*, prodsum_psi_a_epi16, square_a_*). They are file-scope state, so every macro expansion clobbers them.
+static __m256i tmp_result  __attribute__ ((aligned(32)));
+static __m256i tmp_result2 __attribute__ ((aligned(32)));
+static __m256i tmp_result3 __attribute__ ((aligned(32)));
+static __m256i tmp_result4 __attribute__ ((aligned(32)));
+
+//==============================================================================================
+// Auxiliary Macros
+
+// calculate interference magnitude: per 16-bit lane, int_mag = c1 where int_ch_mag > psi (signed compare), otherwise c2 (assumes ones256, defined outside this view, is all-ones so the xor inverts the mask - confirm). Clobbers tmp_result/tmp_result2. NOTE(review): expands to several bare statements, so never use it as the body of an unbraced if/else/loop.
+#define interference_abs_epi16(psi,int_ch_mag,int_mag,c1,c2) tmp_result = _mm256_cmpgt_epi16(int_ch_mag,psi); tmp_result2 = _mm256_xor_si256(tmp_result,(*(__m256i*)&ones256[0])); tmp_result = _mm256_and_si256(tmp_result,c1); tmp_result2 = _mm256_and_si256(tmp_result2,c2); int_mag = _mm256_or_si256(tmp_result,tmp_result2);
+
+// calculate interference magnitude for 64-QAM: per 16-bit lane, a = c1 if psi < int_ch_mag, c3 if int_ch_mag <= psi < int_two_ch_mag, c5 if int_two_ch_mag <= psi <= int_three_ch_mag, c7 if psi > int_three_ch_mag (assumes the three thresholds are increasing and ones256 is all-ones - confirm). NOTE(review): expands to several bare statements, so never use it as the body of an unbraced if/else/loop.
+// tmp_result = ones in shorts corr. to interval 2<=x<4, tmp_result2 interval x<2, tmp_result3 interval 4<=x<=6 and tmp_result4 interval x>6, where 2/4/6 stand for int_ch_mag/int_two_ch_mag/int_three_ch_mag; all four temporaries are clobbered
+#define interference_abs_64qam_epi16(psi,int_ch_mag,int_two_ch_mag,int_three_ch_mag,a,c1,c3,c5,c7) tmp_result = _mm256_cmpgt_epi16(int_two_ch_mag,psi); tmp_result3 = _mm256_xor_si256(tmp_result,(*(__m256i*)&ones256[0])); tmp_result2 = _mm256_cmpgt_epi16(int_ch_mag,psi); tmp_result = _mm256_xor_si256(tmp_result,tmp_result2); tmp_result4 = _mm256_cmpgt_epi16(psi,int_three_ch_mag); tmp_result3 = _mm256_xor_si256(tmp_result3,tmp_result4); tmp_result = _mm256_and_si256(tmp_result,c3); tmp_result2 = _mm256_and_si256(tmp_result2,c1); tmp_result3 = _mm256_and_si256(tmp_result3,c5); tmp_result4 = _mm256_and_si256(tmp_result4,c7); tmp_result = _mm256_or_si256(tmp_result,tmp_result2); tmp_result3 = _mm256_or_si256(tmp_result3,tmp_result4); a = _mm256_or_si256(tmp_result,tmp_result3);
+
+// calculates psi_a = psi_r*a_r + psi_i*a_i; each product keeps the high 16 bits of the signed 32-bit product and is shifted left by 1, and the two terms are added with saturation. Clobbers tmp_result/tmp_result2. NOTE(review): expands to several bare statements, so never use it as the body of an unbraced if/else/loop.
+#define prodsum_psi_a_epi16(psi_r,a_r,psi_i,a_i,psi_a) tmp_result = _mm256_mulhi_epi16(psi_r,a_r); tmp_result = _mm256_slli_epi16(tmp_result,1); tmp_result2 = _mm256_mulhi_epi16(psi_i,a_i); tmp_result2 = _mm256_slli_epi16(tmp_result2,1); psi_a = _mm256_adds_epi16(tmp_result,tmp_result2);
+
+// calculates a_sq = int_ch_mag*(a_r^2 + a_i^2)*scale_factor; every multiply keeps the high 16 bits of the signed product and is followed by a left shift of 1, and the final sum saturates. Clobbers tmp_result/tmp_result2. NOTE(review): expands to several bare statements, so never use it as the body of an unbraced if/else/loop.
+#define square_a_epi16(a_r,a_i,int_ch_mag,scale_factor,a_sq) tmp_result = _mm256_mulhi_epi16(a_r,a_r); tmp_result = _mm256_slli_epi16(tmp_result,1); tmp_result = _mm256_mulhi_epi16(tmp_result,scale_factor); tmp_result = _mm256_slli_epi16(tmp_result,1); tmp_result = _mm256_mulhi_epi16(tmp_result,int_ch_mag); tmp_result = _mm256_slli_epi16(tmp_result,1); tmp_result2 = _mm256_mulhi_epi16(a_i,a_i); tmp_result2 = _mm256_slli_epi16(tmp_result2,1); tmp_result2 = _mm256_mulhi_epi16(tmp_result2,scale_factor); tmp_result2 = _mm256_slli_epi16(tmp_result2,1); tmp_result2 = _mm256_mulhi_epi16(tmp_result2,int_ch_mag); tmp_result2 = _mm256_slli_epi16(tmp_result2,1); a_sq = _mm256_adds_epi16(tmp_result,tmp_result2);
+
+// calculates a_sq = int_ch_mag*(a_r^2 + a_i^2)*scale_factor for 64-QAM; same sequence as square_a_epi16 except that the left shift after the scale_factor multiply is 3 instead of 1. Clobbers tmp_result/tmp_result2. NOTE(review): expands to several bare statements, so never use it as the body of an unbraced if/else/loop.
+#define square_a_64qam_epi16(a_r,a_i,int_ch_mag,scale_factor,a_sq)  tmp_result = _mm256_mulhi_epi16(a_r,a_r); tmp_result = _mm256_slli_epi16(tmp_result,1); tmp_result = _mm256_mulhi_epi16(tmp_result,scale_factor); tmp_result = _mm256_slli_epi16(tmp_result,3); tmp_result = _mm256_mulhi_epi16(tmp_result,int_ch_mag); tmp_result = _mm256_slli_epi16(tmp_result,1); tmp_result2 = _mm256_mulhi_epi16(a_i,a_i); tmp_result2 = _mm256_slli_epi16(tmp_result2,1); tmp_result2 = _mm256_mulhi_epi16(tmp_result2,scale_factor); tmp_result2 = _mm256_slli_epi16(tmp_result2,3); tmp_result2 = _mm256_mulhi_epi16(tmp_result2,int_ch_mag); tmp_result2 = _mm256_slli_epi16(tmp_result2,1); a_sq = _mm256_adds_epi16(tmp_result,tmp_result2);
+// Deinterleave 16 complex 16-bit samples: in0 holds samples 0..7 and in1 samples 8..15, each as [Re Im Re Im ...]; *out_re receives Re(0..15) and *out_im receives Im(0..15), in sample order.
+void seperate_real_imag_parts(__m256i *out_re,
+                              __m256i *out_im,
+                              __m256i in0,
+                              __m256i in1)
+{
+    __m256i tmp0;
+    __m256i tmp1;
+
+    in0 = _mm256_shufflelo_epi16(in0,0xd8); // 0xd8 == _MM_SHUFFLE(3,1,2,0): low 4 words of each 128-bit lane become [Re Re Im Im]
+    in0 = _mm256_shufflehi_epi16(in0,0xd8); // same reordering for the high 4 words of each 128-bit lane
+    in0 = _mm256_shuffle_epi32(in0,0xd8); // swap the two middle dwords: each 128-bit lane is now [Re Re Re Re Im Im Im Im]
+
+    in1 = _mm256_shufflelo_epi16(in1,0xd8); // 0xd8 == _MM_SHUFFLE(3,1,2,0): low 4 words of each 128-bit lane become [Re Re Im Im]
+    in1 = _mm256_shufflehi_epi16(in1,0xd8); // same reordering for the high 4 words of each 128-bit lane
+    in1 = _mm256_shuffle_epi32(in1,0xd8); // swap the two middle dwords: each 128-bit lane is now [Re Re Re Re Im Im Im Im]
+
+    //in0 = [Re(0,1,2,3)   Im(0,1,2,3)   Re(4,5,6,7)     Im(4,5,6,7)]
+    //in1 = [Re(8,9,10,11) Im(8,9,10,11) Re(12,13,14,15) Im(12,13,14,15)]
+
+    tmp0 = _mm256_unpacklo_epi64(in0, in1);
+    //tmp0 = [Re(0,1,2,3) Re(8,9,10,11) Re(4,5,6,7) Re(12,13,14,15)]
+    tmp0 = _mm256_permute4x64_epi64(tmp0,0xd8); // swap the two middle 64-bit quarters -> Re(0..15) in sample order
+
+    tmp1 = _mm256_unpackhi_epi64(in0, in1);
+    //tmp1 = [Im(0,1,2,3) Im(8,9,10,11) Im(4,5,6,7) Im(12,13,14,15)]
+    tmp1 = _mm256_permute4x64_epi64(tmp1,0xd8); // swap the two middle 64-bit quarters -> Im(0..15) in sample order
+
+    *out_re = tmp0;
+    *out_im = tmp1;
+}
+
+void qam64_qam16_avx2(short *stream0_in,
+                      short *stream1_in,
+                      short *ch_mag,
+                      short *ch_mag_i,
+                      short *stream0_out,
+                      short *rho01,
+                      int length
+    )
+{
+
+  /*
+    Author: S. Wagner
+    Date: 31-07-12
+
+    Input:
+    stream0_in:  MF filter for 1st stream, i.e., y0=h0'*y
+    stream1_in:  MF filter for 2nd stream, i.e., y1=h1'*y
+    ch_mag:      4*h0/sqrt(42), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc
+    ch_mag_i:    4*h1/sqrt(42), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc
+    rho01:       Channel cross correlation, i.e., h1'*h0
+
+    Output:
+    stream0_out: output LLRs for 1st stream
+  */
+
+#if defined(__x86_64__) || defined(__i386__)
+
+  __m256i *rho01_256i      = (__m256i *)rho01;
+  __m256i *stream0_256i_in = (__m256i *)stream0_in;
+  __m256i *stream1_256i_in = (__m256i *)stream1_in;
+  __m256i *ch_mag_256i     = (__m256i *)ch_mag;
+  __m256i *ch_mag_256i_i   = (__m256i *)ch_mag_i;
+
+  __m256i ONE_OVER_SQRT_42 = _mm256_broadcastw_epi16(_mm_set1_epi16(10112)); // round(1/sqrt(42)*2^16)
+  __m256i THREE_OVER_SQRT_42 = _mm256_broadcastw_epi16(_mm_set1_epi16(30337)); // round(3/sqrt(42)*2^16)
+  __m256i FIVE_OVER_SQRT_42 = _mm256_broadcastw_epi16(_mm_set1_epi16(25281)); // round(5/sqrt(42)*2^15)
+  __m256i SEVEN_OVER_SQRT_42 = _mm256_broadcastw_epi16(_mm_set1_epi16(17697)); // round(7/sqrt(42)*2^14), Q2.14
+  __m256i FORTYNINE_OVER_FOUR_SQRT_42 = _mm256_broadcastw_epi16(_mm_set1_epi16(30969)); // round(49/(4*sqrt(42))*2^14), Q2.14
+  __m256i THIRTYSEVEN_OVER_FOUR_SQRT_42 = _mm256_broadcastw_epi16(_mm_set1_epi16(23385)); // round(37/(4*sqrt(42))*2^14), Q2.14
+  __m256i TWENTYFIVE_OVER_FOUR_SQRT_42 = _mm256_broadcastw_epi16(_mm_set1_epi16(31601)); // round(25/(4*sqrt(42))*2^15)
+  __m256i TWENTYNINE_OVER_FOUR_SQRT_42 = _mm256_broadcastw_epi16(_mm_set1_epi16(18329)); // round(29/(4*sqrt(42))*2^14), Q2.14
+  __m256i SEVENTEEN_OVER_FOUR_SQRT_42 = _mm256_broadcastw_epi16(_mm_set1_epi16(21489)); // round(17/(4*sqrt(42))*2^15)
+  __m256i NINE_OVER_FOUR_SQRT_42 = _mm256_broadcastw_epi16(_mm_set1_epi16(11376)); // round(9/(4*sqrt(42))*2^15)
+  __m256i THIRTEEN_OVER_FOUR_SQRT_42 = _mm256_broadcastw_epi16(_mm_set1_epi16(16433)); // round(13/(4*sqrt(42))*2^15)
+  __m256i FIVE_OVER_FOUR_SQRT_42 = _mm256_broadcastw_epi16(_mm_set1_epi16(6320)); // round(5/(4*sqrt(42))*2^15)
+  __m256i ONE_OVER_FOUR_SQRT_42 = _mm256_broadcastw_epi16(_mm_set1_epi16(1264)); // round(1/(4*sqrt(42))*2^15)
+  __m256i ONE_OVER_SQRT_10_Q15 = _mm256_broadcastw_epi16(_mm_set1_epi16(10362)); // round(1/sqrt(10)*2^15)
+  __m256i THREE_OVER_SQRT_10 = _mm256_broadcastw_epi16(_mm_set1_epi16(31086)); // round(3/sqrt(10)*2^15)
+  __m256i SQRT_10_OVER_FOUR = _mm256_broadcastw_epi16(_mm_set1_epi16(25905)); // round(sqrt(10)/4*2^15)
+
+
+  __m256i ch_mag_int;
+  __m256i ch_mag_des;
+  __m256i ch_mag_98_over_42_with_sigma2;
+  __m256i ch_mag_74_over_42_with_sigma2;
+  __m256i ch_mag_58_over_42_with_sigma2;
+  __m256i ch_mag_50_over_42_with_sigma2;
+  __m256i ch_mag_34_over_42_with_sigma2;
+  __m256i ch_mag_18_over_42_with_sigma2;
+  __m256i ch_mag_26_over_42_with_sigma2;
+  __m256i ch_mag_10_over_42_with_sigma2;
+  __m256i ch_mag_2_over_42_with_sigma2;
+  __m256i  y0r_one_over_sqrt_21;
+  __m256i  y0r_three_over_sqrt_21;
+  __m256i  y0r_five_over_sqrt_21;
+  __m256i  y0r_seven_over_sqrt_21;
+  __m256i  y0i_one_over_sqrt_21;
+  __m256i  y0i_three_over_sqrt_21;
+  __m256i  y0i_five_over_sqrt_21;
+  __m256i  y0i_seven_over_sqrt_21;
+
+#elif defined(__arm__)
+
+#endif
+  int i,j;
+  uint32_t len256 = (length)>>3;
+
+  for (i=0; i<len256; i+=2) {
+
+#if defined(__x86_64__) || defined(__i386__)
+    // Get rho
+      /*
+    xmm0 = rho01_128i[i];
+    xmm1 = rho01_128i[i+1];
+    xmm0 = _mm_shufflelo_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm0 = _mm_shufflehi_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm0 = _mm_shuffle_epi32(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm1 = _mm_shufflelo_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm1 = _mm_shufflehi_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm1 = _mm_shuffle_epi32(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    //xmm0 = [Re(0,1) Re(2,3) Im(0,1) Im(2,3)]
+    //xmm1 = [Re(4,5) Re(6,7) Im(4,5) Im(6,7)]
+    xmm2 = _mm_unpacklo_epi64(xmm0,xmm1); // Re(rho)
+    xmm3 = _mm_unpackhi_epi64(xmm0,xmm1); // Im(rho)
+      */
+    seperate_real_imag_parts(&xmm2, &xmm3, rho01_256i[i], rho01_256i[i+1]);
+
+    rho_rpi = _mm256_adds_epi16(xmm2,xmm3); // rho = Re(rho) + Im(rho)
+    rho_rmi = _mm256_subs_epi16(xmm2,xmm3); // rho* = Re(rho) - Im(rho)
+
+    // Compute the different rhos
+    rho_rpi_1_1 = _mm256_mulhi_epi16(rho_rpi, ONE_OVER_SQRT_42);
+    rho_rmi_1_1 = _mm256_mulhi_epi16(rho_rmi, ONE_OVER_SQRT_42);
+    rho_rpi_3_3 = _mm256_mulhi_epi16(rho_rpi, THREE_OVER_SQRT_42);
+    rho_rmi_3_3 = _mm256_mulhi_epi16(rho_rmi, THREE_OVER_SQRT_42);
+    rho_rpi_5_5 = _mm256_mulhi_epi16(rho_rpi, FIVE_OVER_SQRT_42);
+    rho_rmi_5_5 = _mm256_mulhi_epi16(rho_rmi, FIVE_OVER_SQRT_42);
+    rho_rpi_7_7 = _mm256_mulhi_epi16(rho_rpi, SEVEN_OVER_SQRT_42);
+    rho_rmi_7_7 = _mm256_mulhi_epi16(rho_rmi, SEVEN_OVER_SQRT_42);
+
+    rho_rpi_5_5 = _mm256_slli_epi16(rho_rpi_5_5, 1);
+    rho_rmi_5_5 = _mm256_slli_epi16(rho_rmi_5_5, 1);
+    rho_rpi_7_7 = _mm256_slli_epi16(rho_rpi_7_7, 2);
+    rho_rmi_7_7 = _mm256_slli_epi16(rho_rmi_7_7, 2);
+
+    xmm4 = _mm256_mulhi_epi16(xmm2, ONE_OVER_SQRT_42);
+    xmm5 = _mm256_mulhi_epi16(xmm3, ONE_OVER_SQRT_42);
+    xmm6 = _mm256_mulhi_epi16(xmm3, THREE_OVER_SQRT_42);
+    xmm7 = _mm256_mulhi_epi16(xmm3, FIVE_OVER_SQRT_42);
+    xmm8 = _mm256_mulhi_epi16(xmm3, SEVEN_OVER_SQRT_42);
+    xmm7 = _mm256_slli_epi16(xmm7, 1);
+    xmm8 = _mm256_slli_epi16(xmm8, 2);
+
+    rho_rpi_1_3 = _mm256_adds_epi16(xmm4, xmm6);
+    rho_rmi_1_3 = _mm256_subs_epi16(xmm4, xmm6);
+    rho_rpi_1_5 = _mm256_adds_epi16(xmm4, xmm7);
+    rho_rmi_1_5 = _mm256_subs_epi16(xmm4, xmm7);
+    rho_rpi_1_7 = _mm256_adds_epi16(xmm4, xmm8);
+    rho_rmi_1_7 = _mm256_subs_epi16(xmm4, xmm8);
+
+    xmm4 = _mm256_mulhi_epi16(xmm2, THREE_OVER_SQRT_42);
+    rho_rpi_3_1 = _mm256_adds_epi16(xmm4, xmm5);
+    rho_rmi_3_1 = _mm256_subs_epi16(xmm4, xmm5);
+    rho_rpi_3_5 = _mm256_adds_epi16(xmm4, xmm7);
+    rho_rmi_3_5 = _mm256_subs_epi16(xmm4, xmm7);
+    rho_rpi_3_7 = _mm256_adds_epi16(xmm4, xmm8);
+    rho_rmi_3_7 = _mm256_subs_epi16(xmm4, xmm8);
+
+    xmm4 = _mm256_mulhi_epi16(xmm2, FIVE_OVER_SQRT_42);
+    xmm4 = _mm256_slli_epi16(xmm4, 1);
+    rho_rpi_5_1 = _mm256_adds_epi16(xmm4, xmm5);
+    rho_rmi_5_1 = _mm256_subs_epi16(xmm4, xmm5);
+    rho_rpi_5_3 = _mm256_adds_epi16(xmm4, xmm6);
+    rho_rmi_5_3 = _mm256_subs_epi16(xmm4, xmm6);
+    rho_rpi_5_7 = _mm256_adds_epi16(xmm4, xmm8);
+    rho_rmi_5_7 = _mm256_subs_epi16(xmm4, xmm8);
+
+    xmm4 = _mm256_mulhi_epi16(xmm2, SEVEN_OVER_SQRT_42);
+    xmm4 = _mm256_slli_epi16(xmm4, 2);
+    rho_rpi_7_1 = _mm256_adds_epi16(xmm4, xmm5);
+    rho_rmi_7_1 = _mm256_subs_epi16(xmm4, xmm5);
+    rho_rpi_7_3 = _mm256_adds_epi16(xmm4, xmm6);
+    rho_rmi_7_3 = _mm256_subs_epi16(xmm4, xmm6);
+    rho_rpi_7_5 = _mm256_adds_epi16(xmm4, xmm7);
+    rho_rmi_7_5 = _mm256_subs_epi16(xmm4, xmm7);
+
+    // Rearrange interfering MF output
+    /*
+    xmm0 = stream1_128i_in[i];
+    xmm1 = stream1_128i_in[i+1];
+    xmm0 = _mm256_shufflelo_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm0 = _mm256_shufflehi_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm0 = _mm256_shuffle_epi32(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm1 = _mm256_shufflelo_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm1 = _mm256_shufflehi_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm1 = _mm256_shuffle_epi32(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    //xmm0 = [Re(0,1) Re(2,3) Im(0,1) Im(2,3)]
+    //xmm1 = [Re(4,5) Re(6,7) Im(4,5) Im(6,7)]
+    y1r = _mm256_unpacklo_epi64(xmm0,xmm1); //[y1r(1),y1r(2),y1r(3),y1r(4)]
+    y1i = _mm256_unpackhi_epi64(xmm0,xmm1); //[y1i(1),y1i(2),y1i(3),y1i(4)]
+    */
+
+    seperate_real_imag_parts(&y1r, &y1i, stream1_256i_in[i], stream1_256i_in[i+1]);
+
+    // Psi_r calculation from rho_rpi or rho_rmi
+    xmm0 = _mm256_broadcastw_epi16(_mm_set1_epi16(0));// ZERO for abs_pi16
+    xmm2 = _mm256_subs_epi16(rho_rpi_7_7, y1r);
+    psi_r_p7_p7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_7_5, y1r);
+    psi_r_p7_p5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_7_3, y1r);
+    psi_r_p7_p3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_7_1, y1r);
+    psi_r_p7_p1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_7_1, y1r);
+    psi_r_p7_m1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_7_3, y1r);
+    psi_r_p7_m3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_7_5, y1r);
+    psi_r_p7_m5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_7_7, y1r);
+    psi_r_p7_m7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_5_7, y1r);
+    psi_r_p5_p7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_5_5, y1r);
+    psi_r_p5_p5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_5_3, y1r);
+    psi_r_p5_p3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_5_1, y1r);
+    psi_r_p5_p1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_5_1, y1r);
+    psi_r_p5_m1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_5_3, y1r);
+    psi_r_p5_m3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_5_5, y1r);
+    psi_r_p5_m5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_5_7, y1r);
+    psi_r_p5_m7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_3_7, y1r);
+    psi_r_p3_p7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_3_5, y1r);
+    psi_r_p3_p5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_3_3, y1r);
+    psi_r_p3_p3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_3_1, y1r);
+    psi_r_p3_p1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_3_1, y1r);
+    psi_r_p3_m1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_3_3, y1r);
+    psi_r_p3_m3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_3_5, y1r);
+    psi_r_p3_m5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_3_7, y1r);
+    psi_r_p3_m7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_1_7, y1r);
+    psi_r_p1_p7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_1_5, y1r);
+    psi_r_p1_p5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_1_3, y1r);
+    psi_r_p1_p3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_1_1, y1r);
+    psi_r_p1_p1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_1_1, y1r);
+    psi_r_p1_m1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_1_3, y1r);
+    psi_r_p1_m3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_1_5, y1r);
+    psi_r_p1_m5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_1_7, y1r);
+    psi_r_p1_m7 = _mm256_abs_epi16(xmm2);
+
+    xmm2 = _mm256_adds_epi16(rho_rmi_1_7, y1r);
+    psi_r_m1_p7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_1_5, y1r);
+    psi_r_m1_p5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_1_3, y1r);
+    psi_r_m1_p3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_1_1, y1r);
+    psi_r_m1_p1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_1_1, y1r);
+    psi_r_m1_m1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_1_3, y1r);
+    psi_r_m1_m3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_1_5, y1r);
+    psi_r_m1_m5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_1_7, y1r);
+    psi_r_m1_m7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_3_7, y1r);
+    psi_r_m3_p7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_3_5, y1r);
+    psi_r_m3_p5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_3_3, y1r);
+    psi_r_m3_p3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_3_1, y1r);
+    psi_r_m3_p1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_3_1, y1r);
+    psi_r_m3_m1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_3_3, y1r);
+    psi_r_m3_m3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_3_5, y1r);
+    psi_r_m3_m5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_3_7, y1r);
+    psi_r_m3_m7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_5_7, y1r);
+    psi_r_m5_p7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_5_5, y1r);
+    psi_r_m5_p5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_5_3, y1r);
+    psi_r_m5_p3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_5_1, y1r);
+    psi_r_m5_p1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_5_1, y1r);
+    psi_r_m5_m1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_5_3, y1r);
+    psi_r_m5_m3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_5_5, y1r);
+    psi_r_m5_m5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_5_7, y1r);
+    psi_r_m5_m7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_7_7, y1r);
+    psi_r_m7_p7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_7_5, y1r);
+    psi_r_m7_p5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_7_3, y1r);
+    psi_r_m7_p3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_7_1, y1r);
+    psi_r_m7_p1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_7_1, y1r);
+    psi_r_m7_m1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_7_3, y1r);
+    psi_r_m7_m3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_7_5, y1r);
+    psi_r_m7_m5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_7_7, y1r);
+    psi_r_m7_m7 = _mm256_abs_epi16(xmm2);
+
+    // Psi_i calculation from rho_rpi or rho_rmi
+    xmm2 = _mm256_subs_epi16(rho_rmi_7_7, y1i);
+    psi_i_p7_p7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_5_7, y1i);
+    psi_i_p7_p5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_3_7, y1i);
+    psi_i_p7_p3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_1_7, y1i);
+    psi_i_p7_p1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_1_7, y1i);
+    psi_i_p7_m1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_3_7, y1i);
+    psi_i_p7_m3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_5_7, y1i);
+    psi_i_p7_m5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_7_7, y1i);
+    psi_i_p7_m7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_7_5, y1i);
+    psi_i_p5_p7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_5_5, y1i);
+    psi_i_p5_p5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_3_5, y1i);
+    psi_i_p5_p3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_1_5, y1i);
+    psi_i_p5_p1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_1_5, y1i);
+    psi_i_p5_m1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_3_5, y1i);
+    psi_i_p5_m3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_5_5, y1i);
+    psi_i_p5_m5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_7_5, y1i);
+    psi_i_p5_m7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_7_3, y1i);
+    psi_i_p3_p7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_5_3, y1i);
+    psi_i_p3_p5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_3_3, y1i);
+    psi_i_p3_p3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_1_3, y1i);
+    psi_i_p3_p1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_1_3, y1i);
+    psi_i_p3_m1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_3_3, y1i);
+    psi_i_p3_m3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_5_3, y1i);
+    psi_i_p3_m5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_7_3, y1i);
+    psi_i_p3_m7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_7_1, y1i);
+    psi_i_p1_p7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_5_1, y1i);
+    psi_i_p1_p5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_3_1, y1i);
+    psi_i_p1_p3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_1_1, y1i);
+    psi_i_p1_p1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_1_1, y1i);
+    psi_i_p1_m1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_3_1, y1i);
+    psi_i_p1_m3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_5_1, y1i);
+    psi_i_p1_m5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_7_1, y1i);
+    psi_i_p1_m7 = _mm256_abs_epi16(xmm2);
+
+    xmm2 = _mm256_subs_epi16(rho_rpi_7_1, y1i);
+    psi_i_m1_p7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_5_1, y1i);
+    psi_i_m1_p5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_3_1, y1i);
+    psi_i_m1_p3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_1_1, y1i);
+    psi_i_m1_p1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_1_1, y1i);
+    psi_i_m1_m1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_3_1, y1i);
+    psi_i_m1_m3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_5_1, y1i);
+    psi_i_m1_m5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_7_1, y1i);
+    psi_i_m1_m7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_7_3, y1i);
+    psi_i_m3_p7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_5_3, y1i);
+    psi_i_m3_p5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_3_3, y1i);
+    psi_i_m3_p3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_1_3, y1i);
+    psi_i_m3_p1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_1_3, y1i);
+    psi_i_m3_m1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_3_3, y1i);
+    psi_i_m3_m3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_5_3, y1i);
+    psi_i_m3_m5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_7_3, y1i);
+    psi_i_m3_m7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_7_5, y1i);
+    psi_i_m5_p7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_5_5, y1i);
+    psi_i_m5_p5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_3_5, y1i);
+    psi_i_m5_p3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_1_5, y1i);
+    psi_i_m5_p1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_1_5, y1i);
+    psi_i_m5_m1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_3_5, y1i);
+    psi_i_m5_m3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_5_5, y1i);
+    psi_i_m5_m5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_7_5, y1i);
+    psi_i_m5_m7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_7_7, y1i);
+    psi_i_m7_p7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_5_7, y1i);
+    psi_i_m7_p5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_3_7, y1i);
+    psi_i_m7_p3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_1_7, y1i);
+    psi_i_m7_p1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_1_7, y1i);
+    psi_i_m7_m1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_3_7, y1i);
+    psi_i_m7_m3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_5_7, y1i);
+    psi_i_m7_m5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_7_7, y1i);
+    psi_i_m7_m7 = _mm256_abs_epi16(xmm2);
+
+/*
+    // Rearrange desired MF output
+    xmm0 = stream0_128i_in[i];
+    xmm1 = stream0_128i_in[i+1];
+    xmm0 = _mm256_shufflelo_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm0 = _mm256_shufflehi_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm0 = _mm256_shuffle_epi32(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm1 = _mm256_shufflelo_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm1 = _mm256_shufflehi_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm1 = _mm256_shuffle_epi32(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    //xmm0 = [Re(0,1) Re(2,3) Im(0,1) Im(2,3)]
+    //xmm1 = [Re(4,5) Re(6,7) Im(4,5) Im(6,7)]
+    y0r = _mm256_unpacklo_epi64(xmm0,xmm1); // = [y0r(1),y0r(2),y0r(3),y0r(4)]
+    y0i = _mm256_unpackhi_epi64(xmm0,xmm1);
+*/
+    seperate_real_imag_parts(&y0r, &y0i, stream0_256i_in[i], stream0_256i_in[i+1]);
+
+    /*
+    // Rearrange desired channel magnitudes
+    xmm2 = ch_mag_128i[i]; // = [|h|^2(1),|h|^2(1),|h|^2(2),|h|^2(2)]*(2/sqrt(10))
+    xmm3 = ch_mag_128i[i+1]; // = [|h|^2(3),|h|^2(3),|h|^2(4),|h|^2(4)]*(2/sqrt(10))
+    xmm2 = _mm256_shufflelo_epi16(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm2 = _mm256_shufflehi_epi16(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm2 = _mm256_shuffle_epi32(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm3 = _mm256_shufflelo_epi16(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm3 = _mm256_shufflehi_epi16(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm3 = _mm256_shuffle_epi32(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    ch_mag_des = _mm256_unpacklo_epi64(xmm2,xmm3);
+    */
+
+    seperate_real_imag_parts(&ch_mag_des, &xmm2, ch_mag_256i[i], ch_mag_256i[i+1]);
+
+    // Rearrange interfering channel magnitudes
+    /*
+    xmm2 = ch_mag_128i_i[i];
+    xmm3 = ch_mag_128i_i[i+1];
+    xmm2 = _mm256_shufflelo_epi16(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm2 = _mm256_shufflehi_epi16(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm2 = _mm256_shuffle_epi32(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm3 = _mm256_shufflelo_epi16(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm3 = _mm256_shufflehi_epi16(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm3 = _mm256_shuffle_epi32(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    ch_mag_int  = _mm256_unpacklo_epi64(xmm2,xmm3);
+    */
+
+    seperate_real_imag_parts(&ch_mag_int, &xmm2, ch_mag_256i_i[i], ch_mag_256i_i[i+1]);
+
+    y0r_one_over_sqrt_21   = _mm256_mulhi_epi16(y0r, ONE_OVER_SQRT_42);
+    y0r_three_over_sqrt_21 = _mm256_mulhi_epi16(y0r, THREE_OVER_SQRT_42);
+    y0r_five_over_sqrt_21  = _mm256_mulhi_epi16(y0r, FIVE_OVER_SQRT_42);
+    y0r_five_over_sqrt_21  = _mm256_slli_epi16(y0r_five_over_sqrt_21, 1);
+    y0r_seven_over_sqrt_21 = _mm256_mulhi_epi16(y0r, SEVEN_OVER_SQRT_42);
+    y0r_seven_over_sqrt_21 = _mm256_slli_epi16(y0r_seven_over_sqrt_21, 2); // Q2.14
+
+    y0i_one_over_sqrt_21   = _mm256_mulhi_epi16(y0i, ONE_OVER_SQRT_42);
+    y0i_three_over_sqrt_21 = _mm256_mulhi_epi16(y0i, THREE_OVER_SQRT_42);
+    y0i_five_over_sqrt_21  = _mm256_mulhi_epi16(y0i, FIVE_OVER_SQRT_42);
+    y0i_five_over_sqrt_21  = _mm256_slli_epi16(y0i_five_over_sqrt_21, 1);
+    y0i_seven_over_sqrt_21 = _mm256_mulhi_epi16(y0i, SEVEN_OVER_SQRT_42);
+    y0i_seven_over_sqrt_21 = _mm256_slli_epi16(y0i_seven_over_sqrt_21, 2); // Q2.14
+
+    y0_p_7_1 = _mm256_adds_epi16(y0r_seven_over_sqrt_21, y0i_one_over_sqrt_21);
+    y0_p_7_3 = _mm256_adds_epi16(y0r_seven_over_sqrt_21, y0i_three_over_sqrt_21);
+    y0_p_7_5 = _mm256_adds_epi16(y0r_seven_over_sqrt_21, y0i_five_over_sqrt_21);
+    y0_p_7_7 = _mm256_adds_epi16(y0r_seven_over_sqrt_21, y0i_seven_over_sqrt_21);
+    y0_p_5_1 = _mm256_adds_epi16(y0r_five_over_sqrt_21, y0i_one_over_sqrt_21);
+    y0_p_5_3 = _mm256_adds_epi16(y0r_five_over_sqrt_21, y0i_three_over_sqrt_21);
+    y0_p_5_5 = _mm256_adds_epi16(y0r_five_over_sqrt_21, y0i_five_over_sqrt_21);
+    y0_p_5_7 = _mm256_adds_epi16(y0r_five_over_sqrt_21, y0i_seven_over_sqrt_21);
+    y0_p_3_1 = _mm256_adds_epi16(y0r_three_over_sqrt_21, y0i_one_over_sqrt_21);
+    y0_p_3_3 = _mm256_adds_epi16(y0r_three_over_sqrt_21, y0i_three_over_sqrt_21);
+    y0_p_3_5 = _mm256_adds_epi16(y0r_three_over_sqrt_21, y0i_five_over_sqrt_21);
+    y0_p_3_7 = _mm256_adds_epi16(y0r_three_over_sqrt_21, y0i_seven_over_sqrt_21);
+    y0_p_1_1 = _mm256_adds_epi16(y0r_one_over_sqrt_21, y0i_one_over_sqrt_21);
+    y0_p_1_3 = _mm256_adds_epi16(y0r_one_over_sqrt_21, y0i_three_over_sqrt_21);
+    y0_p_1_5 = _mm256_adds_epi16(y0r_one_over_sqrt_21, y0i_five_over_sqrt_21);
+    y0_p_1_7 = _mm256_adds_epi16(y0r_one_over_sqrt_21, y0i_seven_over_sqrt_21);
+
+    y0_m_1_1 = _mm256_subs_epi16(y0r_one_over_sqrt_21, y0i_one_over_sqrt_21);
+    y0_m_1_3 = _mm256_subs_epi16(y0r_one_over_sqrt_21, y0i_three_over_sqrt_21);
+    y0_m_1_5 = _mm256_subs_epi16(y0r_one_over_sqrt_21, y0i_five_over_sqrt_21);
+    y0_m_1_7 = _mm256_subs_epi16(y0r_one_over_sqrt_21, y0i_seven_over_sqrt_21);
+    y0_m_3_1 = _mm256_subs_epi16(y0r_three_over_sqrt_21, y0i_one_over_sqrt_21);
+    y0_m_3_3 = _mm256_subs_epi16(y0r_three_over_sqrt_21, y0i_three_over_sqrt_21);
+    y0_m_3_5 = _mm256_subs_epi16(y0r_three_over_sqrt_21, y0i_five_over_sqrt_21);
+    y0_m_3_7 = _mm256_subs_epi16(y0r_three_over_sqrt_21, y0i_seven_over_sqrt_21);
+    y0_m_5_1 = _mm256_subs_epi16(y0r_five_over_sqrt_21, y0i_one_over_sqrt_21);
+    y0_m_5_3 = _mm256_subs_epi16(y0r_five_over_sqrt_21, y0i_three_over_sqrt_21);
+    y0_m_5_5 = _mm256_subs_epi16(y0r_five_over_sqrt_21, y0i_five_over_sqrt_21);
+    y0_m_5_7 = _mm256_subs_epi16(y0r_five_over_sqrt_21, y0i_seven_over_sqrt_21);
+    y0_m_7_1 = _mm256_subs_epi16(y0r_seven_over_sqrt_21, y0i_one_over_sqrt_21);
+    y0_m_7_3 = _mm256_subs_epi16(y0r_seven_over_sqrt_21, y0i_three_over_sqrt_21);
+    y0_m_7_5 = _mm256_subs_epi16(y0r_seven_over_sqrt_21, y0i_five_over_sqrt_21);
+    y0_m_7_7 = _mm256_subs_epi16(y0r_seven_over_sqrt_21, y0i_seven_over_sqrt_21);
+
+    interference_abs_epi16(psi_r_p7_p7, ch_mag_int, a_r_p7_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_p7_p5, ch_mag_int, a_r_p7_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_p7_p3, ch_mag_int, a_r_p7_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_p7_p1, ch_mag_int, a_r_p7_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_p7_m1, ch_mag_int, a_r_p7_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_p7_m3, ch_mag_int, a_r_p7_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_p7_m5, ch_mag_int, a_r_p7_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_p7_m7, ch_mag_int, a_r_p7_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_p5_p7, ch_mag_int, a_r_p5_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_p5_p5, ch_mag_int, a_r_p5_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_p5_p3, ch_mag_int, a_r_p5_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_p5_p1, ch_mag_int, a_r_p5_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_p5_m1, ch_mag_int, a_r_p5_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_p5_m3, ch_mag_int, a_r_p5_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_p5_m5, ch_mag_int, a_r_p5_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_p5_m7, ch_mag_int, a_r_p5_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_p3_p7, ch_mag_int, a_r_p3_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_p3_p5, ch_mag_int, a_r_p3_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_p3_p3, ch_mag_int, a_r_p3_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_p3_p1, ch_mag_int, a_r_p3_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_p3_m1, ch_mag_int, a_r_p3_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_p3_m3, ch_mag_int, a_r_p3_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_p3_m5, ch_mag_int, a_r_p3_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_p3_m7, ch_mag_int, a_r_p3_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_p1_p7, ch_mag_int, a_r_p1_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_p1_p5, ch_mag_int, a_r_p1_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_p1_p3, ch_mag_int, a_r_p1_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_p1_p1, ch_mag_int, a_r_p1_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_p1_m1, ch_mag_int, a_r_p1_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_p1_m3, ch_mag_int, a_r_p1_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_p1_m5, ch_mag_int, a_r_p1_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_p1_m7, ch_mag_int, a_r_p1_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_m1_p7, ch_mag_int, a_r_m1_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_m1_p5, ch_mag_int, a_r_m1_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_m1_p3, ch_mag_int, a_r_m1_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_m1_p1, ch_mag_int, a_r_m1_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_m1_m1, ch_mag_int, a_r_m1_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_m1_m3, ch_mag_int, a_r_m1_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_m1_m5, ch_mag_int, a_r_m1_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_m1_m7, ch_mag_int, a_r_m1_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_m3_p7, ch_mag_int, a_r_m3_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_m3_p5, ch_mag_int, a_r_m3_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_m3_p3, ch_mag_int, a_r_m3_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_m3_p1, ch_mag_int, a_r_m3_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_m3_m1, ch_mag_int, a_r_m3_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_m3_m3, ch_mag_int, a_r_m3_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_m3_m5, ch_mag_int, a_r_m3_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_m3_m7, ch_mag_int, a_r_m3_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_m5_p7, ch_mag_int, a_r_m5_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_m5_p5, ch_mag_int, a_r_m5_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_m5_p3, ch_mag_int, a_r_m5_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_m5_p1, ch_mag_int, a_r_m5_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_m5_m1, ch_mag_int, a_r_m5_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_m5_m3, ch_mag_int, a_r_m5_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_m5_m5, ch_mag_int, a_r_m5_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_m5_m7, ch_mag_int, a_r_m5_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_m7_p7, ch_mag_int, a_r_m7_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_m7_p5, ch_mag_int, a_r_m7_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_m7_p3, ch_mag_int, a_r_m7_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_m7_p1, ch_mag_int, a_r_m7_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_m7_m1, ch_mag_int, a_r_m7_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_m7_m3, ch_mag_int, a_r_m7_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_m7_m5, ch_mag_int, a_r_m7_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_r_m7_m7, ch_mag_int, a_r_m7_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+
+    interference_abs_epi16(psi_i_p7_p7, ch_mag_int, a_i_p7_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_p7_p5, ch_mag_int, a_i_p7_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_p7_p3, ch_mag_int, a_i_p7_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_p7_p1, ch_mag_int, a_i_p7_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_p7_m1, ch_mag_int, a_i_p7_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_p7_m3, ch_mag_int, a_i_p7_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_p7_m5, ch_mag_int, a_i_p7_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_p7_m7, ch_mag_int, a_i_p7_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_p5_p7, ch_mag_int, a_i_p5_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_p5_p5, ch_mag_int, a_i_p5_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_p5_p3, ch_mag_int, a_i_p5_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_p5_p1, ch_mag_int, a_i_p5_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_p5_m1, ch_mag_int, a_i_p5_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_p5_m3, ch_mag_int, a_i_p5_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_p5_m5, ch_mag_int, a_i_p5_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_p5_m7, ch_mag_int, a_i_p5_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_p3_p7, ch_mag_int, a_i_p3_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_p3_p5, ch_mag_int, a_i_p3_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_p3_p3, ch_mag_int, a_i_p3_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_p3_p1, ch_mag_int, a_i_p3_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_p3_m1, ch_mag_int, a_i_p3_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_p3_m3, ch_mag_int, a_i_p3_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_p3_m5, ch_mag_int, a_i_p3_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_p3_m7, ch_mag_int, a_i_p3_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_p1_p7, ch_mag_int, a_i_p1_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_p1_p5, ch_mag_int, a_i_p1_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_p1_p3, ch_mag_int, a_i_p1_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_p1_p1, ch_mag_int, a_i_p1_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_p1_m1, ch_mag_int, a_i_p1_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_p1_m3, ch_mag_int, a_i_p1_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_p1_m5, ch_mag_int, a_i_p1_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_p1_m7, ch_mag_int, a_i_p1_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_m1_p7, ch_mag_int, a_i_m1_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_m1_p5, ch_mag_int, a_i_m1_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_m1_p3, ch_mag_int, a_i_m1_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_m1_p1, ch_mag_int, a_i_m1_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_m1_m1, ch_mag_int, a_i_m1_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_m1_m3, ch_mag_int, a_i_m1_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_m1_m5, ch_mag_int, a_i_m1_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_m1_m7, ch_mag_int, a_i_m1_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_m3_p7, ch_mag_int, a_i_m3_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_m3_p5, ch_mag_int, a_i_m3_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_m3_p3, ch_mag_int, a_i_m3_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_m3_p1, ch_mag_int, a_i_m3_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_m3_m1, ch_mag_int, a_i_m3_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_m3_m3, ch_mag_int, a_i_m3_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_m3_m5, ch_mag_int, a_i_m3_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_m3_m7, ch_mag_int, a_i_m3_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_m5_p7, ch_mag_int, a_i_m5_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_m5_p5, ch_mag_int, a_i_m5_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_m5_p3, ch_mag_int, a_i_m5_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_m5_p1, ch_mag_int, a_i_m5_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_m5_m1, ch_mag_int, a_i_m5_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_m5_m3, ch_mag_int, a_i_m5_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_m5_m5, ch_mag_int, a_i_m5_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_m5_m7, ch_mag_int, a_i_m5_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_m7_p7, ch_mag_int, a_i_m7_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_m7_p5, ch_mag_int, a_i_m7_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_m7_p3, ch_mag_int, a_i_m7_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_m7_p1, ch_mag_int, a_i_m7_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_m7_m1, ch_mag_int, a_i_m7_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_m7_m3, ch_mag_int, a_i_m7_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_m7_m5, ch_mag_int, a_i_m7_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+    interference_abs_epi16(psi_i_m7_m7, ch_mag_int, a_i_m7_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
+
+    // Calculation of a group of two terms in the bit metric involving product of psi and interference
+    prodsum_psi_a_epi16(psi_r_p7_p7, a_r_p7_p7, psi_i_p7_p7, a_i_p7_p7, psi_a_p7_p7);
+    prodsum_psi_a_epi16(psi_r_p7_p5, a_r_p7_p5, psi_i_p7_p5, a_i_p7_p5, psi_a_p7_p5);
+    prodsum_psi_a_epi16(psi_r_p7_p3, a_r_p7_p3, psi_i_p7_p3, a_i_p7_p3, psi_a_p7_p3);
+    prodsum_psi_a_epi16(psi_r_p7_p1, a_r_p7_p1, psi_i_p7_p1, a_i_p7_p1, psi_a_p7_p1);
+    prodsum_psi_a_epi16(psi_r_p7_m1, a_r_p7_m1, psi_i_p7_m1, a_i_p7_m1, psi_a_p7_m1);
+    prodsum_psi_a_epi16(psi_r_p7_m3, a_r_p7_m3, psi_i_p7_m3, a_i_p7_m3, psi_a_p7_m3);
+    prodsum_psi_a_epi16(psi_r_p7_m5, a_r_p7_m5, psi_i_p7_m5, a_i_p7_m5, psi_a_p7_m5);
+    prodsum_psi_a_epi16(psi_r_p7_m7, a_r_p7_m7, psi_i_p7_m7, a_i_p7_m7, psi_a_p7_m7);
+    prodsum_psi_a_epi16(psi_r_p5_p7, a_r_p5_p7, psi_i_p5_p7, a_i_p5_p7, psi_a_p5_p7);
+    prodsum_psi_a_epi16(psi_r_p5_p5, a_r_p5_p5, psi_i_p5_p5, a_i_p5_p5, psi_a_p5_p5);
+    prodsum_psi_a_epi16(psi_r_p5_p3, a_r_p5_p3, psi_i_p5_p3, a_i_p5_p3, psi_a_p5_p3);
+    prodsum_psi_a_epi16(psi_r_p5_p1, a_r_p5_p1, psi_i_p5_p1, a_i_p5_p1, psi_a_p5_p1);
+    prodsum_psi_a_epi16(psi_r_p5_m1, a_r_p5_m1, psi_i_p5_m1, a_i_p5_m1, psi_a_p5_m1);
+    prodsum_psi_a_epi16(psi_r_p5_m3, a_r_p5_m3, psi_i_p5_m3, a_i_p5_m3, psi_a_p5_m3);
+    prodsum_psi_a_epi16(psi_r_p5_m5, a_r_p5_m5, psi_i_p5_m5, a_i_p5_m5, psi_a_p5_m5);
+    prodsum_psi_a_epi16(psi_r_p5_m7, a_r_p5_m7, psi_i_p5_m7, a_i_p5_m7, psi_a_p5_m7);
+    prodsum_psi_a_epi16(psi_r_p3_p7, a_r_p3_p7, psi_i_p3_p7, a_i_p3_p7, psi_a_p3_p7);
+    prodsum_psi_a_epi16(psi_r_p3_p5, a_r_p3_p5, psi_i_p3_p5, a_i_p3_p5, psi_a_p3_p5);
+    prodsum_psi_a_epi16(psi_r_p3_p3, a_r_p3_p3, psi_i_p3_p3, a_i_p3_p3, psi_a_p3_p3);
+    prodsum_psi_a_epi16(psi_r_p3_p1, a_r_p3_p1, psi_i_p3_p1, a_i_p3_p1, psi_a_p3_p1);
+    prodsum_psi_a_epi16(psi_r_p3_m1, a_r_p3_m1, psi_i_p3_m1, a_i_p3_m1, psi_a_p3_m1);
+    prodsum_psi_a_epi16(psi_r_p3_m3, a_r_p3_m3, psi_i_p3_m3, a_i_p3_m3, psi_a_p3_m3);
+    prodsum_psi_a_epi16(psi_r_p3_m5, a_r_p3_m5, psi_i_p3_m5, a_i_p3_m5, psi_a_p3_m5);
+    prodsum_psi_a_epi16(psi_r_p3_m7, a_r_p3_m7, psi_i_p3_m7, a_i_p3_m7, psi_a_p3_m7);
+    prodsum_psi_a_epi16(psi_r_p1_p7, a_r_p1_p7, psi_i_p1_p7, a_i_p1_p7, psi_a_p1_p7);
+    prodsum_psi_a_epi16(psi_r_p1_p5, a_r_p1_p5, psi_i_p1_p5, a_i_p1_p5, psi_a_p1_p5);
+    prodsum_psi_a_epi16(psi_r_p1_p3, a_r_p1_p3, psi_i_p1_p3, a_i_p1_p3, psi_a_p1_p3);
+    prodsum_psi_a_epi16(psi_r_p1_p1, a_r_p1_p1, psi_i_p1_p1, a_i_p1_p1, psi_a_p1_p1);
+    prodsum_psi_a_epi16(psi_r_p1_m1, a_r_p1_m1, psi_i_p1_m1, a_i_p1_m1, psi_a_p1_m1);
+    prodsum_psi_a_epi16(psi_r_p1_m3, a_r_p1_m3, psi_i_p1_m3, a_i_p1_m3, psi_a_p1_m3);
+    prodsum_psi_a_epi16(psi_r_p1_m5, a_r_p1_m5, psi_i_p1_m5, a_i_p1_m5, psi_a_p1_m5);
+    prodsum_psi_a_epi16(psi_r_p1_m7, a_r_p1_m7, psi_i_p1_m7, a_i_p1_m7, psi_a_p1_m7);
+    prodsum_psi_a_epi16(psi_r_m1_p7, a_r_m1_p7, psi_i_m1_p7, a_i_m1_p7, psi_a_m1_p7);
+    prodsum_psi_a_epi16(psi_r_m1_p5, a_r_m1_p5, psi_i_m1_p5, a_i_m1_p5, psi_a_m1_p5);
+    prodsum_psi_a_epi16(psi_r_m1_p3, a_r_m1_p3, psi_i_m1_p3, a_i_m1_p3, psi_a_m1_p3);
+    prodsum_psi_a_epi16(psi_r_m1_p1, a_r_m1_p1, psi_i_m1_p1, a_i_m1_p1, psi_a_m1_p1);
+    prodsum_psi_a_epi16(psi_r_m1_m1, a_r_m1_m1, psi_i_m1_m1, a_i_m1_m1, psi_a_m1_m1);
+    prodsum_psi_a_epi16(psi_r_m1_m3, a_r_m1_m3, psi_i_m1_m3, a_i_m1_m3, psi_a_m1_m3);
+    prodsum_psi_a_epi16(psi_r_m1_m5, a_r_m1_m5, psi_i_m1_m5, a_i_m1_m5, psi_a_m1_m5);
+    prodsum_psi_a_epi16(psi_r_m1_m7, a_r_m1_m7, psi_i_m1_m7, a_i_m1_m7, psi_a_m1_m7);
+    prodsum_psi_a_epi16(psi_r_m3_p7, a_r_m3_p7, psi_i_m3_p7, a_i_m3_p7, psi_a_m3_p7);
+    prodsum_psi_a_epi16(psi_r_m3_p5, a_r_m3_p5, psi_i_m3_p5, a_i_m3_p5, psi_a_m3_p5);
+    prodsum_psi_a_epi16(psi_r_m3_p3, a_r_m3_p3, psi_i_m3_p3, a_i_m3_p3, psi_a_m3_p3);
+    prodsum_psi_a_epi16(psi_r_m3_p1, a_r_m3_p1, psi_i_m3_p1, a_i_m3_p1, psi_a_m3_p1);
+    prodsum_psi_a_epi16(psi_r_m3_m1, a_r_m3_m1, psi_i_m3_m1, a_i_m3_m1, psi_a_m3_m1);
+    prodsum_psi_a_epi16(psi_r_m3_m3, a_r_m3_m3, psi_i_m3_m3, a_i_m3_m3, psi_a_m3_m3);
+    prodsum_psi_a_epi16(psi_r_m3_m5, a_r_m3_m5, psi_i_m3_m5, a_i_m3_m5, psi_a_m3_m5);
+    prodsum_psi_a_epi16(psi_r_m3_m7, a_r_m3_m7, psi_i_m3_m7, a_i_m3_m7, psi_a_m3_m7);
+    prodsum_psi_a_epi16(psi_r_m5_p7, a_r_m5_p7, psi_i_m5_p7, a_i_m5_p7, psi_a_m5_p7);
+    prodsum_psi_a_epi16(psi_r_m5_p5, a_r_m5_p5, psi_i_m5_p5, a_i_m5_p5, psi_a_m5_p5);
+    prodsum_psi_a_epi16(psi_r_m5_p3, a_r_m5_p3, psi_i_m5_p3, a_i_m5_p3, psi_a_m5_p3);
+    prodsum_psi_a_epi16(psi_r_m5_p1, a_r_m5_p1, psi_i_m5_p1, a_i_m5_p1, psi_a_m5_p1);
+    prodsum_psi_a_epi16(psi_r_m5_m1, a_r_m5_m1, psi_i_m5_m1, a_i_m5_m1, psi_a_m5_m1);
+    prodsum_psi_a_epi16(psi_r_m5_m3, a_r_m5_m3, psi_i_m5_m3, a_i_m5_m3, psi_a_m5_m3);
+    prodsum_psi_a_epi16(psi_r_m5_m5, a_r_m5_m5, psi_i_m5_m5, a_i_m5_m5, psi_a_m5_m5);
+    prodsum_psi_a_epi16(psi_r_m5_m7, a_r_m5_m7, psi_i_m5_m7, a_i_m5_m7, psi_a_m5_m7);
+    prodsum_psi_a_epi16(psi_r_m7_p7, a_r_m7_p7, psi_i_m7_p7, a_i_m7_p7, psi_a_m7_p7);
+    prodsum_psi_a_epi16(psi_r_m7_p5, a_r_m7_p5, psi_i_m7_p5, a_i_m7_p5, psi_a_m7_p5);
+    prodsum_psi_a_epi16(psi_r_m7_p3, a_r_m7_p3, psi_i_m7_p3, a_i_m7_p3, psi_a_m7_p3);
+    prodsum_psi_a_epi16(psi_r_m7_p1, a_r_m7_p1, psi_i_m7_p1, a_i_m7_p1, psi_a_m7_p1);
+    prodsum_psi_a_epi16(psi_r_m7_m1, a_r_m7_m1, psi_i_m7_m1, a_i_m7_m1, psi_a_m7_m1);
+    prodsum_psi_a_epi16(psi_r_m7_m3, a_r_m7_m3, psi_i_m7_m3, a_i_m7_m3, psi_a_m7_m3);
+    prodsum_psi_a_epi16(psi_r_m7_m5, a_r_m7_m5, psi_i_m7_m5, a_i_m7_m5, psi_a_m7_m5);
+    prodsum_psi_a_epi16(psi_r_m7_m7, a_r_m7_m7, psi_i_m7_m7, a_i_m7_m7, psi_a_m7_m7);
+
+    // Calculation of a group of two terms in the bit metric involving squares of interference
+    square_a_epi16(a_r_p7_p7, a_i_p7_p7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p7_p7);
+    square_a_epi16(a_r_p7_p5, a_i_p7_p5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p7_p5);
+    square_a_epi16(a_r_p7_p3, a_i_p7_p3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p7_p3);
+    square_a_epi16(a_r_p7_p1, a_i_p7_p1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p7_p1);
+    square_a_epi16(a_r_p7_m1, a_i_p7_m1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p7_m1);
+    square_a_epi16(a_r_p7_m3, a_i_p7_m3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p7_m3);
+    square_a_epi16(a_r_p7_m5, a_i_p7_m5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p7_m5);
+    square_a_epi16(a_r_p7_m7, a_i_p7_m7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p7_m7);
+    square_a_epi16(a_r_p5_p7, a_i_p5_p7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p5_p7);
+    square_a_epi16(a_r_p5_p5, a_i_p5_p5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p5_p5);
+    square_a_epi16(a_r_p5_p3, a_i_p5_p3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p5_p3);
+    square_a_epi16(a_r_p5_p1, a_i_p5_p1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p5_p1);
+    square_a_epi16(a_r_p5_m1, a_i_p5_m1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p5_m1);
+    square_a_epi16(a_r_p5_m3, a_i_p5_m3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p5_m3);
+    square_a_epi16(a_r_p5_m5, a_i_p5_m5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p5_m5);
+    square_a_epi16(a_r_p5_m7, a_i_p5_m7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p5_m7);
+    square_a_epi16(a_r_p3_p7, a_i_p3_p7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p3_p7);
+    square_a_epi16(a_r_p3_p5, a_i_p3_p5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p3_p5);
+    square_a_epi16(a_r_p3_p3, a_i_p3_p3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p3_p3);
+    square_a_epi16(a_r_p3_p1, a_i_p3_p1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p3_p1);
+    square_a_epi16(a_r_p3_m1, a_i_p3_m1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p3_m1);
+    square_a_epi16(a_r_p3_m3, a_i_p3_m3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p3_m3);
+    square_a_epi16(a_r_p3_m5, a_i_p3_m5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p3_m5);
+    square_a_epi16(a_r_p3_m7, a_i_p3_m7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p3_m7);
+    square_a_epi16(a_r_p1_p7, a_i_p1_p7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p1_p7);
+    square_a_epi16(a_r_p1_p5, a_i_p1_p5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p1_p5);
+    square_a_epi16(a_r_p1_p3, a_i_p1_p3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p1_p3);
+    square_a_epi16(a_r_p1_p1, a_i_p1_p1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p1_p1);
+    square_a_epi16(a_r_p1_m1, a_i_p1_m1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p1_m1);
+    square_a_epi16(a_r_p1_m3, a_i_p1_m3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p1_m3);
+    square_a_epi16(a_r_p1_m5, a_i_p1_m5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p1_m5);
+    square_a_epi16(a_r_p1_m7, a_i_p1_m7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p1_m7);
+    square_a_epi16(a_r_m1_p7, a_i_m1_p7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m1_p7);
+    square_a_epi16(a_r_m1_p5, a_i_m1_p5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m1_p5);
+    square_a_epi16(a_r_m1_p3, a_i_m1_p3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m1_p3);
+    square_a_epi16(a_r_m1_p1, a_i_m1_p1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m1_p1);
+    square_a_epi16(a_r_m1_m1, a_i_m1_m1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m1_m1);
+    square_a_epi16(a_r_m1_m3, a_i_m1_m3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m1_m3);
+    square_a_epi16(a_r_m1_m5, a_i_m1_m5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m1_m5);
+    square_a_epi16(a_r_m1_m7, a_i_m1_m7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m1_m7);
+    square_a_epi16(a_r_m3_p7, a_i_m3_p7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m3_p7);
+    square_a_epi16(a_r_m3_p5, a_i_m3_p5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m3_p5);
+    square_a_epi16(a_r_m3_p3, a_i_m3_p3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m3_p3);
+    square_a_epi16(a_r_m3_p1, a_i_m3_p1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m3_p1);
+    square_a_epi16(a_r_m3_m1, a_i_m3_m1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m3_m1);
+    square_a_epi16(a_r_m3_m3, a_i_m3_m3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m3_m3);
+    square_a_epi16(a_r_m3_m5, a_i_m3_m5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m3_m5);
+    square_a_epi16(a_r_m3_m7, a_i_m3_m7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m3_m7);
+    square_a_epi16(a_r_m5_p7, a_i_m5_p7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m5_p7);
+    square_a_epi16(a_r_m5_p5, a_i_m5_p5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m5_p5);
+    square_a_epi16(a_r_m5_p3, a_i_m5_p3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m5_p3);
+    square_a_epi16(a_r_m5_p1, a_i_m5_p1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m5_p1);
+    square_a_epi16(a_r_m5_m1, a_i_m5_m1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m5_m1);
+    square_a_epi16(a_r_m5_m3, a_i_m5_m3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m5_m3);
+    square_a_epi16(a_r_m5_m5, a_i_m5_m5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m5_m5);
+    square_a_epi16(a_r_m5_m7, a_i_m5_m7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m5_m7);
+    square_a_epi16(a_r_m7_p7, a_i_m7_p7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m7_p7);
+    square_a_epi16(a_r_m7_p5, a_i_m7_p5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m7_p5);
+    square_a_epi16(a_r_m7_p3, a_i_m7_p3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m7_p3);
+    square_a_epi16(a_r_m7_p1, a_i_m7_p1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m7_p1);
+    square_a_epi16(a_r_m7_m1, a_i_m7_m1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m7_m1);
+    square_a_epi16(a_r_m7_m3, a_i_m7_m3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m7_m3);
+    square_a_epi16(a_r_m7_m5, a_i_m7_m5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m7_m5);
+    square_a_epi16(a_r_m7_m7, a_i_m7_m7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m7_m7);
+
+    // Computing different multiples of ||h0||^2
+    // x=1, y=1
+    ch_mag_2_over_42_with_sigma2 = _mm256_mulhi_epi16(ch_mag_des,ONE_OVER_FOUR_SQRT_42);
+    ch_mag_2_over_42_with_sigma2 = _mm256_slli_epi16(ch_mag_2_over_42_with_sigma2,1);
+    // x=1, y=3
+    ch_mag_10_over_42_with_sigma2 = _mm256_mulhi_epi16(ch_mag_des,FIVE_OVER_FOUR_SQRT_42);
+    ch_mag_10_over_42_with_sigma2 = _mm256_slli_epi16(ch_mag_10_over_42_with_sigma2,1);
+    // x=1, y=5
+    ch_mag_26_over_42_with_sigma2 = _mm256_mulhi_epi16(ch_mag_des,THIRTEEN_OVER_FOUR_SQRT_42);
+    ch_mag_26_over_42_with_sigma2 = _mm256_slli_epi16(ch_mag_26_over_42_with_sigma2,1);
+    // x=1, y=7
+    ch_mag_50_over_42_with_sigma2 = _mm256_mulhi_epi16(ch_mag_des,TWENTYFIVE_OVER_FOUR_SQRT_42);
+    ch_mag_50_over_42_with_sigma2 = _mm256_slli_epi16(ch_mag_50_over_42_with_sigma2,1);
+    // x=3, y=3
+    ch_mag_18_over_42_with_sigma2 = _mm256_mulhi_epi16(ch_mag_des,NINE_OVER_FOUR_SQRT_42);
+    ch_mag_18_over_42_with_sigma2 = _mm256_slli_epi16(ch_mag_18_over_42_with_sigma2,1);
+    // x=3, y=5
+    ch_mag_34_over_42_with_sigma2 = _mm256_mulhi_epi16(ch_mag_des,SEVENTEEN_OVER_FOUR_SQRT_42);
+    ch_mag_34_over_42_with_sigma2 = _mm256_slli_epi16(ch_mag_34_over_42_with_sigma2,1);
+    // x=3, y=7
+    ch_mag_58_over_42_with_sigma2 = _mm256_mulhi_epi16(ch_mag_des,TWENTYNINE_OVER_FOUR_SQRT_42);
+    ch_mag_58_over_42_with_sigma2 = _mm256_slli_epi16(ch_mag_58_over_42_with_sigma2,2);
+    // x=5, y=5
+    ch_mag_50_over_42_with_sigma2 = _mm256_mulhi_epi16(ch_mag_des,TWENTYFIVE_OVER_FOUR_SQRT_42);
+    ch_mag_50_over_42_with_sigma2 = _mm256_slli_epi16(ch_mag_50_over_42_with_sigma2,1);
+    // x=5, y=7
+    ch_mag_74_over_42_with_sigma2 = _mm256_mulhi_epi16(ch_mag_des,THIRTYSEVEN_OVER_FOUR_SQRT_42);
+    ch_mag_74_over_42_with_sigma2 = _mm256_slli_epi16(ch_mag_74_over_42_with_sigma2,2);
+    // x=7, y=7
+    ch_mag_98_over_42_with_sigma2 = _mm256_mulhi_epi16(ch_mag_des,FORTYNINE_OVER_FOUR_SQRT_42);
+    ch_mag_98_over_42_with_sigma2 = _mm256_slli_epi16(ch_mag_98_over_42_with_sigma2,2);
+
+    // Computing Metrics
+    xmm0 = _mm256_subs_epi16(psi_a_p7_p7, a_sq_p7_p7);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_p_7_7);
+    bit_met_p7_p7 = _mm256_subs_epi16(xmm1, ch_mag_98_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p7_p5, a_sq_p7_p5);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_p_7_5);
+    bit_met_p7_p5 = _mm256_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p7_p3, a_sq_p7_p3);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_p_7_3);
+    bit_met_p7_p3 = _mm256_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p7_p1, a_sq_p7_p1);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_p_7_1);
+    bit_met_p7_p1 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p7_m1, a_sq_p7_m1);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_m_7_1);
+    bit_met_p7_m1 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p7_m3, a_sq_p7_m3);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_m_7_3);
+    bit_met_p7_m3 = _mm256_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p7_m5, a_sq_p7_m5);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_m_7_5);
+    bit_met_p7_m5 = _mm256_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p7_m7, a_sq_p7_m7);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_m_7_7);
+    bit_met_p7_m7 = _mm256_subs_epi16(xmm1, ch_mag_98_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p5_p7, a_sq_p5_p7);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_p_5_7);
+    bit_met_p5_p7 = _mm256_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p5_p5, a_sq_p5_p5);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_p_5_5);
+    bit_met_p5_p5 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p5_p3, a_sq_p5_p3);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_p_5_3);
+    bit_met_p5_p3 = _mm256_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p5_p1, a_sq_p5_p1);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_p_5_1);
+    bit_met_p5_p1 = _mm256_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p5_m1, a_sq_p5_m1);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_m_5_1);
+    bit_met_p5_m1 = _mm256_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p5_m3, a_sq_p5_m3);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_m_5_3);
+    bit_met_p5_m3 = _mm256_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p5_m5, a_sq_p5_m5);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_m_5_5);
+    bit_met_p5_m5 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p5_m7, a_sq_p5_m7);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_m_5_7);
+    bit_met_p5_m7 = _mm256_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p3_p7, a_sq_p3_p7);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_p_3_7);
+    bit_met_p3_p7 = _mm256_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p3_p5, a_sq_p3_p5);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_p_3_5);
+    bit_met_p3_p5 = _mm256_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p3_p3, a_sq_p3_p3);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_p_3_3);
+    bit_met_p3_p3 = _mm256_subs_epi16(xmm1, ch_mag_18_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p3_p1, a_sq_p3_p1);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_p_3_1);
+    bit_met_p3_p1 = _mm256_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p3_m1, a_sq_p3_m1);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_m_3_1);
+    bit_met_p3_m1 = _mm256_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p3_m3, a_sq_p3_m3);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_m_3_3);
+    bit_met_p3_m3 = _mm256_subs_epi16(xmm1, ch_mag_18_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p3_m5, a_sq_p3_m5);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_m_3_5);
+    bit_met_p3_m5 = _mm256_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p3_m7, a_sq_p3_m7);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_m_3_7);
+    bit_met_p3_m7 = _mm256_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p1_p7, a_sq_p1_p7);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_p_1_7);
+    bit_met_p1_p7 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p1_p5, a_sq_p1_p5);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_p_1_5);
+    bit_met_p1_p5 = _mm256_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p1_p3, a_sq_p1_p3);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_p_1_3);
+    bit_met_p1_p3 = _mm256_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p1_p1, a_sq_p1_p1);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_p_1_1);
+    bit_met_p1_p1 = _mm256_subs_epi16(xmm1, ch_mag_2_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p1_m1, a_sq_p1_m1);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_m_1_1);
+    bit_met_p1_m1 = _mm256_subs_epi16(xmm1, ch_mag_2_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p1_m3, a_sq_p1_m3);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_m_1_3);
+    bit_met_p1_m3 = _mm256_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p1_m5, a_sq_p1_m5);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_m_1_5);
+    bit_met_p1_m5 = _mm256_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p1_m7, a_sq_p1_m7);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_m_1_7);
+    bit_met_p1_m7 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
+
+    xmm0 = _mm256_subs_epi16(psi_a_m1_p7, a_sq_m1_p7);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_m_1_7);
+    bit_met_m1_p7 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m1_p5, a_sq_m1_p5);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_m_1_5);
+    bit_met_m1_p5 = _mm256_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m1_p3, a_sq_m1_p3);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_m_1_3);
+    bit_met_m1_p3 = _mm256_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m1_p1, a_sq_m1_p1);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_m_1_1);
+    bit_met_m1_p1 = _mm256_subs_epi16(xmm1, ch_mag_2_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m1_m1, a_sq_m1_m1);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_p_1_1);
+    bit_met_m1_m1 = _mm256_subs_epi16(xmm1, ch_mag_2_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m1_m3, a_sq_m1_m3);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_p_1_3);
+    bit_met_m1_m3 = _mm256_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m1_m5, a_sq_m1_m5);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_p_1_5);
+    bit_met_m1_m5 = _mm256_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m1_m7, a_sq_m1_m7);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_p_1_7);
+    bit_met_m1_m7 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m3_p7, a_sq_m3_p7);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_m_3_7);
+    bit_met_m3_p7 = _mm256_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m3_p5, a_sq_m3_p5);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_m_3_5);
+    bit_met_m3_p5 = _mm256_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m3_p3, a_sq_m3_p3);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_m_3_3);
+    bit_met_m3_p3 = _mm256_subs_epi16(xmm1, ch_mag_18_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m3_p1, a_sq_m3_p1);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_m_3_1);
+    bit_met_m3_p1 = _mm256_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m3_m1, a_sq_m3_m1);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_p_3_1);
+    bit_met_m3_m1 = _mm256_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m3_m3, a_sq_m3_m3);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_p_3_3);
+    bit_met_m3_m3 = _mm256_subs_epi16(xmm1, ch_mag_18_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m3_m5, a_sq_m3_m5);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_p_3_5);
+    bit_met_m3_m5 = _mm256_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m3_m7, a_sq_m3_m7);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_p_3_7);
+    bit_met_m3_m7 = _mm256_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m5_p7, a_sq_m5_p7);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_m_5_7);
+    bit_met_m5_p7 = _mm256_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m5_p5, a_sq_m5_p5);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_m_5_5);
+    bit_met_m5_p5 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m5_p3, a_sq_m5_p3);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_m_5_3);
+    bit_met_m5_p3 = _mm256_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m5_p1, a_sq_m5_p1);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_m_5_1);
+    bit_met_m5_p1 = _mm256_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m5_m1, a_sq_m5_m1);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_p_5_1);
+    bit_met_m5_m1 = _mm256_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m5_m3, a_sq_m5_m3);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_p_5_3);
+    bit_met_m5_m3 = _mm256_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m5_m5, a_sq_m5_m5);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_p_5_5);
+    bit_met_m5_m5 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m5_m7, a_sq_m5_m7);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_p_5_7);
+    bit_met_m5_m7 = _mm256_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m7_p7, a_sq_m7_p7);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_m_7_7);
+    bit_met_m7_p7 = _mm256_subs_epi16(xmm1, ch_mag_98_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m7_p5, a_sq_m7_p5);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_m_7_5);
+    bit_met_m7_p5 = _mm256_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m7_p3, a_sq_m7_p3);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_m_7_3);
+    bit_met_m7_p3 = _mm256_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m7_p1, a_sq_m7_p1);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_m_7_1);
+    bit_met_m7_p1 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m7_m1, a_sq_m7_m1);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_p_7_1);
+    bit_met_m7_m1 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m7_m3, a_sq_m7_m3);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_p_7_3);
+    bit_met_m7_m3 = _mm256_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m7_m5, a_sq_m7_m5);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_p_7_5);
+    bit_met_m7_m5 = _mm256_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m7_m7, a_sq_m7_m7);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_p_7_7);
+    bit_met_m7_m7 = _mm256_subs_epi16(xmm1, ch_mag_98_over_42_with_sigma2);
+
+    // Detection for 1st bit (LTE mapping)
+    // bit = 1
+    xmm0 = _mm256_max_epi16(bit_met_m7_p7, bit_met_m7_p5);
+    xmm1 = _mm256_max_epi16(bit_met_m7_p3, bit_met_m7_p1);
+    xmm2 = _mm256_max_epi16(bit_met_m7_m1, bit_met_m7_m3);
+    xmm3 = _mm256_max_epi16(bit_met_m7_m5, bit_met_m7_m7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(xmm4, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_m5_p7, bit_met_m5_p5);
+    xmm1 = _mm256_max_epi16(bit_met_m5_p3, bit_met_m5_p1);
+    xmm2 = _mm256_max_epi16(bit_met_m5_m1, bit_met_m5_m3);
+    xmm3 = _mm256_max_epi16(bit_met_m5_m5, bit_met_m5_m7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_m3_p7, bit_met_m3_p5);
+    xmm1 = _mm256_max_epi16(bit_met_m3_p3, bit_met_m3_p1);
+    xmm2 = _mm256_max_epi16(bit_met_m3_m1, bit_met_m3_m3);
+    xmm3 = _mm256_max_epi16(bit_met_m3_m5, bit_met_m3_m7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_m1_p7, bit_met_m1_p5);
+    xmm1 = _mm256_max_epi16(bit_met_m1_p3, bit_met_m1_p1);
+    xmm2 = _mm256_max_epi16(bit_met_m1_m1, bit_met_m1_m3);
+    xmm3 = _mm256_max_epi16(bit_met_m1_m5, bit_met_m1_m7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
+
+    // bit = 0
+    xmm0 = _mm256_max_epi16(bit_met_p7_p7, bit_met_p7_p5);
+    xmm1 = _mm256_max_epi16(bit_met_p7_p3, bit_met_p7_p1);
+    xmm2 = _mm256_max_epi16(bit_met_p7_m1, bit_met_p7_m3);
+    xmm3 = _mm256_max_epi16(bit_met_p7_m5, bit_met_p7_m7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(xmm4, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p5_p7, bit_met_p5_p5);
+    xmm1 = _mm256_max_epi16(bit_met_p5_p3, bit_met_p5_p1);
+    xmm2 = _mm256_max_epi16(bit_met_p5_m1, bit_met_p5_m3);
+    xmm3 = _mm256_max_epi16(bit_met_p5_m5, bit_met_p5_m7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p3_p7, bit_met_p3_p5);
+    xmm1 = _mm256_max_epi16(bit_met_p3_p3, bit_met_p3_p1);
+    xmm2 = _mm256_max_epi16(bit_met_p3_m1, bit_met_p3_m3);
+    xmm3 = _mm256_max_epi16(bit_met_p3_m5, bit_met_p3_m7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p1_p7, bit_met_p1_p5);
+    xmm1 = _mm256_max_epi16(bit_met_p1_p3, bit_met_p1_p1);
+    xmm2 = _mm256_max_epi16(bit_met_p1_m1, bit_met_p1_m3);
+    xmm3 = _mm256_max_epi16(bit_met_p1_m5, bit_met_p1_m7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
+
+    y0r = _mm256_subs_epi16(logmax_num_re0, logmax_den_re0);
+
+    // Detection for 2nd bit (LTE mapping)
+    // bit = 1
+    xmm0 = _mm256_max_epi16(bit_met_p7_m1, bit_met_p5_m1);
+    xmm1 = _mm256_max_epi16(bit_met_p3_m1, bit_met_p1_m1);
+    xmm2 = _mm256_max_epi16(bit_met_m1_m1, bit_met_m3_m1);
+    xmm3 = _mm256_max_epi16(bit_met_m5_m1, bit_met_m7_m1);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(xmm4, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p7_m3, bit_met_p5_m3);
+    xmm1 = _mm256_max_epi16(bit_met_p3_m3, bit_met_p1_m3);
+    xmm2 = _mm256_max_epi16(bit_met_m1_m3, bit_met_m3_m3);
+    xmm3 = _mm256_max_epi16(bit_met_m5_m3, bit_met_m7_m3);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p7_m5, bit_met_p5_m5);
+    xmm1 = _mm256_max_epi16(bit_met_p3_m5, bit_met_p1_m5);
+    xmm2 = _mm256_max_epi16(bit_met_m1_m5, bit_met_m3_m5);
+    xmm3 = _mm256_max_epi16(bit_met_m5_m5, bit_met_m7_m5);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p7_m7, bit_met_p5_m7);
+    xmm1 = _mm256_max_epi16(bit_met_p3_m7, bit_met_p1_m7);
+    xmm2 = _mm256_max_epi16(bit_met_m1_m7, bit_met_m3_m7);
+    xmm3 = _mm256_max_epi16(bit_met_m5_m7, bit_met_m7_m7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
+
+    // bit = 0
+    xmm0 = _mm256_max_epi16(bit_met_p7_p1, bit_met_p5_p1);
+    xmm1 = _mm256_max_epi16(bit_met_p3_p1, bit_met_p1_p1);
+    xmm2 = _mm256_max_epi16(bit_met_m1_p1, bit_met_m3_p1);
+    xmm3 = _mm256_max_epi16(bit_met_m5_p1, bit_met_m7_p1);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(xmm4, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p7_p3, bit_met_p5_p3);
+    xmm1 = _mm256_max_epi16(bit_met_p3_p3, bit_met_p1_p3);
+    xmm2 = _mm256_max_epi16(bit_met_m1_p3, bit_met_m3_p3);
+    xmm3 = _mm256_max_epi16(bit_met_m5_p3, bit_met_m7_p3);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p7_p5, bit_met_p5_p5);
+    xmm1 = _mm256_max_epi16(bit_met_p3_p5, bit_met_p1_p5);
+    xmm2 = _mm256_max_epi16(bit_met_m1_p5, bit_met_m3_p5);
+    xmm3 = _mm256_max_epi16(bit_met_m5_p5, bit_met_m7_p5);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p7_p7, bit_met_p5_p7);
+    xmm1 = _mm256_max_epi16(bit_met_p3_p7, bit_met_p1_p7);
+    xmm2 = _mm256_max_epi16(bit_met_m1_p7, bit_met_m3_p7);
+    xmm3 = _mm256_max_epi16(bit_met_m5_p7, bit_met_m7_p7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
+
+    y1r = _mm256_subs_epi16(logmax_num_re0, logmax_den_re0);
+
+    // Detection for 3rd bit (LTE mapping)
+    xmm0 = _mm256_max_epi16(bit_met_m7_m7, bit_met_m7_m5);
+    xmm1 = _mm256_max_epi16(bit_met_m7_m3, bit_met_m7_m1);
+    xmm2 = _mm256_max_epi16(bit_met_m7_p1, bit_met_m7_p3);
+    xmm3 = _mm256_max_epi16(bit_met_m7_p5, bit_met_m7_p7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(xmm4, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_m5_m7, bit_met_m5_m5);
+    xmm1 = _mm256_max_epi16(bit_met_m5_m3, bit_met_m5_m1);
+    xmm2 = _mm256_max_epi16(bit_met_m5_p1, bit_met_m5_p3);
+    xmm3 = _mm256_max_epi16(bit_met_m5_p5, bit_met_m5_p7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p5_m7, bit_met_p5_m5);
+    xmm1 = _mm256_max_epi16(bit_met_p5_m3, bit_met_p5_m1);
+    xmm2 = _mm256_max_epi16(bit_met_p5_p1, bit_met_p5_p3);
+    xmm3 = _mm256_max_epi16(bit_met_p5_p5, bit_met_p5_p7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p7_m7, bit_met_p7_m5);
+    xmm1 = _mm256_max_epi16(bit_met_p7_m3, bit_met_p7_m1);
+    xmm2 = _mm256_max_epi16(bit_met_p7_p1, bit_met_p7_p3);
+    xmm3 = _mm256_max_epi16(bit_met_p7_p5, bit_met_p7_p7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
+
+    xmm0 = _mm256_max_epi16(bit_met_m3_m7, bit_met_m3_m5);
+    xmm1 = _mm256_max_epi16(bit_met_m3_m3, bit_met_m3_m1);
+    xmm2 = _mm256_max_epi16(bit_met_m3_p1, bit_met_m3_p3);
+    xmm3 = _mm256_max_epi16(bit_met_m3_p5, bit_met_m3_p7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(xmm4, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_m1_m7, bit_met_m1_m5);
+    xmm1 = _mm256_max_epi16(bit_met_m1_m3, bit_met_m1_m1);
+    xmm2 = _mm256_max_epi16(bit_met_m1_p1, bit_met_m1_p3);
+    xmm3 = _mm256_max_epi16(bit_met_m1_p5, bit_met_m1_p7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p1_m7, bit_met_p1_m5);
+    xmm1 = _mm256_max_epi16(bit_met_p1_m3, bit_met_p1_m1);
+    xmm2 = _mm256_max_epi16(bit_met_p1_p1, bit_met_p1_p3);
+    xmm3 = _mm256_max_epi16(bit_met_p1_p5, bit_met_p1_p7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p3_m7, bit_met_p3_m5);
+    xmm1 = _mm256_max_epi16(bit_met_p3_m3, bit_met_p3_m1);
+    xmm2 = _mm256_max_epi16(bit_met_p3_p1, bit_met_p3_p3);
+    xmm3 = _mm256_max_epi16(bit_met_p3_p5, bit_met_p3_p7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
+
+    y2r = _mm256_subs_epi16(logmax_num_re0, logmax_den_re0);
+
+    // Detection for 4th bit (LTE mapping)
+    xmm0 = _mm256_max_epi16(bit_met_p7_p7, bit_met_p5_p7);
+    xmm1 = _mm256_max_epi16(bit_met_p3_p7, bit_met_p1_p7);
+    xmm2 = _mm256_max_epi16(bit_met_m1_p7, bit_met_m3_p7);
+    xmm3 = _mm256_max_epi16(bit_met_m5_p7, bit_met_m7_p7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(xmm4, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p7_p5, bit_met_p5_p5);
+    xmm1 = _mm256_max_epi16(bit_met_p3_p5, bit_met_p1_p5);
+    xmm2 = _mm256_max_epi16(bit_met_m1_p5, bit_met_m3_p5);
+    xmm3 = _mm256_max_epi16(bit_met_m5_p5, bit_met_m5_p5);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p7_m5, bit_met_p5_m5);
+    xmm1 = _mm256_max_epi16(bit_met_p3_m5, bit_met_p1_m5);
+    xmm2 = _mm256_max_epi16(bit_met_m1_m5, bit_met_m3_m5);
+    xmm3 = _mm256_max_epi16(bit_met_m5_m5, bit_met_m7_m5);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p7_m7, bit_met_p5_m7);
+    xmm1 = _mm256_max_epi16(bit_met_p3_m7, bit_met_p1_m7);
+    xmm2 = _mm256_max_epi16(bit_met_m1_m7, bit_met_m3_m7);
+    xmm3 = _mm256_max_epi16(bit_met_m5_m7, bit_met_m7_m7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
+
+    xmm0 = _mm256_max_epi16(bit_met_p7_m1, bit_met_p5_m1);
+    xmm1 = _mm256_max_epi16(bit_met_p3_m1, bit_met_p1_m1);
+    xmm2 = _mm256_max_epi16(bit_met_m1_m1, bit_met_m3_m1);
+    xmm3 = _mm256_max_epi16(bit_met_m5_m1, bit_met_m7_m1);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(xmm4, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p7_m3, bit_met_p5_m3);
+    xmm1 = _mm256_max_epi16(bit_met_p3_m3, bit_met_p1_m3);
+    xmm2 = _mm256_max_epi16(bit_met_m1_m3, bit_met_m3_m3);
+    xmm3 = _mm256_max_epi16(bit_met_m5_m3, bit_met_m7_m3);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p7_p1, bit_met_p5_p1);
+    xmm1 = _mm256_max_epi16(bit_met_p3_p1, bit_met_p1_p1);
+    xmm2 = _mm256_max_epi16(bit_met_m1_p1, bit_met_m3_p1);
+    xmm3 = _mm256_max_epi16(bit_met_m5_p1, bit_met_m7_p1);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p7_p3, bit_met_p5_p3);
+    xmm1 = _mm256_max_epi16(bit_met_p3_p3, bit_met_p1_p3);
+    xmm2 = _mm256_max_epi16(bit_met_m1_p3, bit_met_m3_p3);
+    xmm3 = _mm256_max_epi16(bit_met_m5_p3, bit_met_m7_p3);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
+
+    y0i = _mm256_subs_epi16(logmax_num_re0, logmax_den_re0);
+
+
+    // Detection for 5th bit (LTE mapping)
+    xmm0 = _mm256_max_epi16(bit_met_m7_m7, bit_met_m7_m5);
+    xmm1 = _mm256_max_epi16(bit_met_m7_m3, bit_met_m7_m1);
+    xmm2 = _mm256_max_epi16(bit_met_m7_p1, bit_met_m7_p3);
+    xmm3 = _mm256_max_epi16(bit_met_m7_p5, bit_met_m7_p7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(xmm4, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_m1_m7, bit_met_m1_m5);
+    xmm1 = _mm256_max_epi16(bit_met_m1_m3, bit_met_m1_m1);
+    xmm2 = _mm256_max_epi16(bit_met_m1_p1, bit_met_m1_p3);
+    xmm3 = _mm256_max_epi16(bit_met_m1_p5, bit_met_m1_p7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p1_m7, bit_met_p1_m5);
+    xmm1 = _mm256_max_epi16(bit_met_p1_m3, bit_met_p1_m1);
+    xmm2 = _mm256_max_epi16(bit_met_p1_p1, bit_met_p1_p3);
+    xmm3 = _mm256_max_epi16(bit_met_p1_p5, bit_met_p1_p7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p7_m7, bit_met_p7_m5);
+    xmm1 = _mm256_max_epi16(bit_met_p7_m3, bit_met_p7_m1);
+    xmm2 = _mm256_max_epi16(bit_met_p7_p1, bit_met_p7_p3);
+    xmm3 = _mm256_max_epi16(bit_met_p7_p5, bit_met_p7_p7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
+
+    xmm0 = _mm256_max_epi16(bit_met_m5_m7, bit_met_m5_m5);
+    xmm1 = _mm256_max_epi16(bit_met_m5_m3, bit_met_m5_m1);
+    xmm2 = _mm256_max_epi16(bit_met_m5_p1, bit_met_m5_p3);
+    xmm3 = _mm256_max_epi16(bit_met_m5_p5, bit_met_m5_p7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(xmm4, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_m3_m7, bit_met_m3_m5);
+    xmm1 = _mm256_max_epi16(bit_met_m3_m3, bit_met_m3_m1);
+    xmm2 = _mm256_max_epi16(bit_met_m3_p1, bit_met_m3_p3);
+    xmm3 = _mm256_max_epi16(bit_met_m3_p5, bit_met_m3_p7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p3_m7, bit_met_p3_m5);
+    xmm1 = _mm256_max_epi16(bit_met_p3_m3, bit_met_p3_m1);
+    xmm2 = _mm256_max_epi16(bit_met_p3_p1, bit_met_p3_p3);
+    xmm3 = _mm256_max_epi16(bit_met_p3_p5, bit_met_p3_p7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p5_m7, bit_met_p5_m5);
+    xmm1 = _mm256_max_epi16(bit_met_p5_m3, bit_met_p5_m1);
+    xmm2 = _mm256_max_epi16(bit_met_p5_p1, bit_met_p5_p3);
+    xmm3 = _mm256_max_epi16(bit_met_p5_p5, bit_met_p5_p7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
+
+    y1i = _mm256_subs_epi16(logmax_num_re0, logmax_den_re0);
+
+    // Detection for 6th bit (LTE mapping)
+    xmm0 = _mm256_max_epi16(bit_met_p7_p7, bit_met_p5_p7);
+    xmm1 = _mm256_max_epi16(bit_met_p3_p7, bit_met_p1_p7);
+    xmm2 = _mm256_max_epi16(bit_met_m1_p7, bit_met_m3_p7);
+    xmm3 = _mm256_max_epi16(bit_met_m5_p7, bit_met_m7_p7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(xmm4, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p7_p1, bit_met_p5_p1);
+    xmm1 = _mm256_max_epi16(bit_met_p3_p1, bit_met_p1_p1);
+    xmm2 = _mm256_max_epi16(bit_met_m1_p1, bit_met_m3_p1);
+    xmm3 = _mm256_max_epi16(bit_met_m5_p1, bit_met_m5_p1);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p7_m1, bit_met_p5_m1);
+    xmm1 = _mm256_max_epi16(bit_met_p3_m1, bit_met_p1_m1);
+    xmm2 = _mm256_max_epi16(bit_met_m1_m1, bit_met_m3_m1);
+    xmm3 = _mm256_max_epi16(bit_met_m5_m1, bit_met_m7_m1);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p7_m7, bit_met_p5_m7);
+    xmm1 = _mm256_max_epi16(bit_met_p3_m7, bit_met_p1_m7);
+    xmm2 = _mm256_max_epi16(bit_met_m1_m7, bit_met_m3_m7);
+    xmm3 = _mm256_max_epi16(bit_met_m5_m7, bit_met_m7_m7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
+
+    xmm0 = _mm256_max_epi16(bit_met_p7_m5, bit_met_p5_m5);
+    xmm1 = _mm256_max_epi16(bit_met_p3_m5, bit_met_p1_m5);
+    xmm2 = _mm256_max_epi16(bit_met_m1_m5, bit_met_m3_m5);
+    xmm3 = _mm256_max_epi16(bit_met_m5_m5, bit_met_m7_m5);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(xmm4, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p7_m3, bit_met_p5_m3);
+    xmm1 = _mm256_max_epi16(bit_met_p3_m3, bit_met_p1_m3);
+    xmm2 = _mm256_max_epi16(bit_met_m1_m3, bit_met_m3_m3);
+    xmm3 = _mm256_max_epi16(bit_met_m5_m3, bit_met_m7_m3);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p7_p3, bit_met_p5_p3);
+    xmm1 = _mm256_max_epi16(bit_met_p3_p3, bit_met_p1_p3);
+    xmm2 = _mm256_max_epi16(bit_met_m1_p3, bit_met_m3_p3);
+    xmm3 = _mm256_max_epi16(bit_met_m5_p3, bit_met_m7_p3);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p7_p5, bit_met_p5_p5);
+    xmm1 = _mm256_max_epi16(bit_met_p3_p5, bit_met_p1_p5);
+    xmm2 = _mm256_max_epi16(bit_met_m1_p5, bit_met_m3_p5);
+    xmm3 = _mm256_max_epi16(bit_met_m5_p5, bit_met_m7_p5);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
+
+    y2i = _mm256_subs_epi16(logmax_num_re0, logmax_den_re0);
+
+    // map to output stream, difficult to do in SIMD since we have 6 16bit LLRs
+    // RE 1
+    j = 48*i;
+    stream0_out[j + 0] = ((short *)&y0r)[0];
+    stream0_out[j + 1] = ((short *)&y1r)[0];
+    stream0_out[j + 2] = ((short *)&y2r)[0];
+    stream0_out[j + 3] = ((short *)&y0i)[0];
+    stream0_out[j + 4] = ((short *)&y1i)[0];
+    stream0_out[j + 5] = ((short *)&y2i)[0];
+    // RE 2
+    stream0_out[j + 6] = ((short *)&y0r)[1];
+    stream0_out[j + 7] = ((short *)&y1r)[1];
+    stream0_out[j + 8] = ((short *)&y2r)[1];
+    stream0_out[j + 9] = ((short *)&y0i)[1];
+    stream0_out[j + 10] = ((short *)&y1i)[1];
+    stream0_out[j + 11] = ((short *)&y2i)[1];
+    // RE 3
+    stream0_out[j + 12] = ((short *)&y0r)[2];
+    stream0_out[j + 13] = ((short *)&y1r)[2];
+    stream0_out[j + 14] = ((short *)&y2r)[2];
+    stream0_out[j + 15] = ((short *)&y0i)[2];
+    stream0_out[j + 16] = ((short *)&y1i)[2];
+    stream0_out[j + 17] = ((short *)&y2i)[2];
+    // RE 4
+    stream0_out[j + 18] = ((short *)&y0r)[3];
+    stream0_out[j + 19] = ((short *)&y1r)[3];
+    stream0_out[j + 20] = ((short *)&y2r)[3];
+    stream0_out[j + 21] = ((short *)&y0i)[3];
+    stream0_out[j + 22] = ((short *)&y1i)[3];
+    stream0_out[j + 23] = ((short *)&y2i)[3];
+    // RE 5
+    stream0_out[j + 24] = ((short *)&y0r)[4];
+    stream0_out[j + 25] = ((short *)&y1r)[4];
+    stream0_out[j + 26] = ((short *)&y2r)[4];
+    stream0_out[j + 27] = ((short *)&y0i)[4];
+    stream0_out[j + 28] = ((short *)&y1i)[4];
+    stream0_out[j + 29] = ((short *)&y2i)[4];
+    // RE 6
+    stream0_out[j + 30] = ((short *)&y0r)[5];
+    stream0_out[j + 31] = ((short *)&y1r)[5];
+    stream0_out[j + 32] = ((short *)&y2r)[5];
+    stream0_out[j + 33] = ((short *)&y0i)[5];
+    stream0_out[j + 34] = ((short *)&y1i)[5];
+    stream0_out[j + 35] = ((short *)&y2i)[5];
+    // RE 7
+    stream0_out[j + 36] = ((short *)&y0r)[6];
+    stream0_out[j + 37] = ((short *)&y1r)[6];
+    stream0_out[j + 38] = ((short *)&y2r)[6];
+    stream0_out[j + 39] = ((short *)&y0i)[6];
+    stream0_out[j + 40] = ((short *)&y1i)[6];
+    stream0_out[j + 41] = ((short *)&y2i)[6];
+    // RE 8
+    stream0_out[j + 42] = ((short *)&y0r)[7];
+    stream0_out[j + 43] = ((short *)&y1r)[7];
+    stream0_out[j + 44] = ((short *)&y2r)[7];
+    stream0_out[j + 45] = ((short *)&y0i)[7];
+    stream0_out[j + 46] = ((short *)&y1i)[7];
+    stream0_out[j + 47] = ((short *)&y2i)[7];
+
+    // RE 9
+    stream0_out[j + 48] = ((short *)&y0r)[8];
+    stream0_out[j + 49] = ((short *)&y1r)[8];
+    stream0_out[j + 50] = ((short *)&y2r)[8];
+    stream0_out[j + 51] = ((short *)&y0i)[8];
+    stream0_out[j + 52] = ((short *)&y1i)[8];
+    stream0_out[j + 53] = ((short *)&y2i)[8];
+    // RE 10
+    stream0_out[j + 54] = ((short *)&y0r)[9];
+    stream0_out[j + 55] = ((short *)&y1r)[9];
+    stream0_out[j + 56] = ((short *)&y2r)[9];
+    stream0_out[j + 57] = ((short *)&y0i)[9];
+    stream0_out[j + 58] = ((short *)&y1i)[9];
+    stream0_out[j + 59] = ((short *)&y2i)[9];
+    // RE 11
+    stream0_out[j + 60] = ((short *)&y0r)[10];
+    stream0_out[j + 61] = ((short *)&y1r)[10];
+    stream0_out[j + 62] = ((short *)&y2r)[10];
+    stream0_out[j + 63] = ((short *)&y0i)[10];
+    stream0_out[j + 64] = ((short *)&y1i)[10];
+    stream0_out[j + 65] = ((short *)&y2i)[10];
+    // RE 12
+    stream0_out[j + 66] = ((short *)&y0r)[11];
+    stream0_out[j + 67] = ((short *)&y1r)[11];
+    stream0_out[j + 68] = ((short *)&y2r)[11];
+    stream0_out[j + 69] = ((short *)&y0i)[11];
+    stream0_out[j + 70] = ((short *)&y1i)[11];
+    stream0_out[j + 71] = ((short *)&y2i)[11];
+    // RE 13
+    stream0_out[j + 72] = ((short *)&y0r)[12];
+    stream0_out[j + 73] = ((short *)&y1r)[12];
+    stream0_out[j + 74] = ((short *)&y2r)[12];
+    stream0_out[j + 75] = ((short *)&y0i)[12];
+    stream0_out[j + 76] = ((short *)&y1i)[12];
+    stream0_out[j + 77] = ((short *)&y2i)[12];
+    // RE 14
+    stream0_out[j + 78] = ((short *)&y0r)[13];
+    stream0_out[j + 79] = ((short *)&y1r)[13];
+    stream0_out[j + 80] = ((short *)&y2r)[13];
+    stream0_out[j + 81] = ((short *)&y0i)[13];
+    stream0_out[j + 82] = ((short *)&y1i)[13];
+    stream0_out[j + 83] = ((short *)&y2i)[13];
+    // RE 15
+    stream0_out[j + 84] = ((short *)&y0r)[14];
+    stream0_out[j + 85] = ((short *)&y1r)[14];
+    stream0_out[j + 86] = ((short *)&y2r)[14];
+    stream0_out[j + 87] = ((short *)&y0i)[14];
+    stream0_out[j + 88] = ((short *)&y1i)[14];
+    stream0_out[j + 89] = ((short *)&y2i)[14];
+    // RE 16
+    stream0_out[j + 90] = ((short *)&y0r)[15];
+    stream0_out[j + 91] = ((short *)&y1r)[15];
+    stream0_out[j + 92] = ((short *)&y2r)[15];
+    stream0_out[j + 93] = ((short *)&y0i)[15];
+    stream0_out[j + 94] = ((short *)&y1i)[15];
+    stream0_out[j + 95] = ((short *)&y2i)[15];
+
+#elif defined(__arm__)
+
+#endif
+  }
+
+#if defined(__x86_64__) || defined(__i386__)
+  _mm_empty();
+  _m_empty();
+#endif
+
+}
+
+void qam64_qam64_avx2(int32_t *stream0_in,
+                      int32_t *stream1_in,
+                      int32_t *ch_mag,
+                      int32_t *ch_mag_i,
+                      int16_t *stream0_out,
+                      int32_t *rho01,
+                      int length
+    )
+{
+
+  /*
+    Author: S. Wagner
+    Date: 28-02-17
+
+    Input:
+    stream0_in:  matched-filter (MF) output for 1st stream, i.e., y0=h0'*y
+    stream1_in:  matched-filter (MF) output for 2nd stream, i.e., y1=h1'*y
+    ch_mag:      4*h0/sqrt(42), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc
+    ch_mag_i:    4*h1/sqrt(42), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc
+    rho01:       Channel cross correlation, i.e., h1'*h0
+    length:      number of 32-bit samples in each input array; the loop
+                 consumes two 256-bit vectors (16 samples) per iteration
+
+    Output:
+    stream0_out: output LLRs for 1st stream
+  */
+
+#if defined(__x86_64__) || defined(__i386__)
+
+  __m256i *rho01_256i      = (__m256i *)rho01;
+  __m256i *stream0_256i_in = (__m256i *)stream0_in;
+  __m256i *stream1_256i_in = (__m256i *)stream1_in;
+  __m256i *ch_mag_256i     = (__m256i *)ch_mag;
+  __m256i *ch_mag_256i_i   = (__m256i *)ch_mag_i;
+
+  __m256i ONE_OVER_SQRT_42              = _mm256_broadcastw_epi16(_mm_set1_epi16(10112)); // round(1/sqrt(42)*2^16)
+  __m256i THREE_OVER_SQRT_42            = _mm256_broadcastw_epi16(_mm_set1_epi16(30337)); // round(3/sqrt(42)*2^16)
+  __m256i FIVE_OVER_SQRT_42             = _mm256_broadcastw_epi16(_mm_set1_epi16(25281)); // round(5/sqrt(42)*2^15)
+  __m256i SEVEN_OVER_SQRT_42            = _mm256_broadcastw_epi16(_mm_set1_epi16(17697)); // round(7/sqrt(42)*2^14) Q2.14
+  __m256i ONE_OVER_SQRT_2               = _mm256_broadcastw_epi16(_mm_set1_epi16(23170)); // round(1/sqrt(2)*2^15)
+  __m256i ONE_OVER_SQRT_2_42            = _mm256_broadcastw_epi16(_mm_set1_epi16(3575));  // round(1/sqrt(2*42)*2^15)
+  __m256i THREE_OVER_SQRT_2_42          = _mm256_broadcastw_epi16(_mm_set1_epi16(10726)); // round(3/sqrt(2*42)*2^15)
+  __m256i FIVE_OVER_SQRT_2_42           = _mm256_broadcastw_epi16(_mm_set1_epi16(17876)); // round(5/sqrt(2*42)*2^15)
+  __m256i SEVEN_OVER_SQRT_2_42          = _mm256_broadcastw_epi16(_mm_set1_epi16(25027)); // round(7/sqrt(2*42)*2^15)
+  __m256i FORTYNINE_OVER_FOUR_SQRT_42   = _mm256_broadcastw_epi16(_mm_set1_epi16(30969)); // round(49/(4*sqrt(42))*2^14), Q2.14
+  __m256i THIRTYSEVEN_OVER_FOUR_SQRT_42 = _mm256_broadcastw_epi16(_mm_set1_epi16(23385)); // round(37/(4*sqrt(42))*2^14), Q2.14
+  __m256i TWENTYFIVE_OVER_FOUR_SQRT_42  = _mm256_broadcastw_epi16(_mm_set1_epi16(31601)); // round(25/(4*sqrt(42))*2^15)
+  __m256i TWENTYNINE_OVER_FOUR_SQRT_42  = _mm256_broadcastw_epi16(_mm_set1_epi16(18329)); // round(29/(4*sqrt(42))*2^14), Q2.14
+  __m256i SEVENTEEN_OVER_FOUR_SQRT_42   = _mm256_broadcastw_epi16(_mm_set1_epi16(21489)); // round(17/(4*sqrt(42))*2^15)
+  __m256i NINE_OVER_FOUR_SQRT_42        = _mm256_broadcastw_epi16(_mm_set1_epi16(11376)); // round(9/(4*sqrt(42))*2^15)
+  __m256i THIRTEEN_OVER_FOUR_SQRT_42    = _mm256_broadcastw_epi16(_mm_set1_epi16(16433)); // round(13/(4*sqrt(42))*2^15)
+  __m256i FIVE_OVER_FOUR_SQRT_42        = _mm256_broadcastw_epi16(_mm_set1_epi16(6320));  // round(5/(4*sqrt(42))*2^15)
+  __m256i ONE_OVER_FOUR_SQRT_42         = _mm256_broadcastw_epi16(_mm_set1_epi16(1264));  // round(1/(4*sqrt(42))*2^15)
+  __m256i SQRT_42_OVER_FOUR             = _mm256_broadcastw_epi16(_mm_set1_epi16(13272)); // round(sqrt(42)/4*2^13), Q3.12
+
+  __m256i ch_mag_des;
+  __m256i ch_mag_int;
+  __m256i ch_mag_98_over_42_with_sigma2;
+  __m256i ch_mag_74_over_42_with_sigma2;
+  __m256i ch_mag_58_over_42_with_sigma2;
+  __m256i ch_mag_50_over_42_with_sigma2;
+  __m256i ch_mag_34_over_42_with_sigma2;
+  __m256i ch_mag_18_over_42_with_sigma2;
+  __m256i ch_mag_26_over_42_with_sigma2;
+  __m256i ch_mag_10_over_42_with_sigma2;
+  __m256i ch_mag_2_over_42_with_sigma2;
+  __m256i y0r_one_over_sqrt_21;
+  __m256i y0r_three_over_sqrt_21;
+  __m256i y0r_five_over_sqrt_21;
+  __m256i y0r_seven_over_sqrt_21;
+  __m256i y0i_one_over_sqrt_21;
+  __m256i y0i_three_over_sqrt_21;
+  __m256i y0i_five_over_sqrt_21;
+  __m256i y0i_seven_over_sqrt_21;
+  __m256i ch_mag_int_with_sigma2;
+  __m256i two_ch_mag_int_with_sigma2;
+  __m256i three_ch_mag_int_with_sigma2;
+#elif defined(__arm__)
+
+#endif
+
+  int i,j;
+  uint32_t len256 = (length)>>3;
+
+  for (i=0; i<len256; i+=2) {
+
+#if defined(__x86_64__) || defined(__i386__)
+
+    // Get rho
+      /*
+    xmm0 = rho01_256i[i];
+    xmm1 = rho01_256i[i+1];
+    xmm0 = _mm256_shufflelo_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm0 = _mm256_shufflehi_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm0 = _mm256_shuffle_epi32(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
+
+    xmm1 = _mm256_shufflelo_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm1 = _mm256_shufflehi_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm1 = _mm256_shuffle_epi32(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
+
+    //xmm0 = [Re(0,1,2,3)   Im(0,1,2,3)   Re(4,5,6,7)     Im(4,5,6,7)]
+    //xmm1 = [Re(8,9,10,11) Im(8,9,10,11) Re(12,13,14,15) Im(12,13,14,15)]
+
+    xmm2 = _mm256_unpacklo_epi64(xmm0, xmm1);
+    //xmm2 = [Re(0,1,2,3) Re(8,9,10,11) Re(4,5,6,7) Re(12,13,14,15)]
+    xmm2 = _mm256_permute4x64_epi64(xmm2,0xd8); // Re(rho)
+
+    xmm3 = _mm256_unpackhi_epi64(xmm0, xmm1);
+    //xmm3 = [Im(0,1,2,3) Im(8,9,10,11) Im(4,5,6,7) Im(12,13,14,15)]
+    xmm3 = _mm256_permute4x64_epi64(xmm3,0xd8); // Im(rho)
+      */
+
+    seperate_real_imag_parts(&xmm2, &xmm3, rho01_256i[i], rho01_256i[i+1]);
+
+    rho_rpi = _mm256_adds_epi16(xmm2,xmm3); // rho = Re(rho) + Im(rho)
+    rho_rmi = _mm256_subs_epi16(xmm2,xmm3); // rho* = Re(rho) - Im(rho)
+
+    // Compute the different rhos
+    rho_rpi_1_1 = _mm256_mulhi_epi16(rho_rpi, ONE_OVER_SQRT_42);
+    rho_rmi_1_1 = _mm256_mulhi_epi16(rho_rmi, ONE_OVER_SQRT_42);
+    rho_rpi_3_3 = _mm256_mulhi_epi16(rho_rpi, THREE_OVER_SQRT_42);
+    rho_rmi_3_3 = _mm256_mulhi_epi16(rho_rmi, THREE_OVER_SQRT_42);
+    rho_rpi_5_5 = _mm256_mulhi_epi16(rho_rpi, FIVE_OVER_SQRT_42);
+    rho_rmi_5_5 = _mm256_mulhi_epi16(rho_rmi, FIVE_OVER_SQRT_42);
+    rho_rpi_7_7 = _mm256_mulhi_epi16(rho_rpi, SEVEN_OVER_SQRT_42);
+    rho_rmi_7_7 = _mm256_mulhi_epi16(rho_rmi, SEVEN_OVER_SQRT_42);
+
+    rho_rpi_5_5 = _mm256_slli_epi16(rho_rpi_5_5, 1);
+    rho_rmi_5_5 = _mm256_slli_epi16(rho_rmi_5_5, 1);
+    rho_rpi_7_7 = _mm256_slli_epi16(rho_rpi_7_7, 2);
+    rho_rmi_7_7 = _mm256_slli_epi16(rho_rmi_7_7, 2);
+
+    xmm4 = _mm256_mulhi_epi16(xmm2, ONE_OVER_SQRT_42);
+    xmm5 = _mm256_mulhi_epi16(xmm3, ONE_OVER_SQRT_42);
+    xmm6 = _mm256_mulhi_epi16(xmm3, THREE_OVER_SQRT_42);
+    xmm7 = _mm256_mulhi_epi16(xmm3, FIVE_OVER_SQRT_42);
+    xmm8 = _mm256_mulhi_epi16(xmm3, SEVEN_OVER_SQRT_42);
+    xmm7 = _mm256_slli_epi16(xmm7, 1);
+    xmm8 = _mm256_slli_epi16(xmm8, 2);
+
+    rho_rpi_1_3 = _mm256_adds_epi16(xmm4, xmm6);
+    rho_rmi_1_3 = _mm256_subs_epi16(xmm4, xmm6);
+    rho_rpi_1_5 = _mm256_adds_epi16(xmm4, xmm7);
+    rho_rmi_1_5 = _mm256_subs_epi16(xmm4, xmm7);
+    rho_rpi_1_7 = _mm256_adds_epi16(xmm4, xmm8);
+    rho_rmi_1_7 = _mm256_subs_epi16(xmm4, xmm8);
+
+    xmm4 = _mm256_mulhi_epi16(xmm2, THREE_OVER_SQRT_42);
+    rho_rpi_3_1 = _mm256_adds_epi16(xmm4, xmm5);
+    rho_rmi_3_1 = _mm256_subs_epi16(xmm4, xmm5);
+    rho_rpi_3_5 = _mm256_adds_epi16(xmm4, xmm7);
+    rho_rmi_3_5 = _mm256_subs_epi16(xmm4, xmm7);
+    rho_rpi_3_7 = _mm256_adds_epi16(xmm4, xmm8);
+    rho_rmi_3_7 = _mm256_subs_epi16(xmm4, xmm8);
+
+    xmm4 = _mm256_mulhi_epi16(xmm2, FIVE_OVER_SQRT_42);
+    xmm4 = _mm256_slli_epi16(xmm4, 1);
+    rho_rpi_5_1 = _mm256_adds_epi16(xmm4, xmm5);
+    rho_rmi_5_1 = _mm256_subs_epi16(xmm4, xmm5);
+    rho_rpi_5_3 = _mm256_adds_epi16(xmm4, xmm6);
+    rho_rmi_5_3 = _mm256_subs_epi16(xmm4, xmm6);
+    rho_rpi_5_7 = _mm256_adds_epi16(xmm4, xmm8);
+    rho_rmi_5_7 = _mm256_subs_epi16(xmm4, xmm8);
+
+    xmm4 = _mm256_mulhi_epi16(xmm2, SEVEN_OVER_SQRT_42);
+    xmm4 = _mm256_slli_epi16(xmm4, 2);
+    rho_rpi_7_1 = _mm256_adds_epi16(xmm4, xmm5);
+    rho_rmi_7_1 = _mm256_subs_epi16(xmm4, xmm5);
+    rho_rpi_7_3 = _mm256_adds_epi16(xmm4, xmm6);
+    rho_rmi_7_3 = _mm256_subs_epi16(xmm4, xmm6);
+    rho_rpi_7_5 = _mm256_adds_epi16(xmm4, xmm7);
+    rho_rmi_7_5 = _mm256_subs_epi16(xmm4, xmm7);
+
+    // Rearrange interfering MF output
+    /*
+    xmm0 = stream1_256i_in[i];
+    xmm1 = stream1_256i_in[i+1];
+    xmm0 = _mm256_shufflelo_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm0 = _mm256_shufflehi_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm0 = _mm256_shuffle_epi32(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
+
+    xmm1 = _mm256_shufflelo_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm1 = _mm256_shufflehi_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm1 = _mm256_shuffle_epi32(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
+
+    y1r = _mm256_unpacklo_epi64(xmm0, xmm1);
+    y1r = _mm256_permute4x64_epi64(y1r,0xd8); // Re(y1)
+
+    y1i = _mm256_unpackhi_epi64(xmm0, xmm1);
+    y1i = _mm256_permute4x64_epi64(y1i,0xd8); // Im(y1)
+    */
+
+    seperate_real_imag_parts(&y1r, &y1i, stream1_256i_in[i], stream1_256i_in[i+1]);
+
+    // Psi_r calculation from rho_rpi or rho_rmi
+    xmm0 = _mm256_broadcastw_epi16(_mm_set1_epi16(0));// ZERO for abs_pi16
+    xmm2 = _mm256_subs_epi16(rho_rpi_7_7, y1r);
+
+    psi_r_p7_p7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_7_5, y1r);
+    psi_r_p7_p5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_7_3, y1r);
+    psi_r_p7_p3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_7_1, y1r);
+    psi_r_p7_p1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_7_1, y1r);
+    psi_r_p7_m1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_7_3, y1r);
+    psi_r_p7_m3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_7_5, y1r);
+    psi_r_p7_m5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_7_7, y1r);
+    psi_r_p7_m7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_5_7, y1r);
+    psi_r_p5_p7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_5_5, y1r);
+    psi_r_p5_p5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_5_3, y1r);
+    psi_r_p5_p3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_5_1, y1r);
+    psi_r_p5_p1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_5_1, y1r);
+    psi_r_p5_m1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_5_3, y1r);
+    psi_r_p5_m3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_5_5, y1r);
+    psi_r_p5_m5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_5_7, y1r);
+    psi_r_p5_m7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_3_7, y1r);
+    psi_r_p3_p7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_3_5, y1r);
+    psi_r_p3_p5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_3_3, y1r);
+    psi_r_p3_p3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_3_1, y1r);
+    psi_r_p3_p1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_3_1, y1r);
+    psi_r_p3_m1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_3_3, y1r);
+    psi_r_p3_m3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_3_5, y1r);
+    psi_r_p3_m5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_3_7, y1r);
+    psi_r_p3_m7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_1_7, y1r);
+    psi_r_p1_p7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_1_5, y1r);
+    psi_r_p1_p5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_1_3, y1r);
+    psi_r_p1_p3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_1_1, y1r);
+    psi_r_p1_p1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_1_1, y1r);
+    psi_r_p1_m1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_1_3, y1r);
+    psi_r_p1_m3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_1_5, y1r);
+    psi_r_p1_m5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_1_7, y1r);
+    psi_r_p1_m7 = _mm256_abs_epi16(xmm2);
+
+    xmm2 = _mm256_adds_epi16(rho_rmi_1_7, y1r);
+    psi_r_m1_p7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_1_5, y1r);
+    psi_r_m1_p5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_1_3, y1r);
+    psi_r_m1_p3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_1_1, y1r);
+    psi_r_m1_p1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_1_1, y1r);
+    psi_r_m1_m1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_1_3, y1r);
+    psi_r_m1_m3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_1_5, y1r);
+    psi_r_m1_m5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_1_7, y1r);
+    psi_r_m1_m7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_3_7, y1r);
+    psi_r_m3_p7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_3_5, y1r);
+    psi_r_m3_p5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_3_3, y1r);
+    psi_r_m3_p3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_3_1, y1r);
+    psi_r_m3_p1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_3_1, y1r);
+    psi_r_m3_m1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_3_3, y1r);
+    psi_r_m3_m3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_3_5, y1r);
+    psi_r_m3_m5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_3_7, y1r);
+    psi_r_m3_m7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_5_7, y1r);
+    psi_r_m5_p7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_5_5, y1r);
+    psi_r_m5_p5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_5_3, y1r);
+    psi_r_m5_p3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_5_1, y1r);
+    psi_r_m5_p1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_5_1, y1r);
+    psi_r_m5_m1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_5_3, y1r);
+    psi_r_m5_m3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_5_5, y1r);
+    psi_r_m5_m5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_5_7, y1r);
+    psi_r_m5_m7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_7_7, y1r);
+    psi_r_m7_p7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_7_5, y1r);
+    psi_r_m7_p5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_7_3, y1r);
+    psi_r_m7_p3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_7_1, y1r);
+    psi_r_m7_p1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_7_1, y1r);
+    psi_r_m7_m1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_7_3, y1r);
+    psi_r_m7_m3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_7_5, y1r);
+    psi_r_m7_m5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_7_7, y1r);
+    psi_r_m7_m7 = _mm256_abs_epi16(xmm2);
+
+    // Psi_i calculation from rho_rpi or rho_rmi
+    xmm2 = _mm256_subs_epi16(rho_rmi_7_7, y1i);
+    psi_i_p7_p7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_5_7, y1i);
+    psi_i_p7_p5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_3_7, y1i);
+    psi_i_p7_p3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_1_7, y1i);
+    psi_i_p7_p1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_1_7, y1i);
+    psi_i_p7_m1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_3_7, y1i);
+    psi_i_p7_m3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_5_7, y1i);
+    psi_i_p7_m5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_7_7, y1i);
+    psi_i_p7_m7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_7_5, y1i);
+    psi_i_p5_p7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_5_5, y1i);
+    psi_i_p5_p5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_3_5, y1i);
+    psi_i_p5_p3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_1_5, y1i);
+    psi_i_p5_p1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_1_5, y1i);
+    psi_i_p5_m1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_3_5, y1i);
+    psi_i_p5_m3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_5_5, y1i);
+    psi_i_p5_m5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_7_5, y1i);
+    psi_i_p5_m7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_7_3, y1i);
+    psi_i_p3_p7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_5_3, y1i);
+    psi_i_p3_p5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_3_3, y1i);
+    psi_i_p3_p3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_1_3, y1i);
+    psi_i_p3_p1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_1_3, y1i);
+    psi_i_p3_m1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_3_3, y1i);
+    psi_i_p3_m3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_5_3, y1i);
+    psi_i_p3_m5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_7_3, y1i);
+    psi_i_p3_m7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_7_1, y1i);
+    psi_i_p1_p7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_5_1, y1i);
+    psi_i_p1_p5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_3_1, y1i);
+    psi_i_p1_p3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rmi_1_1, y1i);
+    psi_i_p1_p1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_1_1, y1i);
+    psi_i_p1_m1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_3_1, y1i);
+    psi_i_p1_m3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_5_1, y1i);
+    psi_i_p1_m5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rpi_7_1, y1i);
+    psi_i_p1_m7 = _mm256_abs_epi16(xmm2);
+
+    xmm2 = _mm256_subs_epi16(rho_rpi_7_1, y1i);
+    psi_i_m1_p7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_5_1, y1i);
+    psi_i_m1_p5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_3_1, y1i);
+    psi_i_m1_p3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_1_1, y1i);
+    psi_i_m1_p1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_1_1, y1i);
+    psi_i_m1_m1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_3_1, y1i);
+    psi_i_m1_m3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_5_1, y1i);
+    psi_i_m1_m5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_7_1, y1i);
+    psi_i_m1_m7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_7_3, y1i);
+    psi_i_m3_p7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_5_3, y1i);
+    psi_i_m3_p5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_3_3, y1i);
+    psi_i_m3_p3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_1_3, y1i);
+    psi_i_m3_p1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_1_3, y1i);
+    psi_i_m3_m1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_3_3, y1i);
+    psi_i_m3_m3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_5_3, y1i);
+    psi_i_m3_m5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_7_3, y1i);
+    psi_i_m3_m7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_7_5, y1i);
+    psi_i_m5_p7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_5_5, y1i);
+    psi_i_m5_p5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_3_5, y1i);
+    psi_i_m5_p3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_1_5, y1i);
+    psi_i_m5_p1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_1_5, y1i);
+    psi_i_m5_m1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_3_5, y1i);
+    psi_i_m5_m3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_5_5, y1i);
+    psi_i_m5_m5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_7_5, y1i);
+    psi_i_m5_m7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_7_7, y1i);
+    psi_i_m7_p7 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_5_7, y1i);
+    psi_i_m7_p5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_3_7, y1i);
+    psi_i_m7_p3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_subs_epi16(rho_rpi_1_7, y1i);
+    psi_i_m7_p1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_1_7, y1i);
+    psi_i_m7_m1 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_3_7, y1i);
+    psi_i_m7_m3 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_5_7, y1i);
+    psi_i_m7_m5 = _mm256_abs_epi16(xmm2);
+    xmm2 = _mm256_adds_epi16(rho_rmi_7_7, y1i);
+    psi_i_m7_m7 = _mm256_abs_epi16(xmm2);
+
+    /*
+    // Rearrange desired MF output
+    xmm0 = stream0_256i_in[i];
+    xmm1 = stream0_256i_in[i+1];
+    xmm0 = _mm_shufflelo_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm0 = _mm_shufflehi_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm0 = _mm_shuffle_epi32(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm1 = _mm_shufflelo_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm1 = _mm_shufflehi_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm1 = _mm_shuffle_epi32(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    //xmm0 = [Re(0,1) Re(2,3) Im(0,1) Im(2,3)]
+    //xmm1 = [Re(4,5) Re(6,7) Im(4,5) Im(6,7)]
+    y0r = _mm_unpacklo_epi64(xmm0,xmm1); // = [y0r(1),y0r(2),y0r(3),y0r(4)]
+    y0i = _mm_unpackhi_epi64(xmm0,xmm1);
+    */
+    seperate_real_imag_parts(&y0r, &y0i, stream0_256i_in[i], stream0_256i_in[i+1]);
+
+    // Rearrange desired channel magnitudes
+    // [|h|^2(1),|h|^2(1),|h|^2(2),|h|^2(2),...,|h|^2(7),|h|^2(7)]*(2/sqrt(10))
+    /*
+    xmm2 = ch_mag_256i[i];
+    xmm3 = ch_mag_256i[i+1];
+    xmm2 = _mm_shufflelo_epi16(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm2 = _mm_shufflehi_epi16(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm2 = _mm_shuffle_epi32(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm3 = _mm_shufflelo_epi16(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm3 = _mm_shufflehi_epi16(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm3 = _mm_shuffle_epi32(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    ch_mag_des = _mm_unpacklo_epi64(xmm2,xmm3);
+    */
+    // xmm2 is a dummy variable that contains the same values as ch_mag_des
+    seperate_real_imag_parts(&ch_mag_des, &xmm2, ch_mag_256i[i], ch_mag_256i[i+1]);
+
+
+    // Rearrange interfering channel magnitudes
+    /*
+    xmm2 = ch_mag_256i_i[i];
+    xmm3 = ch_mag_256i_i[i+1];
+    xmm2 = _mm_shufflelo_epi16(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm2 = _mm_shufflehi_epi16(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm2 = _mm_shuffle_epi32(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm3 = _mm_shufflelo_epi16(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm3 = _mm_shufflehi_epi16(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    xmm3 = _mm_shuffle_epi32(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
+    ch_mag_int  = _mm_unpacklo_epi64(xmm2,xmm3);
+    */
+    seperate_real_imag_parts(&ch_mag_int, &xmm2, ch_mag_256i_i[i], ch_mag_256i_i[i+1]);
+
+    y0r_one_over_sqrt_21   = _mm256_mulhi_epi16(y0r, ONE_OVER_SQRT_42);
+    y0r_three_over_sqrt_21 = _mm256_mulhi_epi16(y0r, THREE_OVER_SQRT_42);
+    y0r_five_over_sqrt_21  = _mm256_mulhi_epi16(y0r, FIVE_OVER_SQRT_42);
+    y0r_five_over_sqrt_21  = _mm256_slli_epi16(y0r_five_over_sqrt_21, 1);
+    y0r_seven_over_sqrt_21 = _mm256_mulhi_epi16(y0r, SEVEN_OVER_SQRT_42);
+    y0r_seven_over_sqrt_21 = _mm256_slli_epi16(y0r_seven_over_sqrt_21, 2); // Q2.14
+
+    y0i_one_over_sqrt_21   = _mm256_mulhi_epi16(y0i, ONE_OVER_SQRT_42);
+    y0i_three_over_sqrt_21 = _mm256_mulhi_epi16(y0i, THREE_OVER_SQRT_42);
+    y0i_five_over_sqrt_21  = _mm256_mulhi_epi16(y0i, FIVE_OVER_SQRT_42);
+    y0i_five_over_sqrt_21  = _mm256_slli_epi16(y0i_five_over_sqrt_21, 1);
+    y0i_seven_over_sqrt_21 = _mm256_mulhi_epi16(y0i, SEVEN_OVER_SQRT_42);
+    y0i_seven_over_sqrt_21 = _mm256_slli_epi16(y0i_seven_over_sqrt_21, 2); // Q2.14
+
+
+    y0_p_7_1 = _mm256_adds_epi16(y0r_seven_over_sqrt_21, y0i_one_over_sqrt_21);
+    y0_p_7_3 = _mm256_adds_epi16(y0r_seven_over_sqrt_21, y0i_three_over_sqrt_21);
+    y0_p_7_5 = _mm256_adds_epi16(y0r_seven_over_sqrt_21, y0i_five_over_sqrt_21);
+    y0_p_7_7 = _mm256_adds_epi16(y0r_seven_over_sqrt_21, y0i_seven_over_sqrt_21);
+    y0_p_5_1 = _mm256_adds_epi16(y0r_five_over_sqrt_21, y0i_one_over_sqrt_21);
+    y0_p_5_3 = _mm256_adds_epi16(y0r_five_over_sqrt_21, y0i_three_over_sqrt_21);
+    y0_p_5_5 = _mm256_adds_epi16(y0r_five_over_sqrt_21, y0i_five_over_sqrt_21);
+    y0_p_5_7 = _mm256_adds_epi16(y0r_five_over_sqrt_21, y0i_seven_over_sqrt_21);
+    y0_p_3_1 = _mm256_adds_epi16(y0r_three_over_sqrt_21, y0i_one_over_sqrt_21);
+    y0_p_3_3 = _mm256_adds_epi16(y0r_three_over_sqrt_21, y0i_three_over_sqrt_21);
+    y0_p_3_5 = _mm256_adds_epi16(y0r_three_over_sqrt_21, y0i_five_over_sqrt_21);
+    y0_p_3_7 = _mm256_adds_epi16(y0r_three_over_sqrt_21, y0i_seven_over_sqrt_21);
+    y0_p_1_1 = _mm256_adds_epi16(y0r_one_over_sqrt_21, y0i_one_over_sqrt_21);
+    y0_p_1_3 = _mm256_adds_epi16(y0r_one_over_sqrt_21, y0i_three_over_sqrt_21);
+    y0_p_1_5 = _mm256_adds_epi16(y0r_one_over_sqrt_21, y0i_five_over_sqrt_21);
+    y0_p_1_7 = _mm256_adds_epi16(y0r_one_over_sqrt_21, y0i_seven_over_sqrt_21);
+
+    y0_m_1_1 = _mm256_subs_epi16(y0r_one_over_sqrt_21, y0i_one_over_sqrt_21);
+    y0_m_1_3 = _mm256_subs_epi16(y0r_one_over_sqrt_21, y0i_three_over_sqrt_21);
+    y0_m_1_5 = _mm256_subs_epi16(y0r_one_over_sqrt_21, y0i_five_over_sqrt_21);
+    y0_m_1_7 = _mm256_subs_epi16(y0r_one_over_sqrt_21, y0i_seven_over_sqrt_21);
+    y0_m_3_1 = _mm256_subs_epi16(y0r_three_over_sqrt_21, y0i_one_over_sqrt_21);
+    y0_m_3_3 = _mm256_subs_epi16(y0r_three_over_sqrt_21, y0i_three_over_sqrt_21);
+    y0_m_3_5 = _mm256_subs_epi16(y0r_three_over_sqrt_21, y0i_five_over_sqrt_21);
+    y0_m_3_7 = _mm256_subs_epi16(y0r_three_over_sqrt_21, y0i_seven_over_sqrt_21);
+    y0_m_5_1 = _mm256_subs_epi16(y0r_five_over_sqrt_21, y0i_one_over_sqrt_21);
+    y0_m_5_3 = _mm256_subs_epi16(y0r_five_over_sqrt_21, y0i_three_over_sqrt_21);
+    y0_m_5_5 = _mm256_subs_epi16(y0r_five_over_sqrt_21, y0i_five_over_sqrt_21);
+    y0_m_5_7 = _mm256_subs_epi16(y0r_five_over_sqrt_21, y0i_seven_over_sqrt_21);
+    y0_m_7_1 = _mm256_subs_epi16(y0r_seven_over_sqrt_21, y0i_one_over_sqrt_21);
+    y0_m_7_3 = _mm256_subs_epi16(y0r_seven_over_sqrt_21, y0i_three_over_sqrt_21);
+    y0_m_7_5 = _mm256_subs_epi16(y0r_seven_over_sqrt_21, y0i_five_over_sqrt_21);
+    y0_m_7_7 = _mm256_subs_epi16(y0r_seven_over_sqrt_21, y0i_seven_over_sqrt_21);
+
+    // Detection of interference term
+    ch_mag_int_with_sigma2       = _mm256_srai_epi16(ch_mag_int, 1); // *2
+    two_ch_mag_int_with_sigma2   = ch_mag_int; // *4
+    three_ch_mag_int_with_sigma2 = _mm256_adds_epi16(ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2); // *6
+
+    interference_abs_64qam_epi16(psi_r_p7_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p7_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_p7_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p7_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_p7_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p7_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_p7_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p7_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_p7_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p7_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_p7_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p7_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_p7_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p7_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_p7_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p7_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_p5_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p5_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_p5_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p5_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_p5_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p5_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_p5_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p5_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_p5_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p5_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_p5_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p5_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_p5_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p5_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_p5_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p5_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_p3_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p3_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_p3_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p3_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_p3_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p3_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_p3_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p3_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_p3_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p3_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_p3_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p3_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_p3_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p3_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_p3_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p3_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_p1_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p1_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_p1_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p1_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_p1_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p1_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_p1_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p1_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_p1_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p1_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_p1_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p1_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_p1_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p1_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_p1_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p1_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_m1_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m1_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_m1_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m1_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_m1_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m1_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_m1_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m1_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_m1_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m1_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_m1_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m1_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_m1_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m1_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_m1_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m1_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_m3_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m3_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_m3_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m3_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_m3_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m3_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_m3_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m3_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_m3_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m3_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_m3_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m3_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_m3_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m3_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_m3_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m3_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_m5_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m5_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_m5_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m5_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_m5_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m5_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_m5_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m5_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_m5_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m5_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_m5_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m5_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_m5_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m5_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_m5_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m5_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_m7_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m7_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_m7_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m7_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_m7_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m7_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_m7_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m7_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_m7_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m7_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_m7_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m7_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_m7_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m7_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_r_m7_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m7_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+
+    interference_abs_64qam_epi16(psi_i_p7_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p7_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_p7_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p7_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_p7_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p7_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_p7_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p7_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_p7_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p7_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_p7_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p7_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_p7_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p7_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_p7_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p7_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_p5_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p5_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_p5_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p5_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_p5_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p5_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_p5_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p5_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_p5_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p5_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_p5_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p5_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_p5_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p5_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_p5_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p5_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_p3_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p3_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_p3_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p3_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_p3_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p3_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_p3_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p3_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_p3_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p3_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_p3_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p3_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_p3_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p3_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_p3_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p3_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_p1_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p1_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_p1_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p1_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_p1_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p1_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_p1_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p1_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_p1_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p1_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_p1_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p1_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_p1_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p1_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_p1_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p1_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_m1_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m1_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_m1_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m1_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_m1_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m1_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_m1_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m1_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_m1_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m1_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_m1_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m1_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_m1_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m1_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_m1_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m1_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_m3_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m3_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_m3_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m3_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_m3_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m3_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_m3_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m3_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_m3_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m3_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_m3_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m3_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_m3_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m3_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_m3_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m3_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_m5_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m5_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_m5_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m5_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_m5_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m5_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_m5_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m5_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_m5_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m5_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_m5_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m5_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_m5_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m5_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_m5_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m5_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_m7_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m7_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_m7_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m7_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_m7_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m7_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_m7_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m7_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_m7_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m7_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_m7_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m7_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_m7_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m7_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+    interference_abs_64qam_epi16(psi_i_m7_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m7_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
+                                 SEVEN_OVER_SQRT_2_42);
+
+    // Calculation of a group of two terms in the bit metric involving product of psi and interference
+    prodsum_psi_a_epi16(psi_r_p7_p7, a_r_p7_p7, psi_i_p7_p7, a_i_p7_p7, psi_a_p7_p7);
+    prodsum_psi_a_epi16(psi_r_p7_p5, a_r_p7_p5, psi_i_p7_p5, a_i_p7_p5, psi_a_p7_p5);
+    prodsum_psi_a_epi16(psi_r_p7_p3, a_r_p7_p3, psi_i_p7_p3, a_i_p7_p3, psi_a_p7_p3);
+    prodsum_psi_a_epi16(psi_r_p7_p1, a_r_p7_p1, psi_i_p7_p1, a_i_p7_p1, psi_a_p7_p1);
+    prodsum_psi_a_epi16(psi_r_p7_m1, a_r_p7_m1, psi_i_p7_m1, a_i_p7_m1, psi_a_p7_m1);
+    prodsum_psi_a_epi16(psi_r_p7_m3, a_r_p7_m3, psi_i_p7_m3, a_i_p7_m3, psi_a_p7_m3);
+    prodsum_psi_a_epi16(psi_r_p7_m5, a_r_p7_m5, psi_i_p7_m5, a_i_p7_m5, psi_a_p7_m5);
+    prodsum_psi_a_epi16(psi_r_p7_m7, a_r_p7_m7, psi_i_p7_m7, a_i_p7_m7, psi_a_p7_m7);
+    prodsum_psi_a_epi16(psi_r_p5_p7, a_r_p5_p7, psi_i_p5_p7, a_i_p5_p7, psi_a_p5_p7);
+    prodsum_psi_a_epi16(psi_r_p5_p5, a_r_p5_p5, psi_i_p5_p5, a_i_p5_p5, psi_a_p5_p5);
+    prodsum_psi_a_epi16(psi_r_p5_p3, a_r_p5_p3, psi_i_p5_p3, a_i_p5_p3, psi_a_p5_p3);
+    prodsum_psi_a_epi16(psi_r_p5_p1, a_r_p5_p1, psi_i_p5_p1, a_i_p5_p1, psi_a_p5_p1);
+    prodsum_psi_a_epi16(psi_r_p5_m1, a_r_p5_m1, psi_i_p5_m1, a_i_p5_m1, psi_a_p5_m1);
+    prodsum_psi_a_epi16(psi_r_p5_m3, a_r_p5_m3, psi_i_p5_m3, a_i_p5_m3, psi_a_p5_m3);
+    prodsum_psi_a_epi16(psi_r_p5_m5, a_r_p5_m5, psi_i_p5_m5, a_i_p5_m5, psi_a_p5_m5);
+    prodsum_psi_a_epi16(psi_r_p5_m7, a_r_p5_m7, psi_i_p5_m7, a_i_p5_m7, psi_a_p5_m7);
+    prodsum_psi_a_epi16(psi_r_p3_p7, a_r_p3_p7, psi_i_p3_p7, a_i_p3_p7, psi_a_p3_p7);
+    prodsum_psi_a_epi16(psi_r_p3_p5, a_r_p3_p5, psi_i_p3_p5, a_i_p3_p5, psi_a_p3_p5);
+    prodsum_psi_a_epi16(psi_r_p3_p3, a_r_p3_p3, psi_i_p3_p3, a_i_p3_p3, psi_a_p3_p3);
+    prodsum_psi_a_epi16(psi_r_p3_p1, a_r_p3_p1, psi_i_p3_p1, a_i_p3_p1, psi_a_p3_p1);
+    prodsum_psi_a_epi16(psi_r_p3_m1, a_r_p3_m1, psi_i_p3_m1, a_i_p3_m1, psi_a_p3_m1);
+    prodsum_psi_a_epi16(psi_r_p3_m3, a_r_p3_m3, psi_i_p3_m3, a_i_p3_m3, psi_a_p3_m3);
+    prodsum_psi_a_epi16(psi_r_p3_m5, a_r_p3_m5, psi_i_p3_m5, a_i_p3_m5, psi_a_p3_m5);
+    prodsum_psi_a_epi16(psi_r_p3_m7, a_r_p3_m7, psi_i_p3_m7, a_i_p3_m7, psi_a_p3_m7);
+    prodsum_psi_a_epi16(psi_r_p1_p7, a_r_p1_p7, psi_i_p1_p7, a_i_p1_p7, psi_a_p1_p7);
+    prodsum_psi_a_epi16(psi_r_p1_p5, a_r_p1_p5, psi_i_p1_p5, a_i_p1_p5, psi_a_p1_p5);
+    prodsum_psi_a_epi16(psi_r_p1_p3, a_r_p1_p3, psi_i_p1_p3, a_i_p1_p3, psi_a_p1_p3);
+    prodsum_psi_a_epi16(psi_r_p1_p1, a_r_p1_p1, psi_i_p1_p1, a_i_p1_p1, psi_a_p1_p1);
+    prodsum_psi_a_epi16(psi_r_p1_m1, a_r_p1_m1, psi_i_p1_m1, a_i_p1_m1, psi_a_p1_m1);
+    prodsum_psi_a_epi16(psi_r_p1_m3, a_r_p1_m3, psi_i_p1_m3, a_i_p1_m3, psi_a_p1_m3);
+    prodsum_psi_a_epi16(psi_r_p1_m5, a_r_p1_m5, psi_i_p1_m5, a_i_p1_m5, psi_a_p1_m5);
+    prodsum_psi_a_epi16(psi_r_p1_m7, a_r_p1_m7, psi_i_p1_m7, a_i_p1_m7, psi_a_p1_m7);
+    prodsum_psi_a_epi16(psi_r_m1_p7, a_r_m1_p7, psi_i_m1_p7, a_i_m1_p7, psi_a_m1_p7);
+    prodsum_psi_a_epi16(psi_r_m1_p5, a_r_m1_p5, psi_i_m1_p5, a_i_m1_p5, psi_a_m1_p5);
+    prodsum_psi_a_epi16(psi_r_m1_p3, a_r_m1_p3, psi_i_m1_p3, a_i_m1_p3, psi_a_m1_p3);
+    prodsum_psi_a_epi16(psi_r_m1_p1, a_r_m1_p1, psi_i_m1_p1, a_i_m1_p1, psi_a_m1_p1);
+    prodsum_psi_a_epi16(psi_r_m1_m1, a_r_m1_m1, psi_i_m1_m1, a_i_m1_m1, psi_a_m1_m1);
+    prodsum_psi_a_epi16(psi_r_m1_m3, a_r_m1_m3, psi_i_m1_m3, a_i_m1_m3, psi_a_m1_m3);
+    prodsum_psi_a_epi16(psi_r_m1_m5, a_r_m1_m5, psi_i_m1_m5, a_i_m1_m5, psi_a_m1_m5);
+    prodsum_psi_a_epi16(psi_r_m1_m7, a_r_m1_m7, psi_i_m1_m7, a_i_m1_m7, psi_a_m1_m7);
+    prodsum_psi_a_epi16(psi_r_m3_p7, a_r_m3_p7, psi_i_m3_p7, a_i_m3_p7, psi_a_m3_p7);
+    prodsum_psi_a_epi16(psi_r_m3_p5, a_r_m3_p5, psi_i_m3_p5, a_i_m3_p5, psi_a_m3_p5);
+    prodsum_psi_a_epi16(psi_r_m3_p3, a_r_m3_p3, psi_i_m3_p3, a_i_m3_p3, psi_a_m3_p3);
+    prodsum_psi_a_epi16(psi_r_m3_p1, a_r_m3_p1, psi_i_m3_p1, a_i_m3_p1, psi_a_m3_p1);
+    prodsum_psi_a_epi16(psi_r_m3_m1, a_r_m3_m1, psi_i_m3_m1, a_i_m3_m1, psi_a_m3_m1);
+    prodsum_psi_a_epi16(psi_r_m3_m3, a_r_m3_m3, psi_i_m3_m3, a_i_m3_m3, psi_a_m3_m3);
+    prodsum_psi_a_epi16(psi_r_m3_m5, a_r_m3_m5, psi_i_m3_m5, a_i_m3_m5, psi_a_m3_m5);
+    prodsum_psi_a_epi16(psi_r_m3_m7, a_r_m3_m7, psi_i_m3_m7, a_i_m3_m7, psi_a_m3_m7);
+    prodsum_psi_a_epi16(psi_r_m5_p7, a_r_m5_p7, psi_i_m5_p7, a_i_m5_p7, psi_a_m5_p7);
+    prodsum_psi_a_epi16(psi_r_m5_p5, a_r_m5_p5, psi_i_m5_p5, a_i_m5_p5, psi_a_m5_p5);
+    prodsum_psi_a_epi16(psi_r_m5_p3, a_r_m5_p3, psi_i_m5_p3, a_i_m5_p3, psi_a_m5_p3);
+    prodsum_psi_a_epi16(psi_r_m5_p1, a_r_m5_p1, psi_i_m5_p1, a_i_m5_p1, psi_a_m5_p1);
+    prodsum_psi_a_epi16(psi_r_m5_m1, a_r_m5_m1, psi_i_m5_m1, a_i_m5_m1, psi_a_m5_m1);
+    prodsum_psi_a_epi16(psi_r_m5_m3, a_r_m5_m3, psi_i_m5_m3, a_i_m5_m3, psi_a_m5_m3);
+    prodsum_psi_a_epi16(psi_r_m5_m5, a_r_m5_m5, psi_i_m5_m5, a_i_m5_m5, psi_a_m5_m5);
+    prodsum_psi_a_epi16(psi_r_m5_m7, a_r_m5_m7, psi_i_m5_m7, a_i_m5_m7, psi_a_m5_m7);
+    prodsum_psi_a_epi16(psi_r_m7_p7, a_r_m7_p7, psi_i_m7_p7, a_i_m7_p7, psi_a_m7_p7);
+    prodsum_psi_a_epi16(psi_r_m7_p5, a_r_m7_p5, psi_i_m7_p5, a_i_m7_p5, psi_a_m7_p5);
+    prodsum_psi_a_epi16(psi_r_m7_p3, a_r_m7_p3, psi_i_m7_p3, a_i_m7_p3, psi_a_m7_p3);
+    prodsum_psi_a_epi16(psi_r_m7_p1, a_r_m7_p1, psi_i_m7_p1, a_i_m7_p1, psi_a_m7_p1);
+    prodsum_psi_a_epi16(psi_r_m7_m1, a_r_m7_m1, psi_i_m7_m1, a_i_m7_m1, psi_a_m7_m1);
+    prodsum_psi_a_epi16(psi_r_m7_m3, a_r_m7_m3, psi_i_m7_m3, a_i_m7_m3, psi_a_m7_m3);
+    prodsum_psi_a_epi16(psi_r_m7_m5, a_r_m7_m5, psi_i_m7_m5, a_i_m7_m5, psi_a_m7_m5);
+    prodsum_psi_a_epi16(psi_r_m7_m7, a_r_m7_m7, psi_i_m7_m7, a_i_m7_m7, psi_a_m7_m7);
+
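+    // Multiply by sqrt(2): mulhi by ONE_OVER_SQRT_2 then shift left by 2 (gives x*sqrt(2) if ONE_OVER_SQRT_2 is Q15, i.e. mulhi yields x/(2*sqrt(2)) -- TODO confirm scaling)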
+    psi_a_p7_p7 = _mm256_mulhi_epi16(psi_a_p7_p7, ONE_OVER_SQRT_2);
+    psi_a_p7_p7 = _mm256_slli_epi16(psi_a_p7_p7, 2);
+    psi_a_p7_p5 = _mm256_mulhi_epi16(psi_a_p7_p5, ONE_OVER_SQRT_2);
+    psi_a_p7_p5 = _mm256_slli_epi16(psi_a_p7_p5, 2);
+    psi_a_p7_p3 = _mm256_mulhi_epi16(psi_a_p7_p3, ONE_OVER_SQRT_2);
+    psi_a_p7_p3 = _mm256_slli_epi16(psi_a_p7_p3, 2);
+    psi_a_p7_p1 = _mm256_mulhi_epi16(psi_a_p7_p1, ONE_OVER_SQRT_2);
+    psi_a_p7_p1 = _mm256_slli_epi16(psi_a_p7_p1, 2);
+    psi_a_p7_m1 = _mm256_mulhi_epi16(psi_a_p7_m1, ONE_OVER_SQRT_2);
+    psi_a_p7_m1 = _mm256_slli_epi16(psi_a_p7_m1, 2);
+    psi_a_p7_m3 = _mm256_mulhi_epi16(psi_a_p7_m3, ONE_OVER_SQRT_2);
+    psi_a_p7_m3 = _mm256_slli_epi16(psi_a_p7_m3, 2);
+    psi_a_p7_m5 = _mm256_mulhi_epi16(psi_a_p7_m5, ONE_OVER_SQRT_2);
+    psi_a_p7_m5 = _mm256_slli_epi16(psi_a_p7_m5, 2);
+    psi_a_p7_m7 = _mm256_mulhi_epi16(psi_a_p7_m7, ONE_OVER_SQRT_2);
+    psi_a_p7_m7 = _mm256_slli_epi16(psi_a_p7_m7, 2);
+    psi_a_p5_p7 = _mm256_mulhi_epi16(psi_a_p5_p7, ONE_OVER_SQRT_2);
+    psi_a_p5_p7 = _mm256_slli_epi16(psi_a_p5_p7, 2);
+    psi_a_p5_p5 = _mm256_mulhi_epi16(psi_a_p5_p5, ONE_OVER_SQRT_2);
+    psi_a_p5_p5 = _mm256_slli_epi16(psi_a_p5_p5, 2);
+    psi_a_p5_p3 = _mm256_mulhi_epi16(psi_a_p5_p3, ONE_OVER_SQRT_2);
+    psi_a_p5_p3 = _mm256_slli_epi16(psi_a_p5_p3, 2);
+    psi_a_p5_p1 = _mm256_mulhi_epi16(psi_a_p5_p1, ONE_OVER_SQRT_2);
+    psi_a_p5_p1 = _mm256_slli_epi16(psi_a_p5_p1, 2);
+    psi_a_p5_m1 = _mm256_mulhi_epi16(psi_a_p5_m1, ONE_OVER_SQRT_2);
+    psi_a_p5_m1 = _mm256_slli_epi16(psi_a_p5_m1, 2);
+    psi_a_p5_m3 = _mm256_mulhi_epi16(psi_a_p5_m3, ONE_OVER_SQRT_2);
+    psi_a_p5_m3 = _mm256_slli_epi16(psi_a_p5_m3, 2);
+    psi_a_p5_m5 = _mm256_mulhi_epi16(psi_a_p5_m5, ONE_OVER_SQRT_2);
+    psi_a_p5_m5 = _mm256_slli_epi16(psi_a_p5_m5, 2);
+    psi_a_p5_m7 = _mm256_mulhi_epi16(psi_a_p5_m7, ONE_OVER_SQRT_2);
+    psi_a_p5_m7 = _mm256_slli_epi16(psi_a_p5_m7, 2);
+    psi_a_p3_p7 = _mm256_mulhi_epi16(psi_a_p3_p7, ONE_OVER_SQRT_2);
+    psi_a_p3_p7 = _mm256_slli_epi16(psi_a_p3_p7, 2);
+    psi_a_p3_p5 = _mm256_mulhi_epi16(psi_a_p3_p5, ONE_OVER_SQRT_2);
+    psi_a_p3_p5 = _mm256_slli_epi16(psi_a_p3_p5, 2);
+    psi_a_p3_p3 = _mm256_mulhi_epi16(psi_a_p3_p3, ONE_OVER_SQRT_2);
+    psi_a_p3_p3 = _mm256_slli_epi16(psi_a_p3_p3, 2);
+    psi_a_p3_p1 = _mm256_mulhi_epi16(psi_a_p3_p1, ONE_OVER_SQRT_2);
+    psi_a_p3_p1 = _mm256_slli_epi16(psi_a_p3_p1, 2);
+    psi_a_p3_m1 = _mm256_mulhi_epi16(psi_a_p3_m1, ONE_OVER_SQRT_2);
+    psi_a_p3_m1 = _mm256_slli_epi16(psi_a_p3_m1, 2);
+    psi_a_p3_m3 = _mm256_mulhi_epi16(psi_a_p3_m3, ONE_OVER_SQRT_2);
+    psi_a_p3_m3 = _mm256_slli_epi16(psi_a_p3_m3, 2);
+    psi_a_p3_m5 = _mm256_mulhi_epi16(psi_a_p3_m5, ONE_OVER_SQRT_2);
+    psi_a_p3_m5 = _mm256_slli_epi16(psi_a_p3_m5, 2);
+    psi_a_p3_m7 = _mm256_mulhi_epi16(psi_a_p3_m7, ONE_OVER_SQRT_2);
+    psi_a_p3_m7 = _mm256_slli_epi16(psi_a_p3_m7, 2);
+    psi_a_p1_p7 = _mm256_mulhi_epi16(psi_a_p1_p7, ONE_OVER_SQRT_2);
+    psi_a_p1_p7 = _mm256_slli_epi16(psi_a_p1_p7, 2);
+    psi_a_p1_p5 = _mm256_mulhi_epi16(psi_a_p1_p5, ONE_OVER_SQRT_2);
+    psi_a_p1_p5 = _mm256_slli_epi16(psi_a_p1_p5, 2);
+    psi_a_p1_p3 = _mm256_mulhi_epi16(psi_a_p1_p3, ONE_OVER_SQRT_2);
+    psi_a_p1_p3 = _mm256_slli_epi16(psi_a_p1_p3, 2);
+    psi_a_p1_p1 = _mm256_mulhi_epi16(psi_a_p1_p1, ONE_OVER_SQRT_2);
+    psi_a_p1_p1 = _mm256_slli_epi16(psi_a_p1_p1, 2);
+    psi_a_p1_m1 = _mm256_mulhi_epi16(psi_a_p1_m1, ONE_OVER_SQRT_2);
+    psi_a_p1_m1 = _mm256_slli_epi16(psi_a_p1_m1, 2);
+    psi_a_p1_m3 = _mm256_mulhi_epi16(psi_a_p1_m3, ONE_OVER_SQRT_2);
+    psi_a_p1_m3 = _mm256_slli_epi16(psi_a_p1_m3, 2);
+    psi_a_p1_m5 = _mm256_mulhi_epi16(psi_a_p1_m5, ONE_OVER_SQRT_2);
+    psi_a_p1_m5 = _mm256_slli_epi16(psi_a_p1_m5, 2);
+    psi_a_p1_m7 = _mm256_mulhi_epi16(psi_a_p1_m7, ONE_OVER_SQRT_2);
+    psi_a_p1_m7 = _mm256_slli_epi16(psi_a_p1_m7, 2);
+    psi_a_m1_p7 = _mm256_mulhi_epi16(psi_a_m1_p7, ONE_OVER_SQRT_2);
+    psi_a_m1_p7 = _mm256_slli_epi16(psi_a_m1_p7, 2);
+    psi_a_m1_p5 = _mm256_mulhi_epi16(psi_a_m1_p5, ONE_OVER_SQRT_2);
+    psi_a_m1_p5 = _mm256_slli_epi16(psi_a_m1_p5, 2);
+    psi_a_m1_p3 = _mm256_mulhi_epi16(psi_a_m1_p3, ONE_OVER_SQRT_2);
+    psi_a_m1_p3 = _mm256_slli_epi16(psi_a_m1_p3, 2);
+    psi_a_m1_p1 = _mm256_mulhi_epi16(psi_a_m1_p1, ONE_OVER_SQRT_2);
+    psi_a_m1_p1 = _mm256_slli_epi16(psi_a_m1_p1, 2);
+    psi_a_m1_m1 = _mm256_mulhi_epi16(psi_a_m1_m1, ONE_OVER_SQRT_2);
+    psi_a_m1_m1 = _mm256_slli_epi16(psi_a_m1_m1, 2);
+    psi_a_m1_m3 = _mm256_mulhi_epi16(psi_a_m1_m3, ONE_OVER_SQRT_2);
+    psi_a_m1_m3 = _mm256_slli_epi16(psi_a_m1_m3, 2);
+    psi_a_m1_m5 = _mm256_mulhi_epi16(psi_a_m1_m5, ONE_OVER_SQRT_2);
+    psi_a_m1_m5 = _mm256_slli_epi16(psi_a_m1_m5, 2);
+    psi_a_m1_m7 = _mm256_mulhi_epi16(psi_a_m1_m7, ONE_OVER_SQRT_2);
+    psi_a_m1_m7 = _mm256_slli_epi16(psi_a_m1_m7, 2);
+    psi_a_m3_p7 = _mm256_mulhi_epi16(psi_a_m3_p7, ONE_OVER_SQRT_2);
+    psi_a_m3_p7 = _mm256_slli_epi16(psi_a_m3_p7, 2);
+    psi_a_m3_p5 = _mm256_mulhi_epi16(psi_a_m3_p5, ONE_OVER_SQRT_2);
+    psi_a_m3_p5 = _mm256_slli_epi16(psi_a_m3_p5, 2);
+    psi_a_m3_p3 = _mm256_mulhi_epi16(psi_a_m3_p3, ONE_OVER_SQRT_2);
+    psi_a_m3_p3 = _mm256_slli_epi16(psi_a_m3_p3, 2);
+    psi_a_m3_p1 = _mm256_mulhi_epi16(psi_a_m3_p1, ONE_OVER_SQRT_2);
+    psi_a_m3_p1 = _mm256_slli_epi16(psi_a_m3_p1, 2);
+    psi_a_m3_m1 = _mm256_mulhi_epi16(psi_a_m3_m1, ONE_OVER_SQRT_2);
+    psi_a_m3_m1 = _mm256_slli_epi16(psi_a_m3_m1, 2);
+    psi_a_m3_m3 = _mm256_mulhi_epi16(psi_a_m3_m3, ONE_OVER_SQRT_2);
+    psi_a_m3_m3 = _mm256_slli_epi16(psi_a_m3_m3, 2);
+    psi_a_m3_m5 = _mm256_mulhi_epi16(psi_a_m3_m5, ONE_OVER_SQRT_2);
+    psi_a_m3_m5 = _mm256_slli_epi16(psi_a_m3_m5, 2);
+    psi_a_m3_m7 = _mm256_mulhi_epi16(psi_a_m3_m7, ONE_OVER_SQRT_2);
+    psi_a_m3_m7 = _mm256_slli_epi16(psi_a_m3_m7, 2);
+    psi_a_m5_p7 = _mm256_mulhi_epi16(psi_a_m5_p7, ONE_OVER_SQRT_2);
+    psi_a_m5_p7 = _mm256_slli_epi16(psi_a_m5_p7, 2);
+    psi_a_m5_p5 = _mm256_mulhi_epi16(psi_a_m5_p5, ONE_OVER_SQRT_2);
+    psi_a_m5_p5 = _mm256_slli_epi16(psi_a_m5_p5, 2);
+    psi_a_m5_p3 = _mm256_mulhi_epi16(psi_a_m5_p3, ONE_OVER_SQRT_2);
+    psi_a_m5_p3 = _mm256_slli_epi16(psi_a_m5_p3, 2);
+    psi_a_m5_p1 = _mm256_mulhi_epi16(psi_a_m5_p1, ONE_OVER_SQRT_2);
+    psi_a_m5_p1 = _mm256_slli_epi16(psi_a_m5_p1, 2);
+    psi_a_m5_m1 = _mm256_mulhi_epi16(psi_a_m5_m1, ONE_OVER_SQRT_2);
+    psi_a_m5_m1 = _mm256_slli_epi16(psi_a_m5_m1, 2);
+    psi_a_m5_m3 = _mm256_mulhi_epi16(psi_a_m5_m3, ONE_OVER_SQRT_2);
+    psi_a_m5_m3 = _mm256_slli_epi16(psi_a_m5_m3, 2);
+    psi_a_m5_m5 = _mm256_mulhi_epi16(psi_a_m5_m5, ONE_OVER_SQRT_2);
+    psi_a_m5_m5 = _mm256_slli_epi16(psi_a_m5_m5, 2);
+    psi_a_m5_m7 = _mm256_mulhi_epi16(psi_a_m5_m7, ONE_OVER_SQRT_2);
+    psi_a_m5_m7 = _mm256_slli_epi16(psi_a_m5_m7, 2);
+    psi_a_m7_p7 = _mm256_mulhi_epi16(psi_a_m7_p7, ONE_OVER_SQRT_2);
+    psi_a_m7_p7 = _mm256_slli_epi16(psi_a_m7_p7, 2);
+    psi_a_m7_p5 = _mm256_mulhi_epi16(psi_a_m7_p5, ONE_OVER_SQRT_2);
+    psi_a_m7_p5 = _mm256_slli_epi16(psi_a_m7_p5, 2);
+    psi_a_m7_p3 = _mm256_mulhi_epi16(psi_a_m7_p3, ONE_OVER_SQRT_2);
+    psi_a_m7_p3 = _mm256_slli_epi16(psi_a_m7_p3, 2);
+    psi_a_m7_p1 = _mm256_mulhi_epi16(psi_a_m7_p1, ONE_OVER_SQRT_2);
+    psi_a_m7_p1 = _mm256_slli_epi16(psi_a_m7_p1, 2);
+    psi_a_m7_m1 = _mm256_mulhi_epi16(psi_a_m7_m1, ONE_OVER_SQRT_2);
+    psi_a_m7_m1 = _mm256_slli_epi16(psi_a_m7_m1, 2);
+    psi_a_m7_m3 = _mm256_mulhi_epi16(psi_a_m7_m3, ONE_OVER_SQRT_2);
+    psi_a_m7_m3 = _mm256_slli_epi16(psi_a_m7_m3, 2);
+    psi_a_m7_m5 = _mm256_mulhi_epi16(psi_a_m7_m5, ONE_OVER_SQRT_2);
+    psi_a_m7_m5 = _mm256_slli_epi16(psi_a_m7_m5, 2);
+    psi_a_m7_m7 = _mm256_mulhi_epi16(psi_a_m7_m7, ONE_OVER_SQRT_2);
+    psi_a_m7_m7 = _mm256_slli_epi16(psi_a_m7_m7, 2);
+
+    // Calculation of a group of two terms in the bit metric involving squares of interference
+    square_a_64qam_epi16(a_r_p7_p7, a_i_p7_p7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p7_p7);
+    square_a_64qam_epi16(a_r_p7_p5, a_i_p7_p5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p7_p5);
+    square_a_64qam_epi16(a_r_p7_p3, a_i_p7_p3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p7_p3);
+    square_a_64qam_epi16(a_r_p7_p1, a_i_p7_p1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p7_p1);
+    square_a_64qam_epi16(a_r_p7_m1, a_i_p7_m1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p7_m1);
+    square_a_64qam_epi16(a_r_p7_m3, a_i_p7_m3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p7_m3);
+    square_a_64qam_epi16(a_r_p7_m5, a_i_p7_m5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p7_m5);
+    square_a_64qam_epi16(a_r_p7_m7, a_i_p7_m7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p7_m7);
+    square_a_64qam_epi16(a_r_p5_p7, a_i_p5_p7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p5_p7);
+    square_a_64qam_epi16(a_r_p5_p5, a_i_p5_p5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p5_p5);
+    square_a_64qam_epi16(a_r_p5_p3, a_i_p5_p3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p5_p3);
+    square_a_64qam_epi16(a_r_p5_p1, a_i_p5_p1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p5_p1);
+    square_a_64qam_epi16(a_r_p5_m1, a_i_p5_m1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p5_m1);
+    square_a_64qam_epi16(a_r_p5_m3, a_i_p5_m3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p5_m3);
+    square_a_64qam_epi16(a_r_p5_m5, a_i_p5_m5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p5_m5);
+    square_a_64qam_epi16(a_r_p5_m7, a_i_p5_m7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p5_m7);
+    square_a_64qam_epi16(a_r_p3_p7, a_i_p3_p7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p3_p7);
+    square_a_64qam_epi16(a_r_p3_p5, a_i_p3_p5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p3_p5);
+    square_a_64qam_epi16(a_r_p3_p3, a_i_p3_p3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p3_p3);
+    square_a_64qam_epi16(a_r_p3_p1, a_i_p3_p1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p3_p1);
+    square_a_64qam_epi16(a_r_p3_m1, a_i_p3_m1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p3_m1);
+    square_a_64qam_epi16(a_r_p3_m3, a_i_p3_m3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p3_m3);
+    square_a_64qam_epi16(a_r_p3_m5, a_i_p3_m5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p3_m5);
+    square_a_64qam_epi16(a_r_p3_m7, a_i_p3_m7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p3_m7);
+    square_a_64qam_epi16(a_r_p1_p7, a_i_p1_p7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p1_p7);
+    square_a_64qam_epi16(a_r_p1_p5, a_i_p1_p5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p1_p5);
+    square_a_64qam_epi16(a_r_p1_p3, a_i_p1_p3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p1_p3);
+    square_a_64qam_epi16(a_r_p1_p1, a_i_p1_p1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p1_p1);
+    square_a_64qam_epi16(a_r_p1_m1, a_i_p1_m1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p1_m1);
+    square_a_64qam_epi16(a_r_p1_m3, a_i_p1_m3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p1_m3);
+    square_a_64qam_epi16(a_r_p1_m5, a_i_p1_m5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p1_m5);
+    square_a_64qam_epi16(a_r_p1_m7, a_i_p1_m7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p1_m7);
+    square_a_64qam_epi16(a_r_m1_p7, a_i_m1_p7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m1_p7);
+    square_a_64qam_epi16(a_r_m1_p5, a_i_m1_p5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m1_p5);
+    square_a_64qam_epi16(a_r_m1_p3, a_i_m1_p3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m1_p3);
+    square_a_64qam_epi16(a_r_m1_p1, a_i_m1_p1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m1_p1);
+    square_a_64qam_epi16(a_r_m1_m1, a_i_m1_m1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m1_m1);
+    square_a_64qam_epi16(a_r_m1_m3, a_i_m1_m3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m1_m3);
+    square_a_64qam_epi16(a_r_m1_m5, a_i_m1_m5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m1_m5);
+    square_a_64qam_epi16(a_r_m1_m7, a_i_m1_m7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m1_m7);
+    square_a_64qam_epi16(a_r_m3_p7, a_i_m3_p7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m3_p7);
+    square_a_64qam_epi16(a_r_m3_p5, a_i_m3_p5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m3_p5);
+    square_a_64qam_epi16(a_r_m3_p3, a_i_m3_p3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m3_p3);
+    square_a_64qam_epi16(a_r_m3_p1, a_i_m3_p1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m3_p1);
+    square_a_64qam_epi16(a_r_m3_m1, a_i_m3_m1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m3_m1);
+    square_a_64qam_epi16(a_r_m3_m3, a_i_m3_m3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m3_m3);
+    square_a_64qam_epi16(a_r_m3_m5, a_i_m3_m5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m3_m5);
+    square_a_64qam_epi16(a_r_m3_m7, a_i_m3_m7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m3_m7);
+    square_a_64qam_epi16(a_r_m5_p7, a_i_m5_p7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m5_p7);
+    square_a_64qam_epi16(a_r_m5_p5, a_i_m5_p5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m5_p5);
+    square_a_64qam_epi16(a_r_m5_p3, a_i_m5_p3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m5_p3);
+    square_a_64qam_epi16(a_r_m5_p1, a_i_m5_p1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m5_p1);
+    square_a_64qam_epi16(a_r_m5_m1, a_i_m5_m1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m5_m1);
+    square_a_64qam_epi16(a_r_m5_m3, a_i_m5_m3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m5_m3);
+    square_a_64qam_epi16(a_r_m5_m5, a_i_m5_m5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m5_m5);
+    square_a_64qam_epi16(a_r_m5_m7, a_i_m5_m7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m5_m7);
+    square_a_64qam_epi16(a_r_m7_p7, a_i_m7_p7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m7_p7);
+    square_a_64qam_epi16(a_r_m7_p5, a_i_m7_p5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m7_p5);
+    square_a_64qam_epi16(a_r_m7_p3, a_i_m7_p3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m7_p3);
+    square_a_64qam_epi16(a_r_m7_p1, a_i_m7_p1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m7_p1);
+    square_a_64qam_epi16(a_r_m7_m1, a_i_m7_m1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m7_m1);
+    square_a_64qam_epi16(a_r_m7_m3, a_i_m7_m3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m7_m3);
+    square_a_64qam_epi16(a_r_m7_m5, a_i_m7_m5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m7_m5);
+    square_a_64qam_epi16(a_r_m7_m7, a_i_m7_m7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m7_m7);
+
+    // Computing different multiples of ||h0||^2
+    // x=1, y=1
+    ch_mag_2_over_42_with_sigma2 = _mm256_mulhi_epi16(ch_mag_des,ONE_OVER_FOUR_SQRT_42);
+    ch_mag_2_over_42_with_sigma2 = _mm256_slli_epi16(ch_mag_2_over_42_with_sigma2,1);
+    // x=1, y=3
+    ch_mag_10_over_42_with_sigma2 = _mm256_mulhi_epi16(ch_mag_des,FIVE_OVER_FOUR_SQRT_42);
+    ch_mag_10_over_42_with_sigma2 = _mm256_slli_epi16(ch_mag_10_over_42_with_sigma2,1);
+    // x=1, y=5
+    ch_mag_26_over_42_with_sigma2 = _mm256_mulhi_epi16(ch_mag_des,THIRTEEN_OVER_FOUR_SQRT_42);
+    ch_mag_26_over_42_with_sigma2 = _mm256_slli_epi16(ch_mag_26_over_42_with_sigma2,1);
+    // x=1, y=7
+    ch_mag_50_over_42_with_sigma2 = _mm256_mulhi_epi16(ch_mag_des,TWENTYFIVE_OVER_FOUR_SQRT_42);
+    ch_mag_50_over_42_with_sigma2 = _mm256_slli_epi16(ch_mag_50_over_42_with_sigma2,1);
+    // x=3, y=3
+    ch_mag_18_over_42_with_sigma2 = _mm256_mulhi_epi16(ch_mag_des,NINE_OVER_FOUR_SQRT_42);
+    ch_mag_18_over_42_with_sigma2 = _mm256_slli_epi16(ch_mag_18_over_42_with_sigma2,1);
+    // x=3, y=5
+    ch_mag_34_over_42_with_sigma2 = _mm256_mulhi_epi16(ch_mag_des,SEVENTEEN_OVER_FOUR_SQRT_42);
+    ch_mag_34_over_42_with_sigma2 = _mm256_slli_epi16(ch_mag_34_over_42_with_sigma2,1);
+    // x=3, y=7
+    ch_mag_58_over_42_with_sigma2 = _mm256_mulhi_epi16(ch_mag_des,TWENTYNINE_OVER_FOUR_SQRT_42);
+    ch_mag_58_over_42_with_sigma2 = _mm256_slli_epi16(ch_mag_58_over_42_with_sigma2,2);
+    // x=5, y=5
+    ch_mag_50_over_42_with_sigma2 = _mm256_mulhi_epi16(ch_mag_des,TWENTYFIVE_OVER_FOUR_SQRT_42);
+    ch_mag_50_over_42_with_sigma2 = _mm256_slli_epi16(ch_mag_50_over_42_with_sigma2,1);
+    // x=5, y=7
+    ch_mag_74_over_42_with_sigma2 = _mm256_mulhi_epi16(ch_mag_des,THIRTYSEVEN_OVER_FOUR_SQRT_42);
+    ch_mag_74_over_42_with_sigma2 = _mm256_slli_epi16(ch_mag_74_over_42_with_sigma2,2);
+    // x=7, y=7
+    ch_mag_98_over_42_with_sigma2 = _mm256_mulhi_epi16(ch_mag_des,FORTYNINE_OVER_FOUR_SQRT_42);
+    ch_mag_98_over_42_with_sigma2 = _mm256_slli_epi16(ch_mag_98_over_42_with_sigma2,2);
+
+    // Computing Metrics
+    xmm0 = _mm256_subs_epi16(psi_a_p7_p7, a_sq_p7_p7);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_p_7_7);
+    bit_met_p7_p7 = _mm256_subs_epi16(xmm1, ch_mag_98_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p7_p5, a_sq_p7_p5);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_p_7_5);
+    bit_met_p7_p5 = _mm256_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p7_p3, a_sq_p7_p3);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_p_7_3);
+    bit_met_p7_p3 = _mm256_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p7_p1, a_sq_p7_p1);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_p_7_1);
+    bit_met_p7_p1 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p7_m1, a_sq_p7_m1);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_m_7_1);
+    bit_met_p7_m1 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p7_m3, a_sq_p7_m3);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_m_7_3);
+    bit_met_p7_m3 = _mm256_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p7_m5, a_sq_p7_m5);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_m_7_5);
+    bit_met_p7_m5 = _mm256_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p7_m7, a_sq_p7_m7);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_m_7_7);
+    bit_met_p7_m7 = _mm256_subs_epi16(xmm1, ch_mag_98_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p5_p7, a_sq_p5_p7);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_p_5_7);
+    bit_met_p5_p7 = _mm256_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p5_p5, a_sq_p5_p5);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_p_5_5);
+    bit_met_p5_p5 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p5_p3, a_sq_p5_p3);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_p_5_3);
+    bit_met_p5_p3 = _mm256_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p5_p1, a_sq_p5_p1);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_p_5_1);
+    bit_met_p5_p1 = _mm256_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p5_m1, a_sq_p5_m1);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_m_5_1);
+    bit_met_p5_m1 = _mm256_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p5_m3, a_sq_p5_m3);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_m_5_3);
+    bit_met_p5_m3 = _mm256_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p5_m5, a_sq_p5_m5);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_m_5_5);
+    bit_met_p5_m5 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p5_m7, a_sq_p5_m7);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_m_5_7);
+    bit_met_p5_m7 = _mm256_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p3_p7, a_sq_p3_p7);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_p_3_7);
+    bit_met_p3_p7 = _mm256_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p3_p5, a_sq_p3_p5);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_p_3_5);
+    bit_met_p3_p5 = _mm256_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p3_p3, a_sq_p3_p3);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_p_3_3);
+    bit_met_p3_p3 = _mm256_subs_epi16(xmm1, ch_mag_18_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p3_p1, a_sq_p3_p1);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_p_3_1);
+    bit_met_p3_p1 = _mm256_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p3_m1, a_sq_p3_m1);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_m_3_1);
+    bit_met_p3_m1 = _mm256_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p3_m3, a_sq_p3_m3);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_m_3_3);
+    bit_met_p3_m3 = _mm256_subs_epi16(xmm1, ch_mag_18_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p3_m5, a_sq_p3_m5);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_m_3_5);
+    bit_met_p3_m5 = _mm256_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p3_m7, a_sq_p3_m7);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_m_3_7);
+    bit_met_p3_m7 = _mm256_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p1_p7, a_sq_p1_p7);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_p_1_7);
+    bit_met_p1_p7 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p1_p5, a_sq_p1_p5);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_p_1_5);
+    bit_met_p1_p5 = _mm256_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p1_p3, a_sq_p1_p3);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_p_1_3);
+    bit_met_p1_p3 = _mm256_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p1_p1, a_sq_p1_p1);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_p_1_1);
+    bit_met_p1_p1 = _mm256_subs_epi16(xmm1, ch_mag_2_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p1_m1, a_sq_p1_m1);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_m_1_1);
+    bit_met_p1_m1 = _mm256_subs_epi16(xmm1, ch_mag_2_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p1_m3, a_sq_p1_m3);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_m_1_3);
+    bit_met_p1_m3 = _mm256_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p1_m5, a_sq_p1_m5);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_m_1_5);
+    bit_met_p1_m5 = _mm256_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_p1_m7, a_sq_p1_m7);
+    xmm1 = _mm256_adds_epi16(xmm0, y0_m_1_7);
+    bit_met_p1_m7 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
+
+    xmm0 = _mm256_subs_epi16(psi_a_m1_p7, a_sq_m1_p7);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_m_1_7);
+    bit_met_m1_p7 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m1_p5, a_sq_m1_p5);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_m_1_5);
+    bit_met_m1_p5 = _mm256_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m1_p3, a_sq_m1_p3);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_m_1_3);
+    bit_met_m1_p3 = _mm256_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m1_p1, a_sq_m1_p1);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_m_1_1);
+    bit_met_m1_p1 = _mm256_subs_epi16(xmm1, ch_mag_2_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m1_m1, a_sq_m1_m1);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_p_1_1);
+    bit_met_m1_m1 = _mm256_subs_epi16(xmm1, ch_mag_2_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m1_m3, a_sq_m1_m3);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_p_1_3);
+    bit_met_m1_m3 = _mm256_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m1_m5, a_sq_m1_m5);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_p_1_5);
+    bit_met_m1_m5 = _mm256_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m1_m7, a_sq_m1_m7);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_p_1_7);
+    bit_met_m1_m7 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m3_p7, a_sq_m3_p7);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_m_3_7);
+    bit_met_m3_p7 = _mm256_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m3_p5, a_sq_m3_p5);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_m_3_5);
+    bit_met_m3_p5 = _mm256_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m3_p3, a_sq_m3_p3);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_m_3_3);
+    bit_met_m3_p3 = _mm256_subs_epi16(xmm1, ch_mag_18_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m3_p1, a_sq_m3_p1);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_m_3_1);
+    bit_met_m3_p1 = _mm256_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m3_m1, a_sq_m3_m1);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_p_3_1);
+    bit_met_m3_m1 = _mm256_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m3_m3, a_sq_m3_m3);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_p_3_3);
+    bit_met_m3_m3 = _mm256_subs_epi16(xmm1, ch_mag_18_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m3_m5, a_sq_m3_m5);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_p_3_5);
+    bit_met_m3_m5 = _mm256_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m3_m7, a_sq_m3_m7);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_p_3_7);
+    bit_met_m3_m7 = _mm256_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m5_p7, a_sq_m5_p7);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_m_5_7);
+    bit_met_m5_p7 = _mm256_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m5_p5, a_sq_m5_p5);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_m_5_5);
+    bit_met_m5_p5 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m5_p3, a_sq_m5_p3);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_m_5_3);
+    bit_met_m5_p3 = _mm256_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m5_p1, a_sq_m5_p1);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_m_5_1);
+    bit_met_m5_p1 = _mm256_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m5_m1, a_sq_m5_m1);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_p_5_1);
+    bit_met_m5_m1 = _mm256_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m5_m3, a_sq_m5_m3);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_p_5_3);
+    bit_met_m5_m3 = _mm256_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m5_m5, a_sq_m5_m5);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_p_5_5);
+    bit_met_m5_m5 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m5_m7, a_sq_m5_m7);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_p_5_7);
+    bit_met_m5_m7 = _mm256_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m7_p7, a_sq_m7_p7);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_m_7_7);
+    bit_met_m7_p7 = _mm256_subs_epi16(xmm1, ch_mag_98_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m7_p5, a_sq_m7_p5);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_m_7_5);
+    bit_met_m7_p5 = _mm256_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m7_p3, a_sq_m7_p3);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_m_7_3);
+    bit_met_m7_p3 = _mm256_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m7_p1, a_sq_m7_p1);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_m_7_1);
+    bit_met_m7_p1 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m7_m1, a_sq_m7_m1);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_p_7_1);
+    bit_met_m7_m1 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m7_m3, a_sq_m7_m3);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_p_7_3);
+    bit_met_m7_m3 = _mm256_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m7_m5, a_sq_m7_m5);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_p_7_5);
+    bit_met_m7_m5 = _mm256_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
+    xmm0 = _mm256_subs_epi16(psi_a_m7_m7, a_sq_m7_m7);
+    xmm1 = _mm256_subs_epi16(xmm0, y0_p_7_7);
+    bit_met_m7_m7 = _mm256_subs_epi16(xmm1, ch_mag_98_over_42_with_sigma2);
+
+    // Detection for 1st bit (LTE mapping)
+    // bit = 1
+    xmm0 = _mm256_max_epi16(bit_met_m7_p7, bit_met_m7_p5);
+    xmm1 = _mm256_max_epi16(bit_met_m7_p3, bit_met_m7_p1);
+    xmm2 = _mm256_max_epi16(bit_met_m7_m1, bit_met_m7_m3);
+    xmm3 = _mm256_max_epi16(bit_met_m7_m5, bit_met_m7_m7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(xmm4, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_m5_p7, bit_met_m5_p5);
+    xmm1 = _mm256_max_epi16(bit_met_m5_p3, bit_met_m5_p1);
+    xmm2 = _mm256_max_epi16(bit_met_m5_m1, bit_met_m5_m3);
+    xmm3 = _mm256_max_epi16(bit_met_m5_m5, bit_met_m5_m7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_m3_p7, bit_met_m3_p5);
+    xmm1 = _mm256_max_epi16(bit_met_m3_p3, bit_met_m3_p1);
+    xmm2 = _mm256_max_epi16(bit_met_m3_m1, bit_met_m3_m3);
+    xmm3 = _mm256_max_epi16(bit_met_m3_m5, bit_met_m3_m7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_m1_p7, bit_met_m1_p5);
+    xmm1 = _mm256_max_epi16(bit_met_m1_p3, bit_met_m1_p1);
+    xmm2 = _mm256_max_epi16(bit_met_m1_m1, bit_met_m1_m3);
+    xmm3 = _mm256_max_epi16(bit_met_m1_m5, bit_met_m1_m7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
+
+    // bit = 0
+    xmm0 = _mm256_max_epi16(bit_met_p7_p7, bit_met_p7_p5);
+    xmm1 = _mm256_max_epi16(bit_met_p7_p3, bit_met_p7_p1);
+    xmm2 = _mm256_max_epi16(bit_met_p7_m1, bit_met_p7_m3);
+    xmm3 = _mm256_max_epi16(bit_met_p7_m5, bit_met_p7_m7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(xmm4, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p5_p7, bit_met_p5_p5);
+    xmm1 = _mm256_max_epi16(bit_met_p5_p3, bit_met_p5_p1);
+    xmm2 = _mm256_max_epi16(bit_met_p5_m1, bit_met_p5_m3);
+    xmm3 = _mm256_max_epi16(bit_met_p5_m5, bit_met_p5_m7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p3_p7, bit_met_p3_p5);
+    xmm1 = _mm256_max_epi16(bit_met_p3_p3, bit_met_p3_p1);
+    xmm2 = _mm256_max_epi16(bit_met_p3_m1, bit_met_p3_m3);
+    xmm3 = _mm256_max_epi16(bit_met_p3_m5, bit_met_p3_m7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p1_p7, bit_met_p1_p5);
+    xmm1 = _mm256_max_epi16(bit_met_p1_p3, bit_met_p1_p1);
+    xmm2 = _mm256_max_epi16(bit_met_p1_m1, bit_met_p1_m3);
+    xmm3 = _mm256_max_epi16(bit_met_p1_m5, bit_met_p1_m7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
+
+    y0r = _mm256_subs_epi16(logmax_num_re0, logmax_den_re0);
+
+    // Detection for 2nd bit (LTE mapping)
+    // bit = 1
+    xmm0 = _mm256_max_epi16(bit_met_p7_m1, bit_met_p5_m1);
+    xmm1 = _mm256_max_epi16(bit_met_p3_m1, bit_met_p1_m1);
+    xmm2 = _mm256_max_epi16(bit_met_m1_m1, bit_met_m3_m1);
+    xmm3 = _mm256_max_epi16(bit_met_m5_m1, bit_met_m7_m1);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(xmm4, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p7_m3, bit_met_p5_m3);
+    xmm1 = _mm256_max_epi16(bit_met_p3_m3, bit_met_p1_m3);
+    xmm2 = _mm256_max_epi16(bit_met_m1_m3, bit_met_m3_m3);
+    xmm3 = _mm256_max_epi16(bit_met_m5_m3, bit_met_m7_m3);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p7_m5, bit_met_p5_m5);
+    xmm1 = _mm256_max_epi16(bit_met_p3_m5, bit_met_p1_m5);
+    xmm2 = _mm256_max_epi16(bit_met_m1_m5, bit_met_m3_m5);
+    xmm3 = _mm256_max_epi16(bit_met_m5_m5, bit_met_m7_m5);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p7_m7, bit_met_p5_m7);
+    xmm1 = _mm256_max_epi16(bit_met_p3_m7, bit_met_p1_m7);
+    xmm2 = _mm256_max_epi16(bit_met_m1_m7, bit_met_m3_m7);
+    xmm3 = _mm256_max_epi16(bit_met_m5_m7, bit_met_m7_m7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
+
+    // bit = 0
+    xmm0 = _mm256_max_epi16(bit_met_p7_p1, bit_met_p5_p1);
+    xmm1 = _mm256_max_epi16(bit_met_p3_p1, bit_met_p1_p1);
+    xmm2 = _mm256_max_epi16(bit_met_m1_p1, bit_met_m3_p1);
+    xmm3 = _mm256_max_epi16(bit_met_m5_p1, bit_met_m7_p1);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(xmm4, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p7_p3, bit_met_p5_p3);
+    xmm1 = _mm256_max_epi16(bit_met_p3_p3, bit_met_p1_p3);
+    xmm2 = _mm256_max_epi16(bit_met_m1_p3, bit_met_m3_p3);
+    xmm3 = _mm256_max_epi16(bit_met_m5_p3, bit_met_m7_p3);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p7_p5, bit_met_p5_p5);
+    xmm1 = _mm256_max_epi16(bit_met_p3_p5, bit_met_p1_p5);
+    xmm2 = _mm256_max_epi16(bit_met_m1_p5, bit_met_m3_p5);
+    xmm3 = _mm256_max_epi16(bit_met_m5_p5, bit_met_m7_p5);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p7_p7, bit_met_p5_p7);
+    xmm1 = _mm256_max_epi16(bit_met_p3_p7, bit_met_p1_p7);
+    xmm2 = _mm256_max_epi16(bit_met_m1_p7, bit_met_m3_p7);
+    xmm3 = _mm256_max_epi16(bit_met_m5_p7, bit_met_m7_p7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
+
+    y1r = _mm256_subs_epi16(logmax_num_re0, logmax_den_re0);
+
+    // Detection for 3rd bit (LTE mapping)
+    xmm0 = _mm256_max_epi16(bit_met_m7_m7, bit_met_m7_m5);
+    xmm1 = _mm256_max_epi16(bit_met_m7_m3, bit_met_m7_m1);
+    xmm2 = _mm256_max_epi16(bit_met_m7_p1, bit_met_m7_p3);
+    xmm3 = _mm256_max_epi16(bit_met_m7_p5, bit_met_m7_p7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(xmm4, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_m5_m7, bit_met_m5_m5);
+    xmm1 = _mm256_max_epi16(bit_met_m5_m3, bit_met_m5_m1);
+    xmm2 = _mm256_max_epi16(bit_met_m5_p1, bit_met_m5_p3);
+    xmm3 = _mm256_max_epi16(bit_met_m5_p5, bit_met_m5_p7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p5_m7, bit_met_p5_m5);
+    xmm1 = _mm256_max_epi16(bit_met_p5_m3, bit_met_p5_m1);
+    xmm2 = _mm256_max_epi16(bit_met_p5_p1, bit_met_p5_p3);
+    xmm3 = _mm256_max_epi16(bit_met_p5_p5, bit_met_p5_p7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p7_m7, bit_met_p7_m5);
+    xmm1 = _mm256_max_epi16(bit_met_p7_m3, bit_met_p7_m1);
+    xmm2 = _mm256_max_epi16(bit_met_p7_p1, bit_met_p7_p3);
+    xmm3 = _mm256_max_epi16(bit_met_p7_p5, bit_met_p7_p7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
+
+    xmm0 = _mm256_max_epi16(bit_met_m3_m7, bit_met_m3_m5);
+    xmm1 = _mm256_max_epi16(bit_met_m3_m3, bit_met_m3_m1);
+    xmm2 = _mm256_max_epi16(bit_met_m3_p1, bit_met_m3_p3);
+    xmm3 = _mm256_max_epi16(bit_met_m3_p5, bit_met_m3_p7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(xmm4, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_m1_m7, bit_met_m1_m5);
+    xmm1 = _mm256_max_epi16(bit_met_m1_m3, bit_met_m1_m1);
+    xmm2 = _mm256_max_epi16(bit_met_m1_p1, bit_met_m1_p3);
+    xmm3 = _mm256_max_epi16(bit_met_m1_p5, bit_met_m1_p7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p1_m7, bit_met_p1_m5);
+    xmm1 = _mm256_max_epi16(bit_met_p1_m3, bit_met_p1_m1);
+    xmm2 = _mm256_max_epi16(bit_met_p1_p1, bit_met_p1_p3);
+    xmm3 = _mm256_max_epi16(bit_met_p1_p5, bit_met_p1_p7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p3_m7, bit_met_p3_m5);
+    xmm1 = _mm256_max_epi16(bit_met_p3_m3, bit_met_p3_m1);
+    xmm2 = _mm256_max_epi16(bit_met_p3_p1, bit_met_p3_p3);
+    xmm3 = _mm256_max_epi16(bit_met_p3_p5, bit_met_p3_p7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
+
+    y2r = _mm256_subs_epi16(logmax_num_re0, logmax_den_re0);
+
+    // Detection for 4th bit (LTE mapping): logmax_den_re0 = max of bit_met_<a>_<b> over all eight <a> levels for <b> in {p7, p5, m5, m7}
+    xmm0 = _mm256_max_epi16(bit_met_p7_p7, bit_met_p5_p7); // row <b> = p7
+    xmm1 = _mm256_max_epi16(bit_met_p3_p7, bit_met_p1_p7);
+    xmm2 = _mm256_max_epi16(bit_met_m1_p7, bit_met_m3_p7);
+    xmm3 = _mm256_max_epi16(bit_met_m5_p7, bit_met_m7_p7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(xmm4, xmm5); // first row initialises the running maximum
+    xmm0 = _mm256_max_epi16(bit_met_p7_p5, bit_met_p5_p5); // row <b> = p5
+    xmm1 = _mm256_max_epi16(bit_met_p3_p5, bit_met_p1_p5);
+    xmm2 = _mm256_max_epi16(bit_met_m1_p5, bit_met_m3_p5);
+    xmm3 = _mm256_max_epi16(bit_met_m5_p5, bit_met_m7_p5); // fix: second operand was bit_met_m5_p5 again, so bit_met_m7_p5 was never compared
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p7_m5, bit_met_p5_m5); // row <b> = m5
+    xmm1 = _mm256_max_epi16(bit_met_p3_m5, bit_met_p1_m5);
+    xmm2 = _mm256_max_epi16(bit_met_m1_m5, bit_met_m3_m5);
+    xmm3 = _mm256_max_epi16(bit_met_m5_m5, bit_met_m7_m5);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p7_m7, bit_met_p5_m7); // row <b> = m7
+    xmm1 = _mm256_max_epi16(bit_met_p3_m7, bit_met_p1_m7);
+    xmm2 = _mm256_max_epi16(bit_met_m1_m7, bit_met_m3_m7);
+    xmm3 = _mm256_max_epi16(bit_met_m5_m7, bit_met_m7_m7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
+
+    xmm0 = _mm256_max_epi16(bit_met_p7_m1, bit_met_p5_m1);
+    xmm1 = _mm256_max_epi16(bit_met_p3_m1, bit_met_p1_m1);
+    xmm2 = _mm256_max_epi16(bit_met_m1_m1, bit_met_m3_m1);
+    xmm3 = _mm256_max_epi16(bit_met_m5_m1, bit_met_m7_m1);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(xmm4, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p7_m3, bit_met_p5_m3);
+    xmm1 = _mm256_max_epi16(bit_met_p3_m3, bit_met_p1_m3);
+    xmm2 = _mm256_max_epi16(bit_met_m1_m3, bit_met_m3_m3);
+    xmm3 = _mm256_max_epi16(bit_met_m5_m3, bit_met_m7_m3);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p7_p1, bit_met_p5_p1);
+    xmm1 = _mm256_max_epi16(bit_met_p3_p1, bit_met_p1_p1);
+    xmm2 = _mm256_max_epi16(bit_met_m1_p1, bit_met_m3_p1);
+    xmm3 = _mm256_max_epi16(bit_met_m5_p1, bit_met_m7_p1);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p7_p3, bit_met_p5_p3);
+    xmm1 = _mm256_max_epi16(bit_met_p3_p3, bit_met_p1_p3);
+    xmm2 = _mm256_max_epi16(bit_met_m1_p3, bit_met_m3_p3);
+    xmm3 = _mm256_max_epi16(bit_met_m5_p3, bit_met_m7_p3);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
+
+    y0i = _mm256_subs_epi16(logmax_num_re0, logmax_den_re0);
+
+
+    // Detection for 5th bit (LTE mapping)
+    xmm0 = _mm256_max_epi16(bit_met_m7_m7, bit_met_m7_m5);
+    xmm1 = _mm256_max_epi16(bit_met_m7_m3, bit_met_m7_m1);
+    xmm2 = _mm256_max_epi16(bit_met_m7_p1, bit_met_m7_p3);
+    xmm3 = _mm256_max_epi16(bit_met_m7_p5, bit_met_m7_p7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(xmm4, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_m1_m7, bit_met_m1_m5);
+    xmm1 = _mm256_max_epi16(bit_met_m1_m3, bit_met_m1_m1);
+    xmm2 = _mm256_max_epi16(bit_met_m1_p1, bit_met_m1_p3);
+    xmm3 = _mm256_max_epi16(bit_met_m1_p5, bit_met_m1_p7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p1_m7, bit_met_p1_m5);
+    xmm1 = _mm256_max_epi16(bit_met_p1_m3, bit_met_p1_m1);
+    xmm2 = _mm256_max_epi16(bit_met_p1_p1, bit_met_p1_p3);
+    xmm3 = _mm256_max_epi16(bit_met_p1_p5, bit_met_p1_p7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p7_m7, bit_met_p7_m5);
+    xmm1 = _mm256_max_epi16(bit_met_p7_m3, bit_met_p7_m1);
+    xmm2 = _mm256_max_epi16(bit_met_p7_p1, bit_met_p7_p3);
+    xmm3 = _mm256_max_epi16(bit_met_p7_p5, bit_met_p7_p7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
+
+    xmm0 = _mm256_max_epi16(bit_met_m5_m7, bit_met_m5_m5);
+    xmm1 = _mm256_max_epi16(bit_met_m5_m3, bit_met_m5_m1);
+    xmm2 = _mm256_max_epi16(bit_met_m5_p1, bit_met_m5_p3);
+    xmm3 = _mm256_max_epi16(bit_met_m5_p5, bit_met_m5_p7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(xmm4, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_m3_m7, bit_met_m3_m5);
+    xmm1 = _mm256_max_epi16(bit_met_m3_m3, bit_met_m3_m1);
+    xmm2 = _mm256_max_epi16(bit_met_m3_p1, bit_met_m3_p3);
+    xmm3 = _mm256_max_epi16(bit_met_m3_p5, bit_met_m3_p7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p3_m7, bit_met_p3_m5);
+    xmm1 = _mm256_max_epi16(bit_met_p3_m3, bit_met_p3_m1);
+    xmm2 = _mm256_max_epi16(bit_met_p3_p1, bit_met_p3_p3);
+    xmm3 = _mm256_max_epi16(bit_met_p3_p5, bit_met_p3_p7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p5_m7, bit_met_p5_m5);
+    xmm1 = _mm256_max_epi16(bit_met_p5_m3, bit_met_p5_m1);
+    xmm2 = _mm256_max_epi16(bit_met_p5_p1, bit_met_p5_p3);
+    xmm3 = _mm256_max_epi16(bit_met_p5_p5, bit_met_p5_p7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
+
+    y1i = _mm256_subs_epi16(logmax_num_re0, logmax_den_re0);
+
+    // Detection for 6th bit (LTE mapping)
+    xmm0 = _mm256_max_epi16(bit_met_p7_p7, bit_met_p5_p7);
+    xmm1 = _mm256_max_epi16(bit_met_p3_p7, bit_met_p1_p7);
+    xmm2 = _mm256_max_epi16(bit_met_m1_p7, bit_met_m3_p7);
+    xmm3 = _mm256_max_epi16(bit_met_m5_p7, bit_met_m7_p7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(xmm4, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p7_p1, bit_met_p5_p1); // 6th-bit denominator, *_p1 row: max over all eight bit_met_*_p1 metrics
+    xmm1 = _mm256_max_epi16(bit_met_p3_p1, bit_met_p1_p1);
+    xmm2 = _mm256_max_epi16(bit_met_m1_p1, bit_met_m3_p1);
+    xmm3 = _mm256_max_epi16(bit_met_m5_p1, bit_met_m7_p1); // pair m5 with m7 (as in the other rows) so m7_p1 is not dropped from the max
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p7_m1, bit_met_p5_m1);
+    xmm1 = _mm256_max_epi16(bit_met_p3_m1, bit_met_p1_m1);
+    xmm2 = _mm256_max_epi16(bit_met_m1_m1, bit_met_m3_m1);
+    xmm3 = _mm256_max_epi16(bit_met_m5_m1, bit_met_m7_m1);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p7_m7, bit_met_p5_m7);
+    xmm1 = _mm256_max_epi16(bit_met_p3_m7, bit_met_p1_m7);
+    xmm2 = _mm256_max_epi16(bit_met_m1_m7, bit_met_m3_m7);
+    xmm3 = _mm256_max_epi16(bit_met_m5_m7, bit_met_m7_m7);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
+    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
+
+    xmm0 = _mm256_max_epi16(bit_met_p7_m5, bit_met_p5_m5);
+    xmm1 = _mm256_max_epi16(bit_met_p3_m5, bit_met_p1_m5);
+    xmm2 = _mm256_max_epi16(bit_met_m1_m5, bit_met_m3_m5);
+    xmm3 = _mm256_max_epi16(bit_met_m5_m5, bit_met_m7_m5);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(xmm4, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p7_m3, bit_met_p5_m3);
+    xmm1 = _mm256_max_epi16(bit_met_p3_m3, bit_met_p1_m3);
+    xmm2 = _mm256_max_epi16(bit_met_m1_m3, bit_met_m3_m3);
+    xmm3 = _mm256_max_epi16(bit_met_m5_m3, bit_met_m7_m3);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p7_p3, bit_met_p5_p3);
+    xmm1 = _mm256_max_epi16(bit_met_p3_p3, bit_met_p1_p3);
+    xmm2 = _mm256_max_epi16(bit_met_m1_p3, bit_met_m3_p3);
+    xmm3 = _mm256_max_epi16(bit_met_m5_p3, bit_met_m7_p3);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
+    xmm0 = _mm256_max_epi16(bit_met_p7_p5, bit_met_p5_p5);
+    xmm1 = _mm256_max_epi16(bit_met_p3_p5, bit_met_p1_p5);
+    xmm2 = _mm256_max_epi16(bit_met_m1_p5, bit_met_m3_p5);
+    xmm3 = _mm256_max_epi16(bit_met_m5_p5, bit_met_m7_p5);
+    xmm4 = _mm256_max_epi16(xmm0, xmm1);
+    xmm5 = _mm256_max_epi16(xmm2, xmm3);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
+    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
+
+    y2i = _mm256_subs_epi16(logmax_num_re0, logmax_den_re0);
+
+    // map to output stream, difficult to do in SIMD since we have 6 16bit LLRs
+    // RE 1
+    j = 48*i;
+    stream0_out[j + 0] = ((short *)&y0r)[0];
+    stream0_out[j + 1] = ((short *)&y1r)[0];
+    stream0_out[j + 2] = ((short *)&y2r)[0];
+    stream0_out[j + 3] = ((short *)&y0i)[0];
+    stream0_out[j + 4] = ((short *)&y1i)[0];
+    stream0_out[j + 5] = ((short *)&y2i)[0];
+    // RE 2
+    stream0_out[j + 6] = ((short *)&y0r)[1];
+    stream0_out[j + 7] = ((short *)&y1r)[1];
+    stream0_out[j + 8] = ((short *)&y2r)[1];
+    stream0_out[j + 9] = ((short *)&y0i)[1];
+    stream0_out[j + 10] = ((short *)&y1i)[1];
+    stream0_out[j + 11] = ((short *)&y2i)[1];
+    // RE 3
+    stream0_out[j + 12] = ((short *)&y0r)[2];
+    stream0_out[j + 13] = ((short *)&y1r)[2];
+    stream0_out[j + 14] = ((short *)&y2r)[2];
+    stream0_out[j + 15] = ((short *)&y0i)[2];
+    stream0_out[j + 16] = ((short *)&y1i)[2];
+    stream0_out[j + 17] = ((short *)&y2i)[2];
+    // RE 4
+    stream0_out[j + 18] = ((short *)&y0r)[3];
+    stream0_out[j + 19] = ((short *)&y1r)[3];
+    stream0_out[j + 20] = ((short *)&y2r)[3];
+    stream0_out[j + 21] = ((short *)&y0i)[3];
+    stream0_out[j + 22] = ((short *)&y1i)[3];
+    stream0_out[j + 23] = ((short *)&y2i)[3];
+    // RE 5
+    stream0_out[j + 24] = ((short *)&y0r)[4];
+    stream0_out[j + 25] = ((short *)&y1r)[4];
+    stream0_out[j + 26] = ((short *)&y2r)[4];
+    stream0_out[j + 27] = ((short *)&y0i)[4];
+    stream0_out[j + 28] = ((short *)&y1i)[4];
+    stream0_out[j + 29] = ((short *)&y2i)[4];
+    // RE 6
+    stream0_out[j + 30] = ((short *)&y0r)[5];
+    stream0_out[j + 31] = ((short *)&y1r)[5];
+    stream0_out[j + 32] = ((short *)&y2r)[5];
+    stream0_out[j + 33] = ((short *)&y0i)[5];
+    stream0_out[j + 34] = ((short *)&y1i)[5];
+    stream0_out[j + 35] = ((short *)&y2i)[5];
+    // RE 7
+    stream0_out[j + 36] = ((short *)&y0r)[6];
+    stream0_out[j + 37] = ((short *)&y1r)[6];
+    stream0_out[j + 38] = ((short *)&y2r)[6];
+    stream0_out[j + 39] = ((short *)&y0i)[6];
+    stream0_out[j + 40] = ((short *)&y1i)[6];
+    stream0_out[j + 41] = ((short *)&y2i)[6];
+    // RE 8
+    stream0_out[j + 42] = ((short *)&y0r)[7];
+    stream0_out[j + 43] = ((short *)&y1r)[7];
+    stream0_out[j + 44] = ((short *)&y2r)[7];
+    stream0_out[j + 45] = ((short *)&y0i)[7];
+    stream0_out[j + 46] = ((short *)&y1i)[7];
+    stream0_out[j + 47] = ((short *)&y2i)[7];
+
+    // RE 9
+    stream0_out[j + 48] = ((short *)&y0r)[8];
+    stream0_out[j + 49] = ((short *)&y1r)[8];
+    stream0_out[j + 50] = ((short *)&y2r)[8];
+    stream0_out[j + 51] = ((short *)&y0i)[8];
+    stream0_out[j + 52] = ((short *)&y1i)[8];
+    stream0_out[j + 53] = ((short *)&y2i)[8];
+    // RE 10
+    stream0_out[j + 54] = ((short *)&y0r)[9];
+    stream0_out[j + 55] = ((short *)&y1r)[9];
+    stream0_out[j + 56] = ((short *)&y2r)[9];
+    stream0_out[j + 57] = ((short *)&y0i)[9];
+    stream0_out[j + 58] = ((short *)&y1i)[9];
+    stream0_out[j + 59] = ((short *)&y2i)[9];
+    // RE 11
+    stream0_out[j + 60] = ((short *)&y0r)[10];
+    stream0_out[j + 61] = ((short *)&y1r)[10];
+    stream0_out[j + 62] = ((short *)&y2r)[10];
+    stream0_out[j + 63] = ((short *)&y0i)[10];
+    stream0_out[j + 64] = ((short *)&y1i)[10];
+    stream0_out[j + 65] = ((short *)&y2i)[10];
+    // RE 12
+    stream0_out[j + 66] = ((short *)&y0r)[11];
+    stream0_out[j + 67] = ((short *)&y1r)[11];
+    stream0_out[j + 68] = ((short *)&y2r)[11];
+    stream0_out[j + 69] = ((short *)&y0i)[11];
+    stream0_out[j + 70] = ((short *)&y1i)[11];
+    stream0_out[j + 71] = ((short *)&y2i)[11];
+    // RE 13
+    stream0_out[j + 72] = ((short *)&y0r)[12];
+    stream0_out[j + 73] = ((short *)&y1r)[12];
+    stream0_out[j + 74] = ((short *)&y2r)[12];
+    stream0_out[j + 75] = ((short *)&y0i)[12];
+    stream0_out[j + 76] = ((short *)&y1i)[12];
+    stream0_out[j + 77] = ((short *)&y2i)[12];
+    // RE 14
+    stream0_out[j + 78] = ((short *)&y0r)[13];
+    stream0_out[j + 79] = ((short *)&y1r)[13];
+    stream0_out[j + 80] = ((short *)&y2r)[13];
+    stream0_out[j + 81] = ((short *)&y0i)[13];
+    stream0_out[j + 82] = ((short *)&y1i)[13];
+    stream0_out[j + 83] = ((short *)&y2i)[13];
+    // RE 15
+    stream0_out[j + 84] = ((short *)&y0r)[14];
+    stream0_out[j + 85] = ((short *)&y1r)[14];
+    stream0_out[j + 86] = ((short *)&y2r)[14];
+    stream0_out[j + 87] = ((short *)&y0i)[14];
+    stream0_out[j + 88] = ((short *)&y1i)[14];
+    stream0_out[j + 89] = ((short *)&y2i)[14];
+    // RE 16
+    stream0_out[j + 90] = ((short *)&y0r)[15];
+    stream0_out[j + 91] = ((short *)&y1r)[15];
+    stream0_out[j + 92] = ((short *)&y2r)[15];
+    stream0_out[j + 93] = ((short *)&y0i)[15];
+    stream0_out[j + 94] = ((short *)&y1i)[15];
+    stream0_out[j + 95] = ((short *)&y2i)[15];
+
+#elif defined(__arm__)
+
+#endif
+
+  }
+
+#if defined(__x86_64__) || defined(__i386__)
+  _mm_empty();
+  _m_empty();
+#endif
+}
diff --git a/openair1/PHY/LTE_TRANSPORT/print_stats.c b/openair1/PHY/LTE_TRANSPORT/print_stats.c
index c8d4292e8d03c8866f6a053ce1c7a45d37b591ad..c3e40e1e72f323b108d740b7d5db815f76cf8cd1 100644
--- a/openair1/PHY/LTE_TRANSPORT/print_stats.c
+++ b/openair1/PHY/LTE_TRANSPORT/print_stats.c
@@ -107,9 +107,9 @@ int dump_ue_stats(PHY_VARS_UE *ue, UE_rxtx_proc_t *proc,char* buffer, int length
       len += sprintf(&buffer[len], "[UE PROC] Po_PUCCH = %d dBm (Po_NOMINAL_PUCCH %d dBm, g_pucch %d dB)\n", 
 		     get_PL(ue->Mod_id,ue->CC_id,0)+
 		     ue->frame_parms.ul_power_control_config_common.p0_NominalPUCCH+
-		     ue->dlsch[0][0]->g_pucch,
+		     ue->dlsch[0][0][0]->g_pucch,
 		     ue->frame_parms.ul_power_control_config_common.p0_NominalPUCCH,
-		     ue->dlsch[0][0]->g_pucch);
+		     ue->dlsch[0][0][0]->g_pucch);
     }
     //for (eNB=0;eNB<NUMBER_OF_eNB_MAX;eNB++) {
     for (eNB=0; eNB<1; eNB++) {
@@ -482,24 +482,24 @@ int dump_ue_stats(PHY_VARS_UE *ue, UE_rxtx_proc_t *proc,char* buffer, int length
         len += sprintf(&buffer[len], "[UE PROC] Mode 6 Wideband CQI eNB %d : %d dB\n",eNB,ue->measurements.precoded_cqi_dB[eNB][0]);
 
       for (harq_pid=0;harq_pid<8;harq_pid++) {
-	len+=sprintf(&buffer[len],"[UE PROC] eNB %d: CW 0 harq_pid %d, mcs %d:",eNB,harq_pid,ue->dlsch[0][0]->harq_processes[harq_pid]->mcs);
+	len+=sprintf(&buffer[len],"[UE PROC] eNB %d: CW 0 harq_pid %d, mcs %d:",eNB,harq_pid,ue->dlsch[0][0][0]->harq_processes[harq_pid]->mcs);
 	for (round=0;round<8;round++)
 	  len+=sprintf(&buffer[len],"%d/%d ",
-		       ue->dlsch[0][0]->harq_processes[harq_pid]->errors[round],
-		       ue->dlsch[0][0]->harq_processes[harq_pid]->trials[round]);
+		       ue->dlsch[0][0][0]->harq_processes[harq_pid]->errors[round],
+		       ue->dlsch[0][0][0]->harq_processes[harq_pid]->trials[round]);
 	len+=sprintf(&buffer[len],"\n");
       }
-      if (ue->dlsch[0] && ue->dlsch[0][0] && ue->dlsch[0][1]) {
-        len += sprintf(&buffer[len], "[UE PROC] Saved PMI for DLSCH eNB %d : %jx (%p)\n",eNB,pmi2hex_2Ar1(ue->dlsch[0][0]->pmi_alloc),ue->dlsch[0][0]);
+      if (ue->dlsch[0][0] && ue->dlsch[0][0][0] && ue->dlsch[0][0][1]) {
+        len += sprintf(&buffer[len], "[UE PROC] Saved PMI for DLSCH eNB %d : %jx (%p)\n",eNB,pmi2hex_2Ar1(ue->dlsch[0][0][0]->pmi_alloc),ue->dlsch[0][0][0]);
 
-        len += sprintf(&buffer[len], "[UE PROC] eNB %d: dl_power_off = %d\n",eNB,ue->dlsch[0][0]->harq_processes[0]->dl_power_off);
+        len += sprintf(&buffer[len], "[UE PROC] eNB %d: dl_power_off = %d\n",eNB,ue->dlsch[0][0][0]->harq_processes[0]->dl_power_off);
 
 	for (harq_pid=0;harq_pid<8;harq_pid++) {
-	  len+=sprintf(&buffer[len],"[UE PROC] eNB %d: CW 1 harq_pid %d, mcs %d:",eNB,harq_pid,ue->dlsch[0][1]->harq_processes[0]->mcs);
+	  len+=sprintf(&buffer[len],"[UE PROC] eNB %d: CW 1 harq_pid %d, mcs %d:",eNB,harq_pid,ue->dlsch[0][0][1]->harq_processes[harq_pid]->mcs); // mcs of the harq process being printed, as in the CW 0 loop
 	  for (round=0;round<8;round++)
 	    len+=sprintf(&buffer[len],"%d/%d ",
-			 ue->dlsch[0][1]->harq_processes[harq_pid]->errors[round],
-			 ue->dlsch[0][1]->harq_processes[harq_pid]->trials[round]);
+			 ue->dlsch[0][0][1]->harq_processes[harq_pid]->errors[round],
+			 ue->dlsch[0][0][1]->harq_processes[harq_pid]->trials[round]);
 	  len+=sprintf(&buffer[len],"\n");
 	}
       }
diff --git a/openair1/PHY/LTE_TRANSPORT/proto.h b/openair1/PHY/LTE_TRANSPORT/proto.h
index 5fc8dea9867cee8eac0abf65256d5d0179628afc..fe7991e343ab200cf96d8d5f7a34f66a29a8fba0 100644
--- a/openair1/PHY/LTE_TRANSPORT/proto.h
+++ b/openair1/PHY/LTE_TRANSPORT/proto.h
@@ -704,6 +704,22 @@ void qam64_qam16(short *stream0_in,
                  short *rho01,
                  int length);
 
+/** \brief AVX2 variant of qam64_qam16: computes the LLRs for ML (max-logsum approximation) dual-stream 64QAM/16QAM reception.
+    @param stream0_in Input from channel compensated (MR combined) stream 0
+    @param stream1_in Input from channel compensated (MR combined) stream 1
+    @param ch_mag   Input from scaled channel magnitude square of h0'*g0
+    @param ch_mag_i Input from scaled channel magnitude square of h0'*g1
+    @param stream0_out Output from LLR unit for stream0
+    @param rho01 Cross-correlation between channels (MR combined)
+    @param length in complex channel outputs*/
+void qam64_qam16_avx2(short *stream0_in,
+                      short *stream1_in,
+                      short *ch_mag,
+                      short *ch_mag_i,
+                      short *stream0_out,
+                      short *rho01,
+                      int length);
+
 /** \brief This function perform LLR computation for dual-stream (64QAM/16QAM) transmission.
     @param frame_parms Frame descriptor structure
     @param rxdataF_comp Compensated channel output
@@ -746,6 +762,22 @@ void qam64_qam64(short *stream0_in,
                  short *rho01,
                  int length);
 
+/** \brief AVX2 variant of qam64_qam64: computes the LLRs for ML (max-logsum approximation) dual-stream 64QAM/64QAM reception.
+    @param stream0_in Input from channel compensated (MR combined) stream 0
+    @param stream1_in Input from channel compensated (MR combined) stream 1
+    @param ch_mag   Input from scaled channel magnitude square of h0'*g0
+    @param ch_mag_i Input from scaled channel magnitude square of h0'*g1
+    @param stream0_out Output from LLR unit for stream0
+    @param rho01 Cross-correlation between channels (MR combined)
+    @param length in complex channel outputs*/
+void qam64_qam64_avx2(int32_t *stream0_in,
+                      int32_t *stream1_in,
+                      int32_t *ch_mag,
+                      int32_t *ch_mag_i,
+                      int16_t *stream0_out,
+                      int32_t *rho01,
+                      int length);
+
 /** \brief This function perform LLR computation for dual-stream (64QAM/64QAM) transmission.
     @param frame_parms Frame descriptor structure
     @param rxdataF_comp Compensated channel output
diff --git a/openair1/PHY/LTE_TRANSPORT/ulsch_coding.c b/openair1/PHY/LTE_TRANSPORT/ulsch_coding.c
index 9571ea6cd52823d8409a2781bdd1c2ab5fc7c6e4..dbb7a16ac9cbb983a650b5811e9cfc594e2245e3 100644
--- a/openair1/PHY/LTE_TRANSPORT/ulsch_coding.c
+++ b/openair1/PHY/LTE_TRANSPORT/ulsch_coding.c
@@ -233,7 +233,7 @@ uint32_t ulsch_encoding(uint8_t *a,
   LTE_DL_FRAME_PARMS *frame_parms=&ue->frame_parms;
   PHY_MEASUREMENTS *meas = &ue->measurements;
   LTE_UE_ULSCH_t *ulsch=ue->ulsch[eNB_id];
-  LTE_UE_DLSCH_t **dlsch = ue->dlsch[eNB_id];
+  LTE_UE_DLSCH_t **dlsch = ue->dlsch[0][eNB_id];
   uint16_t rnti = 0xffff;
 
   if (!ulsch) {
@@ -966,7 +966,7 @@ int ulsch_encoding_emul(uint8_t *ulsch_buffer,
 {
 
   LTE_UE_ULSCH_t *ulsch = ue->ulsch[eNB_id];
-  LTE_UE_DLSCH_t **dlsch = ue->dlsch[eNB_id];
+  LTE_UE_DLSCH_t **dlsch = ue->dlsch[0][eNB_id];
   PHY_MEASUREMENTS *meas = &ue->measurements;
   uint8_t tmode = ue->transmission_mode[eNB_id];
   uint16_t rnti=ue->pdcch_vars[eNB_id]->crnti;
diff --git a/openair1/PHY/LTE_TRANSPORT/ulsch_decoding.c b/openair1/PHY/LTE_TRANSPORT/ulsch_decoding.c
index ca67ccdb3c9c34dbf01b7752201ebce456b34017..bb2a5b3e224652d3fc62c6f2594743d88dc30cc1 100644
--- a/openair1/PHY/LTE_TRANSPORT/ulsch_decoding.c
+++ b/openair1/PHY/LTE_TRANSPORT/ulsch_decoding.c
@@ -2066,7 +2066,7 @@ uint32_t ulsch_decoding_emul(PHY_VARS_eNB *eNB, eNB_rxtx_proc_t *proc,
     // get local ue's ack
     if ((UE_index >= oai_emulation.info.first_ue_local) ||(UE_index <(oai_emulation.info.first_ue_local+oai_emulation.info.nb_ue_local))) {
       get_ack(&eNB->frame_parms,
-              PHY_vars_UE_g[UE_id][CC_id]->dlsch[0][0]->harq_ack,
+              PHY_vars_UE_g[UE_id][CC_id]->dlsch[0][0][0]->harq_ack,
               subframe,
               eNB->ulsch[UE_index]->harq_processes[harq_pid]->o_ACK,0);
     } else { // get remote UEs' ack
diff --git a/openair1/PHY/Makefile.inc b/openair1/PHY/Makefile.inc
index 1586f353c0f623853f6149ca91f313630847b92a..90094b31f8605946ba6c18f31eb070b4ff99f484 100644
--- a/openair1/PHY/Makefile.inc
+++ b/openair1/PHY/Makefile.inc
@@ -7,6 +7,7 @@ PHY_OBJS += $(TOP_DIR)/PHY/LTE_TRANSPORT/dlsch_coding.o
 PHY_OBJS += $(TOP_DIR)/PHY/LTE_TRANSPORT/dlsch_modulation.o
 PHY_OBJS += $(TOP_DIR)/PHY/LTE_TRANSPORT/dlsch_demodulation.o
 PHY_OBJS += $(TOP_DIR)/PHY/LTE_TRANSPORT/dlsch_llr_computation.o
+PHY_OBJS += $(TOP_DIR)/PHY/LTE_TRANSPORT/dlsch_llr_computation_avx2.o
 PHY_OBJS += $(TOP_DIR)/PHY/LTE_TRANSPORT/power_control.o
 PHY_OBJS += $(TOP_DIR)/PHY/LTE_TRANSPORT/dlsch_decoding.o
 PHY_OBJS += $(TOP_DIR)/PHY/LTE_TRANSPORT/dlsch_scrambling.o
diff --git a/openair1/PHY/TOOLS/lte_phy_scope.c b/openair1/PHY/TOOLS/lte_phy_scope.c
index ca08f875190d18a8def154faadd11d18418299c2..7a533464530f337d4aeba89722164670a830be63 100644
--- a/openair1/PHY/TOOLS/lte_phy_scope.c
+++ b/openair1/PHY/TOOLS/lte_phy_scope.c
@@ -510,16 +510,16 @@ void phy_scope_UE(FD_lte_phy_scope_ue *form,
   int beamforming_mode = phy_vars_ue->transmission_mode[eNB_id]>6 ? phy_vars_ue->transmission_mode[eNB_id] : 0;
 
 
-  if (phy_vars_ue->dlsch[eNB_id][0]!=NULL) {
-    harq_pid = phy_vars_ue->dlsch[eNB_id][0]->current_harq_pid;
+  if (phy_vars_ue->dlsch[subframe&0x1][eNB_id][0]!=NULL) {
+    harq_pid = phy_vars_ue->dlsch[subframe&0x1][eNB_id][0]->current_harq_pid;
 
     if (harq_pid>=8)
       return;
 
-    mcs = phy_vars_ue->dlsch[eNB_id][0]->harq_processes[harq_pid]->mcs;
+    mcs = phy_vars_ue->dlsch[subframe&0x1][eNB_id][0]->harq_processes[harq_pid]->mcs;
 
     // Button 0
-    if(!phy_vars_ue->dlsch[eNB_id][0]->harq_processes[harq_pid]->dl_power_off) {
+    if(!phy_vars_ue->dlsch[subframe&0x1][eNB_id][0]->harq_processes[harq_pid]->dl_power_off) {
       // we are in TM5
       fl_show_object(form->button_0);
     }
@@ -530,12 +530,12 @@ void phy_scope_UE(FD_lte_phy_scope_ue *form,
   }
 
   //    coded_bits_per_codeword = frame_parms->N_RB_DL*12*get_Qm(mcs)*(frame_parms->symbols_per_tti);
-  if (phy_vars_ue->dlsch[eNB_id][0]!=NULL) {
+  if (phy_vars_ue->dlsch[subframe&0x1][eNB_id][0]!=NULL) {
     coded_bits_per_codeword = get_G(frame_parms,
-                                    phy_vars_ue->dlsch[eNB_id][0]->harq_processes[harq_pid]->nb_rb,
-                                    phy_vars_ue->dlsch[eNB_id][0]->harq_processes[harq_pid]->rb_alloc_even,
+                                    phy_vars_ue->dlsch[subframe&0x1][eNB_id][0]->harq_processes[harq_pid]->nb_rb,
+                                    phy_vars_ue->dlsch[subframe&0x1][eNB_id][0]->harq_processes[harq_pid]->rb_alloc_even,
                                     get_Qm(mcs),
-                                    phy_vars_ue->dlsch[eNB_id][0]->harq_processes[harq_pid]->Nl,
+                                    phy_vars_ue->dlsch[subframe&0x1][eNB_id][0]->harq_processes[harq_pid]->Nl,
                                     num_pdcch_symbols,
                                     frame,
                                     subframe,
diff --git a/openair1/PHY/TOOLS/lte_phy_scope_tm4.c b/openair1/PHY/TOOLS/lte_phy_scope_tm4.c
index 60c5ace2c5379471d7746421daad79f9a1844cfc..6807f1f6f48a6de4ee057c9c2341b80084526448 100755
--- a/openair1/PHY/TOOLS/lte_phy_scope_tm4.c
+++ b/openair1/PHY/TOOLS/lte_phy_scope_tm4.c
@@ -448,11 +448,11 @@ void phy_scope_UE(FD_lte_phy_scope_ue *form,
     int mcs1=0;
     unsigned char harq_pid = 0;
     int beamforming_mode = phy_vars_ue->transmission_mode[eNB_id]>6 ? phy_vars_ue->transmission_mode[eNB_id] : 0;
-    if (phy_vars_ue->dlsch[eNB_id][0]!=NULL) {
-        harq_pid = phy_vars_ue->dlsch[eNB_id][0]->current_harq_pid;
+    if (phy_vars_ue->dlsch[subframe&0x1][eNB_id][0]!=NULL) {
+        harq_pid = phy_vars_ue->dlsch[subframe&0x1][eNB_id][0]->current_harq_pid;
   if (harq_pid>=8)
     return;
-    mcs0 = phy_vars_ue->dlsch[eNB_id][0]->harq_processes[harq_pid]->mcs;
+    mcs0 = phy_vars_ue->dlsch[subframe&0x1][eNB_id][0]->harq_processes[harq_pid]->mcs;
         // Button 0
   /*
         if(!phy_vars_ue->dlsch_ue[eNB_id][0]->harq_processes[harq_pid]->dl_power_off) {
@@ -461,23 +461,23 @@ void phy_scope_UE(FD_lte_phy_scope_ue *form,
         }
   */
     }
-       if (phy_vars_ue->dlsch[eNB_id][1]!=NULL) {
-        harq_pid = phy_vars_ue->dlsch[eNB_id][1]->current_harq_pid;
+       if (phy_vars_ue->dlsch[subframe&0x1][eNB_id][1]!=NULL) {
+        harq_pid = phy_vars_ue->dlsch[subframe&0x1][eNB_id][1]->current_harq_pid;
   if (harq_pid>=8)
     return;
-    mcs1 = phy_vars_ue->dlsch[eNB_id][1]->harq_processes[harq_pid]->mcs;
+    mcs1 = phy_vars_ue->dlsch[subframe&0x1][eNB_id][1]->harq_processes[harq_pid]->mcs;
     }
     if (phy_vars_ue->pdcch_vars[eNB_id]!=NULL) {
         num_pdcch_symbols = phy_vars_ue->pdcch_vars[eNB_id]->num_pdcch_symbols;
     }
     //    coded_bits_per_codeword = frame_parms->N_RB_DL*12*get_Qm(mcs)*(frame_parms->symbols_per_tti);
-    if (phy_vars_ue->dlsch[eNB_id][0]!=NULL) {
+    if (phy_vars_ue->dlsch[subframe&0x1][eNB_id][0]!=NULL) {
       mod0 = get_Qm(mcs0);
       coded_bits_per_codeword0 = get_G(frame_parms,
-              phy_vars_ue->dlsch[eNB_id][0]->harq_processes[harq_pid]->nb_rb,
-              phy_vars_ue->dlsch[eNB_id][0]->harq_processes[harq_pid]->rb_alloc_even,
+              phy_vars_ue->dlsch[subframe&0x1][eNB_id][0]->harq_processes[harq_pid]->nb_rb,
+              phy_vars_ue->dlsch[subframe&0x1][eNB_id][0]->harq_processes[harq_pid]->rb_alloc_even,
               get_Qm(mcs0),
-              phy_vars_ue->dlsch[eNB_id][0]->harq_processes[harq_pid]->Nl,
+              phy_vars_ue->dlsch[subframe&0x1][eNB_id][0]->harq_processes[harq_pid]->Nl,
               num_pdcch_symbols,
               frame,
               subframe,
@@ -486,13 +486,13 @@ void phy_scope_UE(FD_lte_phy_scope_ue *form,
       coded_bits_per_codeword0 = 0; //frame_parms->N_RB_DL*12*get_Qm(mcs)*(frame_parms->symbols_per_tti);
       mod0=0;
     }
-    if (phy_vars_ue->dlsch[eNB_id][1]!=NULL) {
+    if (phy_vars_ue->dlsch[subframe&0x1][eNB_id][1]!=NULL) {
       mod1 = get_Qm(mcs1);
       coded_bits_per_codeword1 = get_G(frame_parms,
-               phy_vars_ue->dlsch[eNB_id][1]->harq_processes[harq_pid]->nb_rb,
-               phy_vars_ue->dlsch[eNB_id][1]->harq_processes[harq_pid]->rb_alloc_even,
+               phy_vars_ue->dlsch[subframe&0x1][eNB_id][1]->harq_processes[harq_pid]->nb_rb,
+               phy_vars_ue->dlsch[subframe&0x1][eNB_id][1]->harq_processes[harq_pid]->rb_alloc_even,
                get_Qm(mcs1),
-               phy_vars_ue->dlsch[eNB_id][1]->harq_processes[harq_pid]->Nl,
+               phy_vars_ue->dlsch[subframe&0x1][eNB_id][1]->harq_processes[harq_pid]->Nl,
                num_pdcch_symbols,
                frame,
                subframe,
diff --git a/openair1/PHY/defs.h b/openair1/PHY/defs.h
index d70bfb3f722df0bcf526266d633b932bb534057b..5c1b1158ba45a62f9a206e45022a746a5bd23db7 100644
--- a/openair1/PHY/defs.h
+++ b/openair1/PHY/defs.h
@@ -706,7 +706,7 @@ typedef struct {
   LTE_DL_FRAME_PARMS  frame_parms_before_ho;
   LTE_UE_COMMON    common_vars;
 
-  LTE_UE_PDSCH     *pdsch_vars[2][NUMBER_OF_CONNECTED_eNB_MAX+1];
+  LTE_UE_PDSCH     *pdsch_vars[2][NUMBER_OF_CONNECTED_eNB_MAX+1]; // two RxTx Threads
   LTE_UE_PDSCH_FLP *pdsch_vars_flp[NUMBER_OF_CONNECTED_eNB_MAX+1];
   LTE_UE_PDSCH     *pdsch_vars_SI[NUMBER_OF_CONNECTED_eNB_MAX+1];
   LTE_UE_PDSCH     *pdsch_vars_ra[NUMBER_OF_CONNECTED_eNB_MAX+1];
@@ -715,7 +715,7 @@ typedef struct {
   LTE_UE_PBCH      *pbch_vars[NUMBER_OF_CONNECTED_eNB_MAX];
   LTE_UE_PDCCH     *pdcch_vars[NUMBER_OF_CONNECTED_eNB_MAX];
   LTE_UE_PRACH     *prach_vars[NUMBER_OF_CONNECTED_eNB_MAX];
-  LTE_UE_DLSCH_t   *dlsch[NUMBER_OF_CONNECTED_eNB_MAX][2];
+  LTE_UE_DLSCH_t   *dlsch[2][NUMBER_OF_CONNECTED_eNB_MAX][2]; // two RxTx Threads
   LTE_UE_ULSCH_t   *ulsch[NUMBER_OF_CONNECTED_eNB_MAX];
   LTE_UE_DLSCH_t   *dlsch_SI[NUMBER_OF_CONNECTED_eNB_MAX];
   LTE_UE_DLSCH_t   *dlsch_ra[NUMBER_OF_CONNECTED_eNB_MAX];
@@ -802,6 +802,8 @@ typedef struct {
   uint8_t               prach_cnt;
   uint8_t               prach_PreambleIndex;
   //  uint8_t               prach_timer;
+  uint8_t               decode_SIB;
+  uint8_t               decode_MIB;
   int              rx_offset; /// Timing offset
   int              rx_offset_diff; /// Timing adjustment for ofdm symbol0 on HW USRP
   int              timing_advance; ///timing advance signalled from eNB
@@ -872,7 +874,7 @@ typedef struct {
 
   time_stats_t phy_proc;
   time_stats_t phy_proc_tx;
-  time_stats_t phy_proc_rx;
+  time_stats_t phy_proc_rx[2];
 
   uint32_t use_ia_receiver;
 
@@ -885,6 +887,10 @@ typedef struct {
   time_stats_t ulsch_interleaving_stats;
   time_stats_t ulsch_multiplexing_stats;
 
+  time_stats_t generic_stat;
+  time_stats_t pdsch_procedures_stat;
+  time_stats_t dlsch_procedures_stat;
+
   time_stats_t ofdm_demod_stats;
   time_stats_t dlsch_rx_pdcch_stats;
   time_stats_t rx_dft_stats;
diff --git a/openair1/SCHED/phy_procedures_lte_ue.c b/openair1/SCHED/phy_procedures_lte_ue.c
index cd8fbdbc8eec0b3130e91eb6645f9ca69f6f8278..4a75d5d9e7aedd9fdf70e7a889747be0df93ce8d 100644
--- a/openair1/SCHED/phy_procedures_lte_ue.c
+++ b/openair1/SCHED/phy_procedures_lte_ue.c
@@ -75,6 +75,7 @@ fifo_dump_emos_UE emos_dump_UE;
 
 extern int oai_exit;
 
+extern double cpuf;
 
 
 
@@ -90,10 +91,10 @@ void dump_dlsch(PHY_VARS_UE *ue,UE_rxtx_proc_t *proc,uint8_t eNB_id,uint8_t subf
   uint8_t nsymb = (ue->frame_parms.Ncp == 0) ? 14 : 12;
 
   coded_bits_per_codeword = get_G(&ue->frame_parms,
-                                  ue->dlsch[eNB_id][0]->harq_processes[harq_pid]->nb_rb,
-                                  ue->dlsch[eNB_id][0]->harq_processes[harq_pid]->rb_alloc_even,
-                                  ue->dlsch[eNB_id][0]->harq_processes[harq_pid]->Qm,
-                                  ue->dlsch[eNB_id][0]->harq_processes[harq_pid]->Nl,
+                                  ue->dlsch[subframe&0x1][eNB_id][0]->harq_processes[harq_pid]->nb_rb,
+                                  ue->dlsch[subframe&0x1][eNB_id][0]->harq_processes[harq_pid]->rb_alloc_even,
+                                  ue->dlsch[subframe&0x1][eNB_id][0]->harq_processes[harq_pid]->Qm,
+                                  ue->dlsch[subframe&0x1][eNB_id][0]->harq_processes[harq_pid]->Nl,
                                   ue->pdcch_vars[eNB_id]->num_pdcch_symbols,
                                   proc->frame_rx,
 				  subframe,
@@ -268,37 +269,39 @@ void phy_reset_ue(uint8_t Mod_id,uint8_t CC_id,uint8_t eNB_index)
   uint8_t i,j,k,s;
   PHY_VARS_UE *ue = PHY_vars_UE_g[Mod_id][CC_id];
 
-  //[NUMBER_OF_CONNECTED_eNB_MAX][2];
-  for(i=0; i<NUMBER_OF_CONNECTED_eNB_MAX; i++) {
-    for(j=0; j<2; j++) {
-      //DL HARQ
-      if(ue->dlsch[i][j]) {
-        for(k=0; k<NUMBER_OF_HARQ_PID_MAX && ue->dlsch[i][j]->harq_processes[k]; k++) {
-          ue->dlsch[i][j]->harq_processes[k]->status = SCH_IDLE;
-          for (s=0; s<10; s++) {
-            // reset ACK/NACK bit to DTX for all subframes s = 0..9
-            ue->dlsch[i][j]->harq_ack[s].ack = 2;
-            ue->dlsch[i][j]->harq_ack[s].send_harq_status = 0;
-            ue->dlsch[i][j]->harq_ack[s].vDAI_UL = 0xff;
-            ue->dlsch[i][j]->harq_ack[s].vDAI_DL = 0xff;
+  //[NUMBER_OF_RX_THREAD=2][NUMBER_OF_CONNECTED_eNB_MAX][2];
+  for(int l=0; l<2; l++) {
+      for(i=0; i<NUMBER_OF_CONNECTED_eNB_MAX; i++) {
+          for(j=0; j<2; j++) {
+              //DL HARQ
+              if(ue->dlsch[l][i][j]) {
+                  for(k=0; k<NUMBER_OF_HARQ_PID_MAX && ue->dlsch[l][i][j]->harq_processes[k]; k++) {
+                      ue->dlsch[l][i][j]->harq_processes[k]->status = SCH_IDLE;
+                      for (s=0; s<10; s++) {
+                          // reset ACK/NACK bit to DTX for all subframes s = 0..9
+                          ue->dlsch[l][i][j]->harq_ack[s].ack = 2;
+                          ue->dlsch[l][i][j]->harq_ack[s].send_harq_status = 0;
+                          ue->dlsch[l][i][j]->harq_ack[s].vDAI_UL = 0xff;
+                          ue->dlsch[l][i][j]->harq_ack[s].vDAI_DL = 0xff;
+                      }
+                  }
+              }
           }
-        }
-      }
-    }
 
-    //UL HARQ
-    if(ue->ulsch[i]) {
-      for(k=0; k<NUMBER_OF_HARQ_PID_MAX && ue->ulsch[i]->harq_processes[k]; k++) {
-        ue->ulsch[i]->harq_processes[k]->status = SCH_IDLE;
-        //Set NDIs for all UL HARQs to 0
-        //  ue->ulsch[i]->harq_processes[k]->Ndi = 0;
+          //UL HARQ
+          if(ue->ulsch[i]) {
+              for(k=0; k<NUMBER_OF_HARQ_PID_MAX && ue->ulsch[i]->harq_processes[k]; k++) {
+                  ue->ulsch[i]->harq_processes[k]->status = SCH_IDLE;
+                  //Set NDIs for all UL HARQs to 0
+                  //  ue->ulsch[i]->harq_processes[k]->Ndi = 0;
 
-      }
-    }
+              }
+          }
 
-    // flush Msg3 buffer
-    ue->ulsch_Msg3_active[i] = 0;
+          // flush Msg3 buffer
+          ue->ulsch_Msg3_active[i] = 0;
 
+      }
   }
 }
 
@@ -327,7 +330,8 @@ void ra_succeeded(uint8_t Mod_id,uint8_t CC_id,uint8_t eNB_index)
   for (i=0; i<8; i++) {
     if (PHY_vars_UE_g[Mod_id][CC_id]->ulsch[eNB_index]->harq_processes[i]) {
       PHY_vars_UE_g[Mod_id][CC_id]->ulsch[eNB_index]->harq_processes[i]->status=IDLE;
-      PHY_vars_UE_g[Mod_id][CC_id]->dlsch[eNB_index][0]->harq_processes[i]->round=0;
+      PHY_vars_UE_g[Mod_id][CC_id]->dlsch[0][eNB_index][0]->harq_processes[i]->round=0;
+      PHY_vars_UE_g[Mod_id][CC_id]->dlsch[1][eNB_index][0]->harq_processes[i]->round=0;
     }
   }
 
@@ -638,7 +642,7 @@ void ue_compute_srs_occasion(PHY_VARS_UE *ue,UE_rxtx_proc_t *proc,uint8_t eNB_id
 
               uint8_t pucch_ack_payload[2];
               if (get_ack(&ue->frame_parms,
-                      ue->dlsch[eNB_id][0]->harq_ack,
+                      ue->dlsch[proc->subframe_rx&0x1][eNB_id][0]->harq_ack,
                       subframe_tx,pucch_ack_payload,0) > 0)
               {
                   is_sr_an_subframe = 1;
@@ -790,6 +794,7 @@ PUCCH_FMT_t get_pucch_format(lte_frame_type_t frame_type,
           return pucch_format2;
       }
   }
+  return pucch_format1a;
 }
 uint16_t get_n1_pucch(PHY_VARS_UE *ue,
 		      UE_rxtx_proc_t *proc,
@@ -929,11 +934,11 @@ uint16_t get_n1_pucch(PHY_VARS_UE *ue,
       n1_pucch1 = get_Np(frame_parms->N_RB_DL,nCCE1,1) + nCCE1 + frame_parms->pucch_config_common.n1PUCCH_AN;
 
       // set ACK/NAK to values if not DTX
-      if (ue->dlsch[eNB_id][0]->harq_ack[(6+last_dl)%10].send_harq_status>0)  // n-6 // subframe 6 is to be ACK/NAKed
-        harq_ack1 = ue->dlsch[eNB_id][0]->harq_ack[(6+last_dl)%10].ack;
+      if (ue->dlsch[proc->subframe_rx&0x1][eNB_id][0]->harq_ack[(6+last_dl)%10].send_harq_status>0)  // n-6 // subframe 6 is to be ACK/NAKed
+        harq_ack1 = ue->dlsch[proc->subframe_rx&0x1][eNB_id][0]->harq_ack[(6+last_dl)%10].ack;
 
-      if (ue->dlsch[eNB_id][0]->harq_ack[5+last_dl].send_harq_status>0)  // n-6 // subframe 5 is to be ACK/NAKed
-        harq_ack0 = ue->dlsch[eNB_id][0]->harq_ack[5+last_dl].ack;
+      if (ue->dlsch[proc->subframe_rx&0x1][eNB_id][0]->harq_ack[5+last_dl].send_harq_status>0)  // n-6 // subframe 5 is to be ACK/NAKed
+        harq_ack0 = ue->dlsch[proc->subframe_rx&0x1][eNB_id][0]->harq_ack[5+last_dl].ack;
 
 
       if (harq_ack1!=2) { // n-6 // subframe 6,8,0 and maybe 5,7,9 is to be ACK/NAKed
@@ -1040,10 +1045,10 @@ void ulsch_common_procedures(PHY_VARS_UE *ue, UE_rxtx_proc_t *proc, uint8_t empt
   int subframe_tx = proc->subframe_tx;
   int frame_tx = proc->frame_tx;
   int ulsch_start;
-  int overflow=0;
 #if defined(EXMIMO) || defined(OAI_USRP) || defined(OAI_BLADERF) || defined(OAI_LMSSDR)
+  int overflow=0;
   int k,l;
-  int dummy_tx_buffer[frame_parms->samples_per_tti] __attribute__((aligned(16)));
+  int dummy_tx_buffer[3840*4] __attribute__((aligned(16)));
 #endif
 
   VCD_SIGNAL_DUMPER_DUMP_FUNCTION_BY_NAME(VCD_SIGNAL_DUMPER_FUNCTIONS_PHY_PROCEDURES_UE_TX_ULSCH_COMMON,VCD_FUNCTION_IN);
@@ -1069,22 +1074,18 @@ void ulsch_common_procedures(PHY_VARS_UE *ue, UE_rxtx_proc_t *proc, uint8_t empt
   ulsch_start = (frame_parms->samples_per_tti*subframe_tx)-ue->N_TA_offset; //-ue->timing_advance;
 #endif //else EXMIMO
 
-//#if defined(EXMIMO) || defined(OAI_USRP) || defined(OAI_BLADERF) || defined(OAI_LMSSDR)
+#if defined(EXMIMO) || defined(OAI_USRP) || defined(OAI_BLADERF) || defined(OAI_LMSSDR)
   if (empty_subframe)
   {
 //#if 1
       overflow = ulsch_start - 9*frame_parms->samples_per_tti;
       for (aa=0; aa<frame_parms->nb_antennas_tx; aa++) {
 
-          if (overflow > 0)
-		 {
-			 memset(&ue->common_vars.txdata[aa][ulsch_start],0,4*(frame_parms->samples_per_tti-overflow));
-			 memset(&ue->common_vars.txdata[aa][0],0,4*overflow);
-		 }
-		 else
-		 {
-			 memset(&ue->common_vars.txdata[aa][ulsch_start],0,4*frame_parms->samples_per_tti);
-		 }
+          memset(&ue->common_vars.txdata[aa][ulsch_start],0,
+                 4*cmin(frame_parms->samples_per_tti-overflow,frame_parms->samples_per_tti));
+
+          if (overflow> 0)
+              memset(&ue->common_vars.txdata[aa][0],0,4*overflow);
       }
 /*#else
       overflow = ulsch_start - 9*frame_parms->samples_per_tti;
@@ -1102,7 +1103,7 @@ void ulsch_common_procedures(PHY_VARS_UE *ue, UE_rxtx_proc_t *proc, uint8_t empt
 #endif*/
       return;
   }
-//#endif
+#endif
 
   if ((frame_tx%100) == 0)
     LOG_D(PHY,"[UE %d] Frame %d, subframe %d: ulsch_start = %d (rxoff %d, HW TA %d, timing advance %d, TA_offset %d\n",
@@ -1336,7 +1337,8 @@ void ue_ulsch_uespec_procedures(PHY_VARS_UE *ue,UE_rxtx_proc_t *proc,uint8_t eNB
   uint8_t ulsch_input_buffer[5477] __attribute__ ((aligned(32)));
   uint8_t access_mode;
   uint8_t Nbundled=0;
-  uint8_t ack_status=0;
+  uint8_t ack_status_cw0=0;
+  uint8_t ack_status_cw1=0;
 
   VCD_SIGNAL_DUMPER_DUMP_FUNCTION_BY_NAME(VCD_SIGNAL_DUMPER_FUNCTIONS_PHY_PROCEDURES_UE_TX_ULSCH_UESPEC,VCD_FUNCTION_IN);
 
@@ -1437,11 +1439,16 @@ void ue_ulsch_uespec_procedures(PHY_VARS_UE *ue,UE_rxtx_proc_t *proc,uint8_t eNB
         ue->ulsch[eNB_id]->harq_processes[harq_pid]->round  = 0;
     }
     
-    ack_status = reset_ack(&ue->frame_parms,
-			 ue->dlsch[eNB_id][0]->harq_ack,
-			 subframe_tx,
-			 ue->ulsch[eNB_id]->o_ACK,0);
-    Nbundled = ack_status;
+    ack_status_cw0 = reset_ack(&ue->frame_parms,
+            ue->dlsch[proc->subframe_rx&0x1][eNB_id][0]->harq_ack,
+            subframe_tx,
+            ue->ulsch[eNB_id]->o_ACK,0);
+    ack_status_cw1 = reset_ack(&ue->frame_parms,
+            ue->dlsch[proc->subframe_rx&0x1][eNB_id][1]->harq_ack,
+            subframe_tx,
+            ue->ulsch[eNB_id]->o_ACK,1);
+
+    Nbundled = ack_status_cw0;
     first_rb = ue->ulsch[eNB_id]->harq_processes[harq_pid]->first_rb;
     nb_rb = ue->ulsch[eNB_id]->harq_processes[harq_pid]->nb_rb;
     
@@ -1449,31 +1456,31 @@ void ue_ulsch_uespec_procedures(PHY_VARS_UE *ue,UE_rxtx_proc_t *proc,uint8_t eNB
     
     
 
-    if (ack_status > 0) {
+    if (ack_status_cw0 > 0) {
 
       // check if we received a PDSCH at subframe_tx - 4
       // ==> send ACK/NACK on PUSCH
-      ue->ulsch[eNB_id]->harq_processes[harq_pid]->O_ACK = 1;
+      ue->ulsch[eNB_id]->harq_processes[harq_pid]->O_ACK = ack_status_cw0 + ack_status_cw1;
 
 #if T_TRACER
     if(ue->ulsch[eNB_id]->o_ACK[0])
     {
     	LOG_I(PHY,"PUSCH ACK\n");
-        T(T_UE_PHY_DLSCH_UE_ACK, T_INT(eNB_id), T_INT(frame_tx%1024), T_INT(subframe_tx), T_INT(Mod_id), T_INT(ue->dlsch[eNB_id][0]->rnti),
-                      T_INT(ue->dlsch[eNB_id][0]->current_harq_pid));
+        T(T_UE_PHY_DLSCH_UE_ACK, T_INT(eNB_id), T_INT(frame_tx%1024), T_INT(subframe_tx), T_INT(Mod_id), T_INT(ue->dlsch[proc->subframe_rx&0x1][eNB_id][0]->rnti),
+                      T_INT(ue->dlsch[proc->subframe_rx&0x1][eNB_id][0]->current_harq_pid));
     }
     else
     {
     	LOG_I(PHY,"PUSCH NACK\n");
-        T(T_UE_PHY_DLSCH_UE_NACK, T_INT(eNB_id), T_INT(frame_tx%1024), T_INT(subframe_tx), T_INT(Mod_id), T_INT(ue->dlsch[eNB_id][0]->rnti),
-                      T_INT(ue->dlsch[eNB_id][0]->current_harq_pid));
+        T(T_UE_PHY_DLSCH_UE_NACK, T_INT(eNB_id), T_INT(frame_tx%1024), T_INT(subframe_tx), T_INT(Mod_id), T_INT(ue->dlsch[proc->subframe_rx&0x1][eNB_id][0]->rnti),
+                      T_INT(ue->dlsch[proc->subframe_rx&0x1][eNB_id][0]->current_harq_pid));
     }
 #endif
 
-      LOG_D(PHY,"[UE  %d][PDSCH %x] AbsSubFrame %d.%d Generating ACK (%d,%d) for %d bits on PUSCH\n",
+      LOG_I(PHY,"[UE  %d][PDSCH %x] AbsSubFrame %d.%d Generating ACK (%d,%d) for %d bits on PUSCH\n",
         Mod_id,
         ue->ulsch[eNB_id]->rnti,
-        frame_tx%1024,subframe_tx,
+        frame_tx,subframe_tx,
         ue->ulsch[eNB_id]->o_ACK[0],ue->ulsch[eNB_id]->o_ACK[1],
         ue->ulsch[eNB_id]->harq_processes[harq_pid]->O_ACK);
     }
@@ -1530,6 +1537,7 @@ void ue_ulsch_uespec_procedures(PHY_VARS_UE *ue,UE_rxtx_proc_t *proc,uint8_t eNB
 	  mac_xface->macphy_exit("Error in ulsch_coding");
 	  VCD_SIGNAL_DUMPER_DUMP_FUNCTION_BY_NAME(VCD_SIGNAL_DUMPER_FUNCTIONS_PHY_PROCEDURES_UE_TX, VCD_FUNCTION_OUT);
 	  stop_meas(&ue->phy_proc_tx);
+	  //printf("------FULL TX PROC : %5.2f ------\n",ue->phy_proc_tx.p_time/(cpuf*1000.0));
 	  return;
 	}
       }
@@ -1636,8 +1644,8 @@ void ue_ulsch_uespec_procedures(PHY_VARS_UE *ue,UE_rxtx_proc_t *proc,uint8_t eNB
       T(T_UE_PHY_PUSCH_TX_POWER, T_INT(eNB_id),T_INT(Mod_id), T_INT(frame_tx%1024), T_INT(subframe_tx),T_INT(ue->tx_power_dBm[subframe_tx]),
                     T_INT(tx_amp),T_INT(ue->ulsch[eNB_id]->f_pusch),T_INT(get_PL(Mod_id,0,eNB_id)),T_INT(nb_rb));
 #endif
-      LOG_D(PHY,"[UE  %d][PUSCH %d] AbsSubFrame %d.%d, generating PUSCH, Po_PUSCH: %d dBm (max %d dBm), amp %d\n",
-	    Mod_id,harq_pid,frame_tx%1024,subframe_tx,ue->tx_power_dBm[subframe_tx],ue->tx_power_max_dBm, tx_amp);
+      LOG_D(PHY,"[UE  %d][PUSCH %d] Frame %d subframe %d, generating PUSCH, Po_PUSCH: %d dBm (max %d dBm), amp %d\n",
+	    Mod_id,harq_pid,frame_tx,subframe_tx,ue->tx_power_dBm[subframe_tx],ue->tx_power_max_dBm, tx_amp);
       start_meas(&ue->ulsch_modulation_stats);
       ulsch_modulation(ue->common_vars.txdataF,
 		       tx_amp,
@@ -1770,13 +1778,13 @@ void get_pucch_param(PHY_VARS_UE    *ue,
     {
         pucch_resource[0] = get_n1_pucch(ue,
                                          proc,
-                                         ue->dlsch[eNB_id][0]->harq_ack,
+                                         ue->dlsch[proc->subframe_rx&0x1][eNB_id][0]->harq_ack,
                                          eNB_id,
                                          ack_payload,
                                          SR);
         pucch_payload[0]  = ack_payload[0];
-        //pucch_payload[1]  = ack_payload[1];
-        pucch_payload[1]  = 1;
+        pucch_payload[1]  = ack_payload[1];
+        //pucch_payload[1]  = 1;
     }
     break;
 
@@ -1785,7 +1793,7 @@ void get_pucch_param(PHY_VARS_UE    *ue,
         pucch_resource[0]    = ue->cqi_report_config[eNB_id].CQI_ReportPeriodic.cqi_PUCCH_ResourceIndex;
         if(cqi_report)
         {
-            pucch_payload[0] = get_pucch2_cqi(ue,eNB_id,plength);
+            pucch_payload[0] = get_pucch2_cqi(ue,eNB_id,(int*)plength);
         }
         else
         {
@@ -1806,14 +1814,12 @@ void ue_pucch_procedures(PHY_VARS_UE *ue,UE_rxtx_proc_t *proc,uint8_t eNB_id,uin
 
 
   uint8_t  pucch_ack_payload[2];
-  uint8_t  n1_pucch,n2_pucch;
+  uint8_t  n2_pucch;
   uint16_t pucch_resource;
   ANFBmode_t bundling_flag;
   PUCCH_FMT_t format;
 
   uint8_t  SR_payload;
-  uint16_t CQI_payload;
-  uint16_t RI_payload;
   uint8_t  pucch_payload[2];
   uint16_t len;
 
@@ -1824,13 +1830,11 @@ void ue_pucch_procedures(PHY_VARS_UE *ue,UE_rxtx_proc_t *proc,uint8_t eNB_id,uin
   int CC_id = ue->CC_id;
   int tx_amp;
   int16_t Po_PUCCH;
-  uint8_t ack_status=0;
   uint8_t ack_status_cw0=0;
   uint8_t ack_status_cw1=0;
   uint8_t nb_cw=0;
   uint8_t cqi_status=0;
   uint8_t ri_status=0;
-  uint8_t ack_sr_generated = 0;
 
   VCD_SIGNAL_DUMPER_DUMP_FUNCTION_BY_NAME(VCD_SIGNAL_DUMPER_FUNCTIONS_PHY_PROCEDURES_UE_TX_PUCCH,VCD_FUNCTION_IN);
   
@@ -1891,13 +1895,13 @@ void ue_pucch_procedures(PHY_VARS_UE *ue,UE_rxtx_proc_t *proc,uint8_t eNB_id,uin
   }
 
   ack_status_cw0 = reset_ack(&ue->frame_parms,
-                       ue->dlsch[eNB_id][0]->harq_ack,
+                       ue->dlsch[proc->subframe_rx&0x1][eNB_id][0]->harq_ack,
                        subframe_tx,
                        pucch_ack_payload,
                        0);
 
   ack_status_cw1 = reset_ack(&ue->frame_parms,
-                       ue->dlsch[eNB_id][1]->harq_ack,
+                       ue->dlsch[proc->subframe_rx&0x1][eNB_id][1]->harq_ack,
                        subframe_tx,
                        pucch_ack_payload,
                        1);
@@ -1936,11 +1940,10 @@ void ue_pucch_procedures(PHY_VARS_UE *ue,UE_rxtx_proc_t *proc,uint8_t eNB_id,uin
                   SR_payload,
                   cqi_status,
                   &pucch_resource,
-                  &pucch_payload,
+                  (uint8_t *)&pucch_payload,
                   &len);
 
-  LOG_D(PHY,"PUCCH feedback AbsSubframe %d.%d SR %d NbCW %d AckNack %d.%d CQI %d RI %d format %d pucch_resource %d pucch_payload %d %d \n",
-		  frame_tx%1024, subframe_tx, SR_payload, nb_cw, pucch_ack_payload[0], pucch_ack_payload[1], cqi_status, ri_status, format, pucch_resource,pucch_payload[0],pucch_payload[1]);
+  LOG_D(PHY,"PUCCH feedback AbsSubframe %d.%d SR %d NbCW %d AckNack %d.%d CQI %d RI %d format %d pucch_resource %d pucch_payload %d %d \n", frame_tx, subframe_tx, SR_payload, nb_cw, pucch_ack_payload[0], pucch_ack_payload[1], cqi_status, ri_status, format, pucch_resource,pucch_payload[0],pucch_payload[1]);
 
 
   // Part - IV
@@ -1971,13 +1974,13 @@ void ue_pucch_procedures(PHY_VARS_UE *ue,UE_rxtx_proc_t *proc,uint8_t eNB_id,uin
 #endif
 #if T_TRACER
       T(T_UE_PHY_PUCCH_TX_POWER, T_INT(eNB_id),T_INT(Mod_id), T_INT(frame_tx%1024), T_INT(subframe_tx),T_INT(ue->tx_power_dBm[subframe_tx]),
-              T_INT(tx_amp),T_INT(ue->dlsch[eNB_id][0]->g_pucch),T_INT(get_PL(ue->Mod_id,ue->CC_id,eNB_id)));
+              T_INT(tx_amp),T_INT(ue->dlsch[proc->subframe_rx&0x1][eNB_id][0]->g_pucch),T_INT(get_PL(ue->Mod_id,ue->CC_id,eNB_id)));
 #endif
       if(format == pucch_format1)
       {
           LOG_D(PHY,"[UE  %d][SR %x] AbsSubframe %d.%d Generating PUCCH 1 (SR for PUSCH), an_srs_simultanous %d, shorten_pucch %d, n1_pucch %d, Po_PUCCH %d\n",
                   Mod_id,
-                  ue->dlsch[eNB_id][0]->rnti,
+                  ue->dlsch[proc->subframe_rx&0x1][eNB_id][0]->rnti,
                   frame_tx, subframe_tx,
                   frame_parms->soundingrs_ul_config_common.ackNackSRS_SimultaneousTransmission,
                   isShortenPucch,
@@ -1989,7 +1992,7 @@ void ue_pucch_procedures(PHY_VARS_UE *ue,UE_rxtx_proc_t *proc,uint8_t eNB_id,uin
           if (SR_payload>0) {
               LOG_D(PHY,"[UE  %d][SR %x] AbsSubFrame %d.%d Generating PUCCH %s payload %d,%d (with SR for PUSCH), an_srs_simultanous %d, shorten_pucch %d, n1_pucch %d, Po_PUCCH %d, amp %d\n",
                       Mod_id,
-                      ue->dlsch[eNB_id][0]->rnti,
+                      ue->dlsch[proc->subframe_rx&0x1][eNB_id][0]->rnti,
                       frame_tx % 1024, subframe_tx,
                       (format == pucch_format1a? "1a": (
                               format == pucch_format1b? "1b" : "??")),
@@ -2002,7 +2005,7 @@ void ue_pucch_procedures(PHY_VARS_UE *ue,UE_rxtx_proc_t *proc,uint8_t eNB_id,uin
           } else {
               LOG_D(PHY,"[UE  %d][PDSCH %x] AbsSubFrame %d.%d rx_offset_diff: %d, Generating PUCCH %s, an_srs_simultanous %d, shorten_pucch %d, n1_pucch %d, b[0]=%d,b[1]=%d (SR_Payload %d), Po_PUCCH %d, amp %d\n",
                       Mod_id,
-                      ue->dlsch[eNB_id][0]->rnti,
+                      ue->dlsch[proc->subframe_rx&0x1][eNB_id][0]->rnti,
                       frame_tx%1024, subframe_tx,ue->rx_offset_diff,
                       (format == pucch_format1a? "1a": (
                               format == pucch_format1b? "1b" : "??")),
@@ -2017,13 +2020,13 @@ void ue_pucch_procedures(PHY_VARS_UE *ue,UE_rxtx_proc_t *proc,uint8_t eNB_id,uin
 #if T_TRACER
       if(pucch_payload[0])
       {
-          T(T_UE_PHY_DLSCH_UE_ACK, T_INT(eNB_id), T_INT(frame_tx%1024), T_INT(subframe_tx), T_INT(Mod_id), T_INT(ue->dlsch[eNB_id][0]->rnti),
-                  T_INT(ue->dlsch[eNB_id][0]->current_harq_pid));
+          T(T_UE_PHY_DLSCH_UE_ACK, T_INT(eNB_id), T_INT(frame_tx%1024), T_INT(subframe_tx), T_INT(Mod_id), T_INT(ue->dlsch[proc->subframe_rx&0x1][eNB_id][0]->rnti),
+                  T_INT(ue->dlsch[proc->subframe_rx&0x1][eNB_id][0]->current_harq_pid));
       }
       else
       {
-          T(T_UE_PHY_DLSCH_UE_NACK, T_INT(eNB_id), T_INT(frame_tx%1024), T_INT(subframe_tx), T_INT(Mod_id), T_INT(ue->dlsch[eNB_id][0]->rnti),
-                  T_INT(ue->dlsch[eNB_id][0]->current_harq_pid));
+          T(T_UE_PHY_DLSCH_UE_NACK, T_INT(eNB_id), T_INT(frame_tx%1024), T_INT(subframe_tx), T_INT(Mod_id), T_INT(ue->dlsch[proc->subframe_rx&0x1][eNB_id][0]->rnti),
+                  T_INT(ue->dlsch[proc->subframe_rx&0x1][eNB_id][0]->current_harq_pid));
       }
 #endif
 
@@ -2076,12 +2079,12 @@ void ue_pucch_procedures(PHY_VARS_UE *ue,UE_rxtx_proc_t *proc,uint8_t eNB_id,uin
 #endif
 #if T_TRACER
       T(T_UE_PHY_PUCCH_TX_POWER, T_INT(eNB_id),T_INT(Mod_id), T_INT(frame_tx%1024), T_INT(subframe_tx),T_INT(ue->tx_power_dBm[subframe_tx]),
-              T_INT(tx_amp),T_INT(ue->dlsch[eNB_id][0]->g_pucch),T_INT(get_PL(ue->Mod_id,ue->CC_id,eNB_id)));
+              T_INT(tx_amp),T_INT(ue->dlsch[proc->subframe_rx&0x1][eNB_id][0]->g_pucch),T_INT(get_PL(ue->Mod_id,ue->CC_id,eNB_id)));
 #endif
 
       LOG_D(PHY,"[UE  %d][RNTI %x] AbsSubFrame %d.%d Generating PUCCH 2 (RI or CQI), n2_pucch %d, Po_PUCCH %d, isShortenPucch %d, amp %d\n",
               Mod_id,
-              ue->dlsch[eNB_id][0]->rnti,
+              ue->dlsch[proc->subframe_rx&0x1][eNB_id][0]->rnti,
               frame_tx%1024, subframe_tx,
               n2_pucch,
               Po_PUCCH,
@@ -2106,13 +2109,13 @@ void ue_pucch_procedures(PHY_VARS_UE *ue,UE_rxtx_proc_t *proc,uint8_t eNB_id,uin
   case pucch_format2a:
       LOG_I(PHY,"[UE  %d][RNTI %x] AbsSubFrame %d.%d Generating PUCCH 2a (RI or CQI) Ack/Nack 1bit \n",
               Mod_id,
-              ue->dlsch[eNB_id][0]->rnti,
+              ue->dlsch[proc->subframe_rx&0x1][eNB_id][0]->rnti,
               frame_tx%1024, subframe_tx);
       break;
   case pucch_format2b:
       LOG_I(PHY,"[UE  %d][RNTI %x] AbsSubFrame %d.%d Generating PUCCH 2b (RI or CQI) Ack/Nack 2bits\n",
               Mod_id,
-              ue->dlsch[eNB_id][0]->rnti,
+              ue->dlsch[proc->subframe_rx&0x1][eNB_id][0]->rnti,
               frame_tx%1024, subframe_tx);
       break;
   default:
@@ -2254,7 +2257,7 @@ void phy_procedures_UE_TX(PHY_VARS_UE *ue,UE_rxtx_proc_t *proc,uint8_t eNB_id,ui
     
   // reset DL ACK/NACK status
   reset_ack(&ue->frame_parms,
-             ue->dlsch[eNB_id][0]->harq_ack,
+             ue->dlsch[proc->subframe_rx&0x1][eNB_id][0]->harq_ack,
              subframe_tx,
              ue->ulsch[eNB_id]->o_ACK,0);
 
@@ -2451,8 +2454,8 @@ void phy_procedures_emos_UE_RX(PHY_VARS_UE *ue,uint8_t last_slot,uint8_t eNB_id)
     emos_dump_UE.total_TBS_last = ue->total_TBS_last[eNB_id];
     emos_dump_UE.bitrate = ue->bitrate[eNB_id];
     emos_dump_UE.total_received_bits = ue->total_received_bits[eNB_id];
-    emos_dump_UE.pmi_saved = ue->dlsch[eNB_id][0]->pmi_alloc;
-    emos_dump_UE.mcs = ue->dlsch[eNB_id][0]->harq_processes[ue->dlsch[eNB_id][0]->current_harq_pid]->mcs;
+    emos_dump_UE.pmi_saved = ue->dlsch[subframe&0x1][eNB_id][0]->pmi_alloc;
+    emos_dump_UE.mcs = ue->dlsch[subframe&0x1][eNB_id][0]->harq_processes[ue->dlsch[subframe&0x1][eNB_id][0]->current_harq_pid]->mcs;
     emos_dump_UE.use_ia_receiver = openair_daq_vars.use_ia_receiver;
 
     bytes = rtf_put(CHANSOUNDER_FIFO_MINOR, &emos_dump_UE, sizeof(fifo_dump_emos_UE));
@@ -2859,7 +2862,7 @@ int ue_pdcch_procedures(uint8_t eNB_id,PHY_VARS_UE *ue,UE_rxtx_proc_t *proc,uint
 					     (void *)&dci_alloc_rx[i].dci_pdu,
 					     ue->pdcch_vars[eNB_id]->crnti,
 					     dci_alloc_rx[i].format,
-					     ue->dlsch[eNB_id],
+					     ue->dlsch[subframe_rx&0x1][eNB_id],
 					     &ue->frame_parms,
 					     ue->pdsch_config_dedicated,
 					     SI_RNTI,
@@ -2876,7 +2879,7 @@ int ue_pdcch_procedures(uint8_t eNB_id,PHY_VARS_UE *ue,UE_rxtx_proc_t *proc,uint
               (dci_alloc_rx[i].format == format2A) ||
               (dci_alloc_rx[i].format == format2B))
           {
-            ue->dlsch[eNB_id][0]->g_pucch += ue->dlsch[eNB_id][0]->harq_processes[ue->dlsch[eNB_id][0]->current_harq_pid]->delta_PUCCH;
+            ue->dlsch[subframe_rx&0x1][eNB_id][0]->g_pucch += ue->dlsch[subframe_rx&0x1][eNB_id][0]->harq_processes[ue->dlsch[subframe_rx&0x1][eNB_id][0]->current_harq_pid]->delta_PUCCH;
           }
 
 	ue->dlsch_received[eNB_id]++;
@@ -2884,7 +2887,7 @@ int ue_pdcch_procedures(uint8_t eNB_id,PHY_VARS_UE *ue,UE_rxtx_proc_t *proc,uint
 #ifdef DEBUG_PHY_PROC
 	LOG_D(PHY,"[UE  %d] Generated UE DLSCH C_RNTI format %d\n",ue->Mod_id,dci_alloc_rx[i].format);
 	dump_dci(&ue->frame_parms, &dci_alloc_rx[i]);
-	LOG_D(PHY,"[UE %d] *********** dlsch->active in subframe %d=> %d\n",ue->Mod_id,subframe_rx,ue->dlsch[eNB_id][0]->active);
+	LOG_D(PHY,"[UE %d] *********** dlsch->active in subframe %d=> %d\n",ue->Mod_id,subframe_rx,ue->dlsch[subframe_rx&0x1][eNB_id][0]->active);
 #endif
 	
 	// we received a CRNTI, so we're in PUSCH
@@ -3262,7 +3265,14 @@ void ue_pdsch_procedures(PHY_VARS_UE *ue, UE_rxtx_proc_t *proc, int eNB_id, PDSC
 	dual_stream_UE = 1;
 	eNB_id_i = ue->n_connected_eNB;
 	i_mod =  dlsch0->harq_processes[harq_pid]->Qm;
-      } else {
+      }
+      else if((pdsch==PDSCH) && (ue->transmission_mode[eNB_id]==3))
+      {
+          dual_stream_UE = rx_IC_dual_stream;
+          eNB_id_i       = eNB_id;
+          i_mod          = 0;
+      }
+      else {
 	dual_stream_UE = 0;
 	eNB_id_i = eNB_id+1;
 	i_mod = 0;
@@ -3401,11 +3411,34 @@ void ue_dlsch_procedures(PHY_VARS_UE *ue,
   int harq_pid;
   int frame_rx = proc->frame_rx;
   int subframe_rx = proc->subframe_rx;
-  int ret=0;
+  int ret=0, ret1=0;
   int CC_id = ue->CC_id;
   LTE_UE_PDSCH *pdsch_vars;
+  uint8_t is_cw0_active = 0;
+  uint8_t is_cw1_active = 0;
+
+  if (dlsch0==NULL)
+      AssertFatal(0,"dlsch0 should be defined at this level \n");
+
+  harq_pid = dlsch0->current_harq_pid;
+  is_cw0_active = dlsch0->harq_processes[harq_pid]->status;
+
+  if(dlsch1)
+    is_cw1_active = dlsch1->harq_processes[harq_pid]->status;
+
+  LOG_D(PHY,"AbsSubframe %d.%d Start Turbo Decoder for CW0 [harq_pid %d] ? %d \n", frame_rx%1024, subframe_rx, harq_pid, is_cw0_active);
+  LOG_D(PHY,"AbsSubframe %d.%d Start Turbo Decoder for CW1 [harq_pid %d] ? %d \n", frame_rx%1024, subframe_rx, harq_pid, is_cw1_active);
 
-  if (dlsch0 && (!dlsch1)) {
+  if(is_cw0_active && is_cw1_active)
+  {
+      dlsch0->Kmimo = 2;
+      dlsch1->Kmimo = 2;
+  }
+  else
+  {
+      dlsch0->Kmimo = 1;
+  }
+  if (1) {
     switch (pdsch) {
     case SI_PDSCH:
       pdsch_vars = ue->pdsch_vars_SI[eNB_id];
@@ -3431,8 +3464,6 @@ void ue_dlsch_procedures(PHY_VARS_UE *ue,
       break;
 
     }
-  
-    harq_pid = dlsch0->current_harq_pid;
 
     if (frame_rx < *dlsch_errors)
       *dlsch_errors=0;
@@ -3449,6 +3480,7 @@ void ue_dlsch_procedures(PHY_VARS_UE *ue,
 
     if (abstraction_flag == 0) {
 
+      // start turbo decode for CW 0
       dlsch0->harq_processes[harq_pid]->G = get_G(&ue->frame_parms,
 						  dlsch0->harq_processes[harq_pid]->nb_rb,
 						  dlsch0->harq_processes[harq_pid]->rb_alloc_even,
@@ -3468,6 +3500,13 @@ void ue_dlsch_procedures(PHY_VARS_UE *ue,
 			 subframe_rx<<1);
       stop_meas(&ue->dlsch_unscrambling_stats);
       
+      //LOG_I(PHY,"start turbo decode for CW 0 --> nb_rb %d \n", dlsch0->harq_processes[harq_pid]->nb_rb);
+      //LOG_I(PHY,"start turbo decode for CW 0 --> rb_alloc_even %x \n", dlsch0->harq_processes[harq_pid]->rb_alloc_even);
+      //LOG_I(PHY,"start turbo decode for CW 0 --> Qm %d \n", dlsch0->harq_processes[harq_pid]->Qm);
+      //LOG_I(PHY,"start turbo decode for CW 0 --> Nl %d \n", dlsch0->harq_processes[harq_pid]->Nl);
+      //LOG_I(PHY,"start turbo decode for CW 0 --> G  %d \n", dlsch0->harq_processes[harq_pid]->G);
+      //LOG_I(PHY,"start turbo decode for CW 0 --> Kmimo  %d \n", dlsch0->Kmimo);
+
       start_meas(&ue->dlsch_decoding_stats);
       ret = dlsch_decoding(ue,
 			   pdsch_vars->llr[0],
@@ -3480,6 +3519,60 @@ void ue_dlsch_procedures(PHY_VARS_UE *ue,
 			   pdsch==PDSCH?1:0,
 			   dlsch0->harq_processes[harq_pid]->TBS>256?1:0);
       stop_meas(&ue->dlsch_decoding_stats);
+
+      //printf(" --> Unscrambling for CW0 %5.3f\n",
+      //        (ue->dlsch_unscrambling_stats.p_time)/(cpuf*1000.0));
+      //printf(" --> Turbo Decoding for CW0 %5.3f\n",
+      //        (ue->dlsch_decoding_stats.p_time)/(cpuf*1000.0));
+
+      if(is_cw1_active)
+      {
+          // start turbo decode for CW 1
+          dlsch1->harq_processes[harq_pid]->G = get_G(&ue->frame_parms,
+                  dlsch1->harq_processes[harq_pid]->nb_rb,
+                  dlsch1->harq_processes[harq_pid]->rb_alloc_even,
+                  dlsch1->harq_processes[harq_pid]->Qm,
+                  dlsch1->harq_processes[harq_pid]->Nl,
+                  ue->pdcch_vars[eNB_id]->num_pdcch_symbols,
+                  frame_rx,
+                  subframe_rx,
+                  ue->transmission_mode[eNB_id]<7?0:ue->transmission_mode[eNB_id]);
+
+          start_meas(&ue->dlsch_unscrambling_stats);
+          dlsch_unscrambling(&ue->frame_parms,
+                  0,
+                  dlsch1,
+                  dlsch1->harq_processes[harq_pid]->G,
+                  pdsch_vars->llr[1],
+                  1,
+                  subframe_rx<<1);
+          stop_meas(&ue->dlsch_unscrambling_stats);
+
+          //LOG_I(PHY,"start turbo decode for CW 1 --> nb_rb %d \n", dlsch1->harq_processes[harq_pid]->nb_rb);
+          //LOG_I(PHY,"start turbo decode for CW 1 --> rb_alloc_even %x \n", dlsch1->harq_processes[harq_pid]->rb_alloc_even);
+          //LOG_I(PHY,"start turbo decode for CW 1 --> Qm %d \n", dlsch1->harq_processes[harq_pid]->Qm);
+          //LOG_I(PHY,"start turbo decode for CW 1 --> Nl %d \n", dlsch1->harq_processes[harq_pid]->Nl);
+          //LOG_I(PHY,"start turbo decode for CW 1 --> G  %d \n", dlsch1->harq_processes[harq_pid]->G);
+          //LOG_I(PHY,"start turbo decode for CW 1 --> Kmimo  %d \n", dlsch1->Kmimo);
+
+          start_meas(&ue->dlsch_decoding_stats);
+          ret1 = dlsch_decoding(ue,
+                  pdsch_vars->llr[1],
+                  &ue->frame_parms,
+                  dlsch1,
+                  dlsch1->harq_processes[harq_pid],
+                  frame_rx,
+                  subframe_rx,
+                  harq_pid,
+                  pdsch==PDSCH?1:0,
+                  dlsch1->harq_processes[harq_pid]->TBS>256?1:0);
+          stop_meas(&ue->dlsch_decoding_stats);
+
+          //printf(" --> Unscrambling for CW1 %5.3f\n",
+          //        (ue->dlsch_unscrambling_stats.p_time)/(cpuf*1000.0));
+          //printf(" --> Turbo Decoding for CW1 %5.3f\n",
+          //        (ue->dlsch_decoding_stats.p_time)/(cpuf*1000.0));
+      }
     }
 	
     else {
@@ -3492,12 +3585,13 @@ void ue_dlsch_procedures(PHY_VARS_UE *ue,
 #endif
     }
 	
+    // Check CRC for CW 0
     if (ret == (1+dlsch0->max_turbo_iterations)) {
       *dlsch_errors=*dlsch_errors+1;
       
       if(dlsch0->rnti != 0xffff)
       {
-      LOG_D(PHY,"[UE  %d][PDSCH %x/%d] Frame %d subframe %d DLSCH in error (rv %d,mcs %d,TBS %d)\n",
+      LOG_D(PHY,"[UE  %d][PDSCH %x/%d] AbsSubframe %d.%d : DLSCH CW0 in error (rv %d,mcs %d,TBS %d)\n",
 	    ue->Mod_id,dlsch0->rnti,
 	    harq_pid,frame_rx,subframe_rx,
 	    dlsch0->harq_processes[harq_pid]->rvidx,
@@ -3509,7 +3603,7 @@ void ue_dlsch_procedures(PHY_VARS_UE *ue,
     } else {
         if(dlsch0->rnti != 0xffff)
         {
-      LOG_D(PHY,"[UE  %d][PDSCH %x/%d] Frame %d subframe %d: Received DLSCH (rv %d,mcs %d,TBS %d)\n",
+      LOG_D(PHY,"[UE  %d][PDSCH %x/%d] AbsSubframe %d.%d : Received DLSCH CW0 (rv %d,mcs %d,TBS %d)\n",
 	    ue->Mod_id,dlsch0->rnti,
 	    harq_pid,frame_rx,subframe_rx,
 	    dlsch0->harq_processes[harq_pid]->rvidx,
@@ -3574,6 +3668,49 @@ void ue_dlsch_procedures(PHY_VARS_UE *ue,
 	dlsch0->harq_processes[dlsch0->current_harq_pid]->TBS;
     }
   
+    // Check CRC for CW 1: ret1 is the dlsch_decoding() result for dlsch1; as in the CW0 check, 1+max_turbo_iterations is treated as a decoding failure
+    if(is_cw1_active)
+    {
+        if (ret1 == (1+dlsch1->max_turbo_iterations)) {
+            LOG_D(PHY,"[UE  %d][PDSCH %x/%d] Frame %d subframe %d DLSCH CW1 in error (rv %d,mcs %d,TBS %d)\n",
+                    ue->Mod_id,dlsch0->rnti,
+                    harq_pid,frame_rx,subframe_rx,
+                    dlsch1->harq_processes[harq_pid]->rvidx,
+                    dlsch1->harq_processes[harq_pid]->mcs,
+                    dlsch1->harq_processes[harq_pid]->TBS);
+
+        } else {
+            LOG_D(PHY,"[UE  %d][PDSCH %x/%d] Frame %d subframe %d: Received DLSCH CW1 (rv %d,mcs %d,TBS %d)\n",
+                    ue->Mod_id,dlsch0->rnti,
+                    harq_pid,frame_rx,subframe_rx,
+                    dlsch1->harq_processes[harq_pid]->rvidx,
+                    dlsch1->harq_processes[harq_pid]->mcs,
+                    dlsch1->harq_processes[harq_pid]->TBS);
+
+            // CW1 decoded: pass its transport block (TBS>>3 bytes) to ue_send_sdu; only the PDSCH case is accepted here
+            if (ue->mac_enabled == 1) {
+                switch (pdsch) {
+                case PDSCH:
+                    if(is_cw1_active)
+                        mac_xface->ue_send_sdu(ue->Mod_id,
+                                CC_id,
+                                frame_rx,
+                                subframe_rx,
+                                dlsch1->harq_processes[dlsch1->current_harq_pid]->b,
+                                dlsch1->harq_processes[dlsch1->current_harq_pid]->TBS>>3,
+                                eNB_id);
+                    break;
+                case SI_PDSCH:
+                case P_PDSCH:
+                case RA_PDSCH:
+                case PDSCH1:
+                case PMCH:
+                    AssertFatal(0,"exiting");
+                    break;
+                }
+            }
+        }
+    }
   
       
 #ifdef DEBUG_PHY_PROC
@@ -3622,16 +3759,18 @@ int phy_procedures_UE_RX(PHY_VARS_UE *ue,UE_rxtx_proc_t *proc,uint8_t eNB_id,uin
     T_BUFFER(&ue->common_vars.rxdata[0][subframe_rx*ue->frame_parms.samples_per_tti],
              ue->frame_parms.samples_per_tti * 4));
 
-  start_meas(&ue->phy_proc_rx);
+  // start timers
+  start_meas(&ue->phy_proc_rx[subframe_rx&0x1]);
+  start_meas(&ue->generic_stat);
 
   pmch_flag = is_pmch_subframe(frame_rx,subframe_rx,&ue->frame_parms) ? 1 : 0;
 
 
   // deactivate reception until we scan pdcch
-  if (ue->dlsch[eNB_id][0])
-    ue->dlsch[eNB_id][0]->active = 0;
-  if (ue->dlsch[eNB_id][1])
-    ue->dlsch[eNB_id][1]->active = 0;
+  if (ue->dlsch[subframe_rx&0x1][eNB_id][0])
+    ue->dlsch[subframe_rx&0x1][eNB_id][0]->active = 0;
+  if (ue->dlsch[subframe_rx&0x1][eNB_id][1])
+    ue->dlsch[subframe_rx&0x1][eNB_id][1]->active = 0;
 
   if (ue->dlsch_SI[eNB_id])
     ue->dlsch_SI[eNB_id]->active = 0;
@@ -3717,15 +3856,18 @@ int phy_procedures_UE_RX(PHY_VARS_UE *ue,UE_rxtx_proc_t *proc,uint8_t eNB_id,uin
 	   0);
 
   // first slot has been processed (FFTs + Channel Estimation, PCFICH/PHICH/PDCCH)
- 
+  stop_meas(&ue->generic_stat);
+  //printf("[SFN %d] Slot0: FFT + Channel Estimate + PCFICH/PHICH/PDCCH %5.2f \n",subframe_rx,ue->generic_stat.p_time/(cpuf*1000.0));
+
+  start_meas(&ue->generic_stat);
   // do procedures for C-RNTI
-  if (ue->dlsch[eNB_id][0]->active == 1) {
+  if (ue->dlsch[subframe_rx&0x1][eNB_id][0]->active == 1) {
     VCD_SIGNAL_DUMPER_DUMP_FUNCTION_BY_NAME(VCD_SIGNAL_DUMPER_FUNCTIONS_PDSCH_PROC, VCD_FUNCTION_IN);
     ue_pdsch_procedures(ue,
 			proc,
 			eNB_id,
 			PDSCH,
-			ue->dlsch[eNB_id][0],
+			ue->dlsch[subframe_rx&0x1][eNB_id][0],
 			NULL,
 			ue->pdcch_vars[eNB_id]->num_pdcch_symbols,
 			ue->frame_parms.symbols_per_tti>>1,
@@ -3809,35 +3951,50 @@ int phy_procedures_UE_RX(PHY_VARS_UE *ue,UE_rxtx_proc_t *proc,uint8_t eNB_id,uin
     }
   } // not an S-subframe
 
+  stop_meas(&ue->generic_stat);
+  //printf("[SFN %d] Slot1: FFT + Channel Estimate + Pdsch Proc Slot0 %5.2f \n",subframe_rx,ue->generic_stat.p_time/(cpuf*1000.0));
+
   // run pbch procedures if subframe is 0
-  if (subframe_rx == 0)
+  if ( (subframe_rx == 0) && (ue->decode_MIB == 1))
+  {
     ue_pbch_procedures(eNB_id,ue,proc,abstraction_flag);
+  }
    
   // do procedures for C-RNTI
-  if (ue->dlsch[eNB_id][0]->active == 1) {
+  if (ue->dlsch[subframe_rx&0x1][eNB_id][0]->active == 1) {
     VCD_SIGNAL_DUMPER_DUMP_FUNCTION_BY_NAME(VCD_SIGNAL_DUMPER_FUNCTIONS_PDSCH_PROC, VCD_FUNCTION_IN);
+    start_meas(&ue->pdsch_procedures_stat);
     ue_pdsch_procedures(ue,
 			proc,
 			eNB_id,
 			PDSCH,
-			ue->dlsch[eNB_id][0],
+			ue->dlsch[subframe_rx&0x1][eNB_id][0],
 			NULL,
 			1+(ue->frame_parms.symbols_per_tti>>1),
 			ue->frame_parms.symbols_per_tti-1,
 			abstraction_flag);
+    stop_meas(&ue->pdsch_procedures_stat);
+
+    start_meas(&ue->dlsch_procedures_stat);
     ue_dlsch_procedures(ue,
 			proc,
 			eNB_id,
 			PDSCH,
-			ue->dlsch[eNB_id][0],
-			NULL,
+			ue->dlsch[subframe_rx&0x1][eNB_id][0],
+			ue->dlsch[subframe_rx&0x1][eNB_id][1],
 			&ue->dlsch_errors[eNB_id],
 			mode,
 			abstraction_flag);
+    stop_meas(&ue->dlsch_procedures_stat);
+    //printf("[SFN %d] Slot1:       Pdsch Proc %5.2f\n",subframe_rx,ue->pdsch_procedures_stat.p_time/(cpuf*1000.0));
+    //printf("[SFN %d] Slot0 Slot1: Dlsch Proc %5.2f\n",subframe_rx,ue->dlsch_procedures_stat.p_time/(cpuf*1000.0));
+
     VCD_SIGNAL_DUMPER_DUMP_FUNCTION_BY_NAME(VCD_SIGNAL_DUMPER_FUNCTIONS_PDSCH_PROC, VCD_FUNCTION_OUT);
 
   }
 
+  start_meas(&ue->generic_stat);
+
   // do procedures for SI-RNTI
   if ((ue->dlsch_SI[eNB_id]) && (ue->dlsch_SI[eNB_id]->active == 1)) {
     ue_pdsch_procedures(ue,
@@ -3932,7 +4089,8 @@ int phy_procedures_UE_RX(PHY_VARS_UE *ue,UE_rxtx_proc_t *proc,uint8_t eNB_id,uin
 
   }
 
-
+  stop_meas(&ue->generic_stat);
+  //printf("after turbo until end of Rx %5.2f \n",ue->generic_stat.p_time/(cpuf*1000.0));
 
 #ifdef EMOS
   phy_procedures_emos_UE_RX(ue,slot,eNB_id);
@@ -3940,7 +4098,10 @@ int phy_procedures_UE_RX(PHY_VARS_UE *ue,UE_rxtx_proc_t *proc,uint8_t eNB_id,uin
 
      
   VCD_SIGNAL_DUMPER_DUMP_FUNCTION_BY_NAME(VCD_SIGNAL_DUMPER_FUNCTIONS_PHY_PROCEDURES_UE_RX, VCD_FUNCTION_OUT);
-  stop_meas(&ue->phy_proc_rx);
+  stop_meas(&ue->phy_proc_rx[subframe_rx&0x1]);
+
+  //printf("------FULL RX PROC [SFN %d]: %5.2f ------\n",subframe_rx,ue->phy_proc_rx[subframe_rx&0x1].p_time/(cpuf*1000.0));
+
   return (0);
 }
    
diff --git a/openair1/SCHED/pucch_pc.c b/openair1/SCHED/pucch_pc.c
index 5bc03bb58de4f8e0653410e68e091d05cb6780dc..ae831238622a91fef15d58e9be8d7d0f65ae28a5 100644
--- a/openair1/SCHED/pucch_pc.c
+++ b/openair1/SCHED/pucch_pc.c
@@ -51,7 +51,7 @@ int16_t pucch_power_cntl(PHY_VARS_UE *ue,UE_rxtx_proc_t *proc,uint8_t subframe,u
 
   Po_PUCCH = get_PL(ue->Mod_id,ue->CC_id,eNB_id)+
     ue->frame_parms.ul_power_control_config_common.p0_NominalPUCCH+
-    ue->dlsch[eNB_id][0]->g_pucch;
+    ue->dlsch[proc->subframe_rx&0x1][eNB_id][0]->g_pucch;
 
   switch (pucch_fmt) {
   case pucch_format1:
@@ -90,19 +90,19 @@ int16_t pucch_power_cntl(PHY_VARS_UE *ue,UE_rxtx_proc_t *proc,uint8_t subframe,u
   if (pucch_fmt!=pucch_format1) {
     LOG_D(PHY,"[UE  %d][PDSCH %x] AbsSubframe %d.%d: Po_PUCCH %d dBm : Po_NOMINAL_PUCCH %d dBm, PL %d dB, g_pucch %d dB\n",
           ue->Mod_id,
-          ue->dlsch[eNB_id][0]->rnti,proc->frame_tx%1024,subframe,
+          ue->dlsch[proc->subframe_rx&0x1][eNB_id][0]->rnti,proc->frame_tx%1024,subframe,
           Po_PUCCH,
           ue->frame_parms.ul_power_control_config_common.p0_NominalPUCCH,
           get_PL(ue->Mod_id,ue->CC_id,eNB_id),
-          ue->dlsch[eNB_id][0]->g_pucch);
+          ue->dlsch[proc->subframe_rx&0x1][eNB_id][0]->g_pucch);
   } else {
     LOG_D(PHY,"[UE  %d][SR %x] AbsSubframe %d.%d: Po_PUCCH %d dBm : Po_NOMINAL_PUCCH %d dBm, PL %d dB g_pucch %d dB\n",
           ue->Mod_id,
-          ue->dlsch[eNB_id][0]->rnti,proc->frame_tx%1024,subframe,
+          ue->dlsch[proc->subframe_rx&0x1][eNB_id][0]->rnti,proc->frame_tx%1024,subframe,
           Po_PUCCH,
           ue->frame_parms.ul_power_control_config_common.p0_NominalPUCCH,
           get_PL(ue->Mod_id,ue->CC_id,eNB_id),
-          ue->dlsch[eNB_id][0]->g_pucch);
+          ue->dlsch[proc->subframe_rx&0x1][eNB_id][0]->g_pucch);
   }
 
   return(Po_PUCCH);
diff --git a/openair3/NAS/TOOLS/ue_tcl_test.conf b/openair3/NAS/TOOLS/ue_tcl_test.conf
new file mode 100644
index 0000000000000000000000000000000000000000..05db6be4d12130603fec8d272bf102158d8f23a4
--- /dev/null
+++ b/openair3/NAS/TOOLS/ue_tcl_test.conf
@@ -0,0 +1,114 @@
+# List of known PLMNs
+PLMN: {
+    PLMN0: {
+           FULLNAME="Test network";
+           SHORTNAME="OAI4G";
+           MNC="01";
+           MCC="001";
+
+    };
+    PLMN1: {
+           FULLNAME="SFR France";
+           SHORTNAME="SFR";
+           MNC="10";
+           MCC="208";
+
+    };
+    PLMN2: {
+           FULLNAME="SFR France";
+           SHORTNAME="SFR";
+           MNC="11";
+           MCC="208";
+    };
+    PLMN3: {
+           FULLNAME="SFR France";
+           SHORTNAME="SFR";
+           MNC="13";
+           MCC="208";
+    };
+    PLMN4: {
+           FULLNAME="OAI LTEBOX";
+           SHORTNAME="OAIALU";
+           MNC="93";
+           MCC="208";
+    };
+    PLMN5: {
+           FULLNAME="T-Mobile USA";
+           SHORTNAME="T-Mobile";
+           MNC="280";
+           MCC="310";
+    };
+    PLMN6: {
+           FULLNAME="FICTITIOUS USA";
+           SHORTNAME="FICTITIO";
+           MNC="028";
+           MCC="310";
+    };
+    PLMN7: {
+           FULLNAME="Vodafone Italia";
+           SHORTNAME="VODAFONE";
+           MNC="10";
+           MCC="222";
+    };
+    PLMN8: {
+           FULLNAME="Vodafone Spain";
+           SHORTNAME="VODAFONE";
+           MNC="01";
+           MCC="214";
+    };
+    PLMN9: {
+           FULLNAME="Vodafone Spain";
+           SHORTNAME="VODAFONE";
+           MNC="06";
+           MCC="214";
+    };
+    PLMN10: {
+           FULLNAME="Vodafone Germ";
+           SHORTNAME="VODAFONE";
+           MNC="02";
+           MCC="262";
+    };
+    PLMN11: {
+           FULLNAME="Vodafone Germ";
+           SHORTNAME="VODAFONE";
+           MNC="04";
+           MCC="262";
+    };
+};
+
+UE0:
+{
+    USER: {
+        IMEI="356113022094149";
+        MANUFACTURER="EURECOM";
+        MODEL="LTE Android PC";
+        PIN="0000";
+    };
+
+    SIM: {
+        MSIN="000001234";
+        USIM_API_K="000102030405060708090A0B0C0D0E0F";
+        OPC="C42449363BBAD02B66D16BC975D77CC1";
+        MSISDN="000000000000";//"33611123456";
+    };
+
+    # Home PLMN Selector with Access Technology
+    HPLMN= "00101";
+
+    # User controlled PLMN Selector with Access Technology
+    UCPLMN_LIST = ();
+
+    # Operator PLMN List
+    OPLMN_LIST = ("00101", "20810", "20811", "20813", "20893", "310280", "310028");
+
+    # Operator controlled PLMN Selector with Access Technology
+    OCPLMN_LIST = ("22210", "21401", "21406", "26202", "26204");
+
+    # Forbidden PLMNs
+    FPLMN_LIST = ();
+
+    # List of Equivalent HPLMNs
+# TODO: the UE does not connect when EHPLMN_LIST is set; to be fixed in the UE
+#    EHPLMN_LIST= ("20811", "20813");
+    EHPLMN_LIST= ();
+};
diff --git a/targets/RT/USER/lte-softmodem.c b/targets/RT/USER/lte-softmodem.c
index 8303d88218fbe0fbf83ab827133e3b29d355f5de..5761577259eb8365881dacd72c994a6707b788e4 100644
--- a/targets/RT/USER/lte-softmodem.c
+++ b/targets/RT/USER/lte-softmodem.c
@@ -151,6 +151,8 @@ uint8_t usim_test = 0;
 uint8_t nb_antenna_tx = 1;
 uint8_t nb_antenna_rx = 1;
 
+int16_t dlsch_demod_shift = 0;
+
 char ref[128] = "internal";
 char channels[128] = "0";
 
@@ -635,6 +637,7 @@ static void get_options (int argc, char **argv) {
         LONG_OPTION_THREADIQ,
         LONG_OPTION_THREADODDSUBFRAME,
         LONG_OPTION_THREADEVENSUBFRAME,
+        LONG_OPTION_DEMOD_SHIFT,
 #if T_TRACER
         LONG_OPTION_T_PORT,
         LONG_OPTION_T_NOWAIT,
@@ -670,6 +673,7 @@ static void get_options (int argc, char **argv) {
         {"threadIQ",  required_argument, NULL, LONG_OPTION_THREADIQ},
         {"threadOddSubframe",  required_argument, NULL, LONG_OPTION_THREADODDSUBFRAME},
         {"threadEvenSubframe",  required_argument, NULL, LONG_OPTION_THREADEVENSUBFRAME},
+        {"dlsch-demod-shift", required_argument,  NULL, LONG_OPTION_DEMOD_SHIFT},
 #if T_TRACER
         {"T_port",                 required_argument, 0, LONG_OPTION_T_PORT},
         {"T_nowait",               no_argument,       0, LONG_OPTION_T_NOWAIT},
@@ -800,7 +804,9 @@ static void get_options (int argc, char **argv) {
     case LONG_OPTION_THREADEVENSUBFRAME:
        threads.even=atoi(optarg);
        break;
-
+    case LONG_OPTION_DEMOD_SHIFT:
+        dlsch_demod_shift = atoi(optarg); // int16_t option: parse as an integer like the other numeric options, not via atof
+        break;
 #if T_TRACER
         case LONG_OPTION_T_PORT: {
             extern int T_port;
diff --git a/targets/RT/USER/lte-ue.c b/targets/RT/USER/lte-ue.c
index 3b6ea9e5c9166b098675de288347bc34f6405e15..49093ed2def4f22334c7870ff358abc938ce70f3 100644
--- a/targets/RT/USER/lte-ue.c
+++ b/targets/RT/USER/lte-ue.c
@@ -56,6 +56,8 @@
 
 #include "T.h"
 
+extern double cpuf;
+
 #define FRAME_PERIOD    100000000ULL
 #define DAQ_PERIOD      66667ULL
 #define FIFO_PRIORITY   40
@@ -538,6 +540,9 @@ static void *UE_thread_rxn_txnp4(void *arg) {
             }
             phy_procedures_UE_RX( UE, proc, 0, 0, UE->mode, no_relay, NULL );
         }
+
+        start_meas(&UE->generic_stat);
+
         if (UE->mac_enabled==1) {
 
             ret = mac_xface->ue_scheduler(UE->Mod_id,
@@ -567,6 +572,9 @@ static void *UE_thread_rxn_txnp4(void *arg) {
                        UE->Mod_id, proc->frame_rx, proc->subframe_tx,txt );
             }
         }
+
+        stop_meas(&UE->generic_stat);
+
         // Prepare the future Tx data
 
         if ((subframe_select( &UE->frame_parms, proc->subframe_tx) == SF_UL) ||
diff --git a/targets/SIMU/USER/init_lte.c b/targets/SIMU/USER/init_lte.c
index 8ff274be14d14c9e618ba6d9f8b7b22709109a3a..a99ccf2c09f40866ec31acd82fc74c65886e758e 100644
--- a/targets/SIMU/USER/init_lte.c
+++ b/targets/SIMU/USER/init_lte.c
@@ -161,30 +161,32 @@ PHY_VARS_UE* init_lte_UE(LTE_DL_FRAME_PARMS *frame_parms,
   memcpy(&(PHY_vars_UE->frame_parms), frame_parms, sizeof(LTE_DL_FRAME_PARMS));
   phy_init_lte_ue(PHY_vars_UE,1,abstraction_flag);
 
-  for (i=0; i<NUMBER_OF_CONNECTED_eNB_MAX; i++) {
-    for (j=0; j<2; j++) {
-      PHY_vars_UE->dlsch[i][j]  = new_ue_dlsch(1,NUMBER_OF_HARQ_PID_MAX,NSOFT,MAX_TURBO_ITERATIONS,frame_parms->N_RB_DL, abstraction_flag);
+  for (int l=0; l<2; l++) {
+      for (i=0; i<NUMBER_OF_CONNECTED_eNB_MAX; i++) {
+          for (j=0; j<2; j++) {
+              PHY_vars_UE->dlsch[l][i][j]  = new_ue_dlsch(1,NUMBER_OF_HARQ_PID_MAX,NSOFT,MAX_TURBO_ITERATIONS,frame_parms->N_RB_DL, abstraction_flag);
 
-      if (!PHY_vars_UE->dlsch[i][j]) {
-        LOG_E(PHY,"Can't get ue dlsch structures\n");
-        exit(-1);
-      } else
-        LOG_D(PHY,"dlsch[%d][%d] => %p\n",UE_id,i,PHY_vars_UE->dlsch[i][j]);
-    }
+              if (!PHY_vars_UE->dlsch[l][i][j]) {
+                  LOG_E(PHY,"Can't get ue dlsch structures\n");
+                  exit(-1);
+              } else
+                  LOG_D(PHY,"dlsch[%d][%d] => %p\n",UE_id,i,PHY_vars_UE->dlsch[l][i][j]);
+          }
 
 
 
-    PHY_vars_UE->ulsch[i]  = new_ue_ulsch(frame_parms->N_RB_UL, abstraction_flag);
+          PHY_vars_UE->ulsch[i]  = new_ue_ulsch(frame_parms->N_RB_UL, abstraction_flag);
 
-    if (!PHY_vars_UE->ulsch[i]) {
-      LOG_E(PHY,"Can't get ue ulsch structures\n");
-      exit(-1);
-    }
+          if (!PHY_vars_UE->ulsch[i]) {
+              LOG_E(PHY,"Can't get ue ulsch structures\n");
+              exit(-1);
+          }
 
-    PHY_vars_UE->dlsch_SI[i]  = new_ue_dlsch(1,1,NSOFT,MAX_TURBO_ITERATIONS,frame_parms->N_RB_DL, abstraction_flag);
-    PHY_vars_UE->dlsch_ra[i]  = new_ue_dlsch(1,1,NSOFT,MAX_TURBO_ITERATIONS,frame_parms->N_RB_DL, abstraction_flag);
+          PHY_vars_UE->dlsch_SI[i]  = new_ue_dlsch(1,1,NSOFT,MAX_TURBO_ITERATIONS,frame_parms->N_RB_DL, abstraction_flag);
+          PHY_vars_UE->dlsch_ra[i]  = new_ue_dlsch(1,1,NSOFT,MAX_TURBO_ITERATIONS,frame_parms->N_RB_DL, abstraction_flag);
 
-    PHY_vars_UE->transmission_mode[i] = frame_parms->nb_antenna_ports_eNB==1 ? 1 : 2;
+          PHY_vars_UE->transmission_mode[i] = frame_parms->nb_antenna_ports_eNB==1 ? 1 : 2;
+      }
   }
 
   PHY_vars_UE->frame_parms.pucch_config_common.deltaPUCCH_Shift = 1;