From 4573c0b962aedf2b2d574a05823a263aafff33c9 Mon Sep 17 00:00:00 2001
From: lukashov <elena.lukashova@eurecom.fr>
Date: Wed, 16 Mar 2016 16:05:09 +0100
Subject: [PATCH] Changes for SIC-receiver: 1. in dlsim.c: now we check for the
 receiver type and, if SIC, proceed through encoding, modulation, LLR
 computation and decoding of the decoded TB0. 2. Implementing routines for
 vector-vector multiplication and vector-vector subtraction 3. Adding some
 printouts for debugging purposes. 4. New functions: modulation_SIC,
 dlsch_qpsk_llr_SIC 5. In phy_scope.c: changing llr plotting: now for the
 length of coded_bits_per_cw.

---
 cmake_targets/CMakeLists.txt                  |   3 +-
 openair1/PHY/LTE_TRANSPORT/defs.h             |   1 +
 .../PHY/LTE_TRANSPORT/dlsch_demodulation.c    |  41 +-
 .../PHY/LTE_TRANSPORT/dlsch_llr_computation.c | 359 ++++++++++++++++++
 openair1/PHY/LTE_TRANSPORT/dlsch_modulation.c | 162 +++++++-
 openair1/PHY/LTE_TRANSPORT/power_control.c    |   3 +-
 openair1/PHY/LTE_TRANSPORT/proto.h            |  43 ++-
 openair1/PHY/TOOLS/cadd_sv.c                  |   2 +-
 openair1/PHY/TOOLS/cadd_vv.c                  |  33 +-
 openair1/PHY/TOOLS/cmult_sv.c                 |  10 +-
 openair1/PHY/TOOLS/cmult_vv.c                 |  78 +++-
 openair1/PHY/TOOLS/defs.h                     |  11 +
 openair1/PHY/TOOLS/lte_phy_scope.c            |  16 +-
 openair1/SCHED/phy_procedures_lte_eNb.c       |   1 -
 openair1/SIMULATION/LTE_PHY/dlsim.c           | 346 ++++++++++++++---
 15 files changed, 1016 insertions(+), 93 deletions(-)

diff --git a/cmake_targets/CMakeLists.txt b/cmake_targets/CMakeLists.txt
index 92505b9ef6..77d8703d95 100644
--- a/cmake_targets/CMakeLists.txt
+++ b/cmake_targets/CMakeLists.txt
@@ -123,7 +123,7 @@ endmacro(add_list_string_option)
 ####################################################
 # compilation flags
 #############################################
-#set (CMAKE_BUILD_TYPE "Debug")
+set (CMAKE_BUILD_TYPE "Debug")
 if (CMAKE_BUILD_TYPE STREQUAL "")
    set(CMAKE_BUILD_TYPE "RelWithDebInfo")
 endif()
@@ -934,6 +934,7 @@ set(PHY_SRC
   ${OPENAIR1_DIR}/PHY/INIT/lte_parms.c
   ${OPENAIR1_DIR}/PHY/INIT/lte_param_init.c
   ${OPENAIR1_DIR}/PHY/TOOLS/file_output.c
+  ${OPENAIR1_DIR}/PHY/TOOLS/cadd_vv.c
   ${OPENAIR1_DIR}/PHY/TOOLS/lte_dfts.c
   ${OPENAIR1_DIR}/PHY/TOOLS/log2_approx.c
   ${OPENAIR1_DIR}/PHY/TOOLS/cmult_sv.c
diff --git a/openair1/PHY/LTE_TRANSPORT/defs.h b/openair1/PHY/LTE_TRANSPORT/defs.h
index a05b16d12a..bb38628669 100644
--- a/openair1/PHY/LTE_TRANSPORT/defs.h
+++ b/openair1/PHY/LTE_TRANSPORT/defs.h
@@ -742,6 +742,7 @@ typedef enum {
   rx_standard=0,
   rx_IC_single_stream,
   rx_IC_dual_stream,
+  rx_SIC_dual_stream
 } RX_type_t;
 
 typedef enum {
diff --git a/openair1/PHY/LTE_TRANSPORT/dlsch_demodulation.c b/openair1/PHY/LTE_TRANSPORT/dlsch_demodulation.c
index be1648f3eb..4ac64e11e6 100644
--- a/openair1/PHY/LTE_TRANSPORT/dlsch_demodulation.c
+++ b/openair1/PHY/LTE_TRANSPORT/dlsch_demodulation.c
@@ -220,7 +220,7 @@ int rx_pdsch(PHY_VARS_UE *phy_vars_ue,
        printf("\n");*/
 //#endif
 
-   if (rx_type==rx_IC_single_stream) {
+   if (rx_type >= rx_IC_single_stream) {
       if (eNB_id_i<phy_vars_ue->n_connected_eNB) // we are in TM5
        nb_rb = dlsch_extract_rbs_dual(lte_ue_common_vars->rxdataF,
                                        lte_ue_common_vars->dl_ch_estimates[eNB_id_i],
@@ -348,10 +348,10 @@ int rx_pdsch(PHY_VARS_UE *phy_vars_ue,
                                nb_rb,
                                lte_ue_pdsch_vars[eNB_id]->log2_maxh,
                                phy_measurements); // log2_maxh+I0_shift
- /*
+ 
  if (symbol == 5) {
      write_output("rxF_comp_d.m","rxF_c_d",&lte_ue_pdsch_vars[eNB_id]->rxdataF_comp0[0][symbol*frame_parms->N_RB_DL*12],frame_parms->N_RB_DL*12,1,1);
- } */
+ } 
     if ((rx_type==rx_IC_single_stream) && 
         (eNB_id_i<phy_vars_ue->n_connected_eNB)) {
          
@@ -452,8 +452,8 @@ int rx_pdsch(PHY_VARS_UE *phy_vars_ue,
                                      nb_rb, 
                                      lte_ue_pdsch_vars[eNB_id]->log2_maxh0,
 				     lte_ue_pdsch_vars[eNB_id]->log2_maxh1); 
-   	/*   
-       if (symbol == 5) {
+   	   
+      if (symbol == 5) {
    
      write_output("rxF_comp_d00.m","rxF_c_d00",&lte_ue_pdsch_vars[eNB_id]->rxdataF_comp0[0][symbol*frame_parms->N_RB_DL*12],frame_parms->N_RB_DL*12,1,1);// should be QAM
      write_output("rxF_comp_d01.m","rxF_c_d01",&lte_ue_pdsch_vars[eNB_id]->rxdataF_comp0[1][symbol*frame_parms->N_RB_DL*12],frame_parms->N_RB_DL*12,1,1);//should be almost 0
@@ -461,7 +461,8 @@ int rx_pdsch(PHY_VARS_UE *phy_vars_ue,
      write_output("rxF_comp_d11.m","rxF_c_d11",&lte_ue_pdsch_vars[eNB_id]->rxdataF_comp1[harq_pid][round][1][symbol*frame_parms->N_RB_DL*12],frame_parms->N_RB_DL*12,1,1);//should be QAM
 
    
-	}*/
+	} 
+
       // compute correlation between signal and interference channels (rho12 and rho21)
       
 	dlsch_dual_stream_correlation(frame_parms,// this is doing h0'h1, needed for llr[1]
@@ -485,6 +486,16 @@ int rx_pdsch(PHY_VARS_UE *phy_vars_ue,
 				    lte_ue_pdsch_vars[eNB_id]->log2_maxh1);
     //  printf("rho stream2 =%d\n",&lte_ue_pdsch_vars[eNB_id]->dl_ch_rho2_ext );
       //printf("TM3 log2_maxh : %d\n",lte_ue_pdsch_vars[eNB_id]->log2_maxh);
+      
+       // Debug dump for OFDM symbol 5 only: dl_ch_rho_ext and dl_ch_rho2_ext, last index 0 and 1
+       if (symbol == 5) {
+     write_output("rho0_0.m","rho0_0",&lte_ue_pdsch_vars[eNB_id]->dl_ch_rho_ext[harq_pid][round][0][symbol*frame_parms->N_RB_DL*12],frame_parms->N_RB_DL*12,1,1);
+     write_output("rho2_0.m","rho2_0",&lte_ue_pdsch_vars[eNB_id]->dl_ch_rho2_ext[0][symbol*frame_parms->N_RB_DL*12],frame_parms->N_RB_DL*12,1,1);
+     write_output("rho0_1.m","rho0_1",&lte_ue_pdsch_vars[eNB_id]->dl_ch_rho_ext[harq_pid][round][1][symbol*frame_parms->N_RB_DL*12],frame_parms->N_RB_DL*12,1,1);
+     write_output("rho2_1.m","rho2_1",&lte_ue_pdsch_vars[eNB_id]->dl_ch_rho2_ext[1][symbol*frame_parms->N_RB_DL*12],frame_parms->N_RB_DL*12,1,1);
+
+
+	} 
 
    }
       else {
@@ -664,6 +675,15 @@ int rx_pdsch(PHY_VARS_UE *phy_vars_ue,
 				 symbol,
 				 nb_rb,
 				 1);
+	
+	 
+       if (symbol == 5) {
+   
+     write_output("rho0_mrc.m","rho0_0",&lte_ue_pdsch_vars[eNB_id]->dl_ch_rho_ext[harq_pid][round][0][symbol*frame_parms->N_RB_DL*12],frame_parms->N_RB_DL*12,1,1);// should be QAM
+     write_output("rho2_mrc.m","rho2_0",&lte_ue_pdsch_vars[eNB_id]->dl_ch_rho2_ext[0][symbol*frame_parms->N_RB_DL*12],frame_parms->N_RB_DL*12,1,1);//should be almost 0
+    	}
+	
+	
       }
     } else {
 
@@ -728,7 +748,7 @@ int rx_pdsch(PHY_VARS_UE *phy_vars_ue,
                        adjust_G2(frame_parms,dlsch0_harq->rb_alloc_even,2,subframe,symbol),
                        lte_ue_pdsch_vars[eNB_id]->llr128);
     }
-      else if ((rx_type==rx_IC_single_stream) || (rx_type==rx_IC_dual_stream)) {
+      else if (rx_type >= rx_IC_single_stream) {
         if (dlsch1_harq->Qm == 2) {
           dlsch_qpsk_qpsk_llr(frame_parms,
                               lte_ue_pdsch_vars[eNB_id]->rxdataF_comp0,
@@ -4705,8 +4725,13 @@ void dump_dlsch2(PHY_VARS_UE *phy_vars_ue,uint8_t eNB_id,uint16_t coded_bits_per
   */
   sprintf(fname,"dlsch%d_r%d_rho.m",eNB_id,round);
   sprintf(vname,"dl_rho_r%d_%d",eNB_id,round);
-  write_output(fname,vname,phy_vars_ue->lte_ue_pdsch_vars[eNB_id]->dl_ch_rho_ext[0][0][0],12*N_RB_DL*nsymb,1,1);
+  write_output(fname,vname,phy_vars_ue->lte_ue_pdsch_vars[eNB_id]->dl_ch_rho_ext[harq_pid][round][0],12*N_RB_DL*nsymb,1,1);
 
+  sprintf(fname,"dlsch%d_r%d_rho2.m",eNB_id,round);
+  sprintf(vname,"dl_rho2_r%d_%d",eNB_id,round);
+  write_output(fname,vname,phy_vars_ue->lte_ue_pdsch_vars[eNB_id]->dl_ch_rho2_ext[0],12*N_RB_DL*nsymb,1,1);
+  
+  
   sprintf(fname,"dlsch%d_rxF_r%d_comp0.m",eNB_id,round);
   sprintf(vname,"dl%d_rxF_r%d_comp0",eNB_id,round);
   write_output(fname,vname,phy_vars_ue->lte_ue_pdsch_vars[eNB_id]->rxdataF_comp0[0],12*N_RB_DL*nsymb,1,1);
diff --git a/openair1/PHY/LTE_TRANSPORT/dlsch_llr_computation.c b/openair1/PHY/LTE_TRANSPORT/dlsch_llr_computation.c
index a4211ad3d6..78590b3017 100644
--- a/openair1/PHY/LTE_TRANSPORT/dlsch_llr_computation.c
+++ b/openair1/PHY/LTE_TRANSPORT/dlsch_llr_computation.c
@@ -39,6 +39,7 @@
  */
 
 #include "PHY/defs.h"
+#include "PHY/TOOLS/defs.h"
 #include "PHY/extern.h"
 #include "defs.h"
 #include "extern.h"
@@ -684,6 +685,99 @@ int dlsch_qpsk_llr(LTE_DL_FRAME_PARMS *frame_parms,
   return(0);
 }
 
+/* QPSK LLR computation with successive interference cancellation (SIC).
+ * For each symbol from num_pdcch_symbols on: scale the sic_buffer samples by
+ * sqrt_rho_a/sqrt_rho_b, combine them with rho_i via mult_cpx_vector(), subtract
+ * the result from rxdataF_comp[0] IN PLACE and copy the cleaned samples to
+ * dlsch_llr. Always returns 0. */
+int32_t dlsch_qpsk_llr_SIC(LTE_DL_FRAME_PARMS *frame_parms,
+                       int **rxdataF_comp,
+		       mod_sym_t **sic_buffer,
+		       int **rho_i,
+                       short *dlsch_llr,
+		       uint8_t num_pdcch_symbols,
+                       uint16_t nb_rb,
+                       uint16_t pbch_pss_sss_adjust,
+		       LTE_UE_DLSCH_t *dlsch0)
+{
+
+  int16_t rho_amp_x0[2*frame_parms->N_RB_DL*12];
+  int16_t rho_rho_amp_x0[2*frame_parms->N_RB_DL*12];
+  uint16_t amp_tmp;
+  uint16_t *llr16=(uint16_t*)dlsch_llr;
+  int i, len,  nsymb;
+  uint8_t symbol, symbol_mod;
+  int len_acc=0;
+  uint16_t *sic_data;
+
+  nsymb = (frame_parms->Ncp==0) ? 14:12;
+
+  for (symbol=num_pdcch_symbols; symbol<nsymb; symbol++) {
+    uint16_t *rxF = (uint16_t*)&rxdataF_comp[0][((int16_t)symbol*frame_parms->N_RB_DL*12)];
+    int16_t *rho_1=(int16_t*)&rho_i[0][((int16_t)symbol*frame_parms->N_RB_DL*12)];
+    sic_data = (uint16_t*)&sic_buffer[0][((int16_t)len_acc)]; 
+
+    symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? symbol-(7-frame_parms->Ncp) : symbol;
+
+    if ((symbol_mod == 0) || (symbol_mod == (4-frame_parms->Ncp))) //pilots=1
+      amp_tmp=dlsch0->sqrt_rho_b;                                   
+    else //pilots=0
+      amp_tmp=dlsch0->sqrt_rho_a;
+
+    // number of data REs carried by this symbol
+    if ((symbol_mod==0) || (symbol_mod==(4-frame_parms->Ncp))) {
+      if (frame_parms->mode1_flag==0)
+        len = (nb_rb*8) - (2*pbch_pss_sss_adjust/3);
+      else
+        len = (nb_rb*10) - (5*pbch_pss_sss_adjust/6);
+    } else {
+      len = (nb_rb*12) - pbch_pss_sss_adjust;
+    }
+
+    len_acc+=len; //accumulated length; this is done because in sic_buffer we have only data symbols
+ 
+    multadd_complex_vector_real_scalar((int16_t *)sic_data,
+					amp_tmp,
+				        (int16_t *)rho_amp_x0,
+				        1,
+				        len);	
+    
+    printf ("Got x0*rho_a\n");
+   
+    mult_cpx_vector((int16_t *)rho_amp_x0,
+                    (int16_t *)rho_1,
+                    (int16_t*)rho_rho_amp_x0,
+                    len,
+                    13);
+
+    write_output("rho_rho.m","rho2", rho_rho_amp_x0,len,1,1);
+    printf ("Computed  rho*rho_a*x0\n");
+  
+    sub_cpx_vector16((int16_t *)rxF,
+		     (int16_t *)rho_rho_amp_x0,
+		     (int16_t *)rxF,
+		     len*2); 
+
+    write_output("clean_x1.m","x1", rxF,len,1,1); // the subtraction is in place: rxF holds the cleaned signal
+    printf ("Interference removed \n");
+
+    printf("dlsch_qpsk_llr_SIC: symbol %d,nb_rb %d, len %d,pbch_pss_sss_adjust %d\n",symbol,nb_rb,len,pbch_pss_sss_adjust);
+
+ 
+    //this is for QPSK only!!!
+    for (i=0; i<len*2; i++) {
+      *llr16 = rxF[i];
+      llr16++;
+    }
+    
+  }
+
+  printf("dlsch_qpsk_llr_SIC: acc_len=%d\n",len_acc);
+
+  return(0);
+}
+
+
 //----------------------------------------------------------------------------------------------
 // 16-QAM
 //----------------------------------------------------------------------------------------------
@@ -716,6 +810,124 @@ void dlsch_16qam_llr(LTE_DL_FRAME_PARMS *frame_parms,
   unsigned char symbol_mod,len_mod4=0;
 
 
+#if defined(__x86_64__) || defined(__i386__)
+  if (first_symbol_flag==1) {
+    llr32 = (uint32_t*)dlsch_llr;
+  } else {
+    llr32 = (uint32_t*)*llr32p;
+  }
+#elif defined(__arm__)
+  if (first_symbol_flag==1) {
+    llr16 = (int16_t*)dlsch_llr;
+  } else {
+    llr16 = (int16_t*)*llr32p;
+  }
+#endif
+
+  symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? symbol-(7-frame_parms->Ncp) : symbol;
+
+#if defined(__x86_64__) || defined(__i386__)
+  ch_mag = (__m128i*)&dl_ch_mag[0][(symbol*frame_parms->N_RB_DL*12)];
+#elif defined(__arm__)
+  ch_mag = (int16x8_t*)&dl_ch_mag[0][(symbol*frame_parms->N_RB_DL*12)];
+#endif
+  if ((symbol_mod==0) || (symbol_mod==(4-frame_parms->Ncp))) {
+    if (frame_parms->mode1_flag==0)
+      len = nb_rb*8 - (2*pbch_pss_sss_adjust/3);
+    else
+      len = nb_rb*10 - (5*pbch_pss_sss_adjust/6);
+  } else {
+    len = nb_rb*12 - pbch_pss_sss_adjust;
+  }
+
+  // update output pointer according to number of REs in this symbol (<<2 because 4 bits per RE)
+  if (first_symbol_flag == 1)
+    *llr32p = dlsch_llr + (len<<2);
+  else
+    *llr32p += (len<<2);
+
+  len_mod4 = len&3;
+  len>>=2;  // length in quad words (4 REs)
+  len+=(len_mod4==0 ? 0 : 1);
+
+  for (i=0; i<len; i++) {
+
+#if defined(__x86_64__) || defined(__i386)
+    xmm0 = _mm_abs_epi16(rxF[i]);
+    xmm0 = _mm_subs_epi16(ch_mag[i],xmm0);
+
+    // lambda_1=y_R, lambda_2=|y_R|-|h|^2, lamda_3=y_I, lambda_4=|y_I|-|h|^2
+    llr128[0] = _mm_unpacklo_epi32(rxF[i],xmm0);
+    llr128[1] = _mm_unpackhi_epi32(rxF[i],xmm0);
+    llr32[0] = _mm_extract_epi32(llr128[0],0); //((uint32_t *)&llr128[0])[0];
+    llr32[1] = _mm_extract_epi32(llr128[0],1); //((uint32_t *)&llr128[0])[1];
+    llr32[2] = _mm_extract_epi32(llr128[0],2); //((uint32_t *)&llr128[0])[2];
+    llr32[3] = _mm_extract_epi32(llr128[0],3); //((uint32_t *)&llr128[0])[3];
+    llr32[4] = _mm_extract_epi32(llr128[1],0); //((uint32_t *)&llr128[1])[0];
+    llr32[5] = _mm_extract_epi32(llr128[1],1); //((uint32_t *)&llr128[1])[1];
+    llr32[6] = _mm_extract_epi32(llr128[1],2); //((uint32_t *)&llr128[1])[2];
+    llr32[7] = _mm_extract_epi32(llr128[1],3); //((uint32_t *)&llr128[1])[3];
+    llr32+=8;
+#elif defined(__arm__)
+    xmm0 = vabsq_s16(rxF[i]);
+    xmm0 = vqsubq_s16(ch_mag[i],xmm0);
+    // lambda_1=y_R, lambda_2=|y_R|-|h|^2, lamda_3=y_I, lambda_4=|y_I|-|h|^2
+    // RE k of the quad uses lanes 2k and 2k+1 of rxF[i], then the same lanes of xmm0
+    llr16[0] = vgetq_lane_s16(rxF[i],0);
+    llr16[1] = vgetq_lane_s16(rxF[i],1);
+    llr16[2] = vgetq_lane_s16(xmm0,0);
+    llr16[3] = vgetq_lane_s16(xmm0,1);
+    llr16[4] = vgetq_lane_s16(rxF[i],2);
+    llr16[5] = vgetq_lane_s16(rxF[i],3);
+    llr16[6] = vgetq_lane_s16(xmm0,2);
+    llr16[7] = vgetq_lane_s16(xmm0,3);
+    llr16[8] = vgetq_lane_s16(rxF[i],4);
+    llr16[9] = vgetq_lane_s16(rxF[i],5);
+    llr16[10] = vgetq_lane_s16(xmm0,4);
+    llr16[11] = vgetq_lane_s16(xmm0,5);
+    llr16[12] = vgetq_lane_s16(rxF[i],6);
+    llr16[13] = vgetq_lane_s16(rxF[i],7);
+    llr16[14] = vgetq_lane_s16(xmm0,6);
+    llr16[15] = vgetq_lane_s16(xmm0,7);
+    llr16+=16;
+#endif
+
+  }
+
+#if defined(__x86_64__) || defined(__i386__)
+  _mm_empty();
+  _m_empty();
+#endif
+}
+
+void dlsch_16qam_llr_SIC  (LTE_DL_FRAME_PARMS *frame_parms,
+                          int32_t **rxdataF_comp,
+			  int16_t *dlsch_llr,
+			  int32_t **dl_ch_mag,
+			  uint8_t symbol,
+			  uint8_t first_symbol_flag,
+			  uint16_t nb_rb,
+			  uint16_t pbch_pss_sss_adjust,
+			  int16_t **llr32p)
+{
+
+#if defined(__x86_64__) || defined(__i386__)
+  __m128i *rxF = (__m128i*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)];
+  __m128i *ch_mag;
+  __m128i llr128[2];
+  uint32_t *llr32;
+#elif defined(__arm__)
+  int16x8_t *rxF = (int16x8_t*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)];
+  int16x8_t *ch_mag;
+  int16x8_t xmm0;
+  int16_t *llr16;
+#endif
+
+
+  int i,len;
+  unsigned char symbol_mod,len_mod4=0;
+
+
 #if defined(__x86_64__) || defined(__i386__)
   if (first_symbol_flag==1) {
     llr32 = (uint32_t*)dlsch_llr;
@@ -958,6 +1170,153 @@ void dlsch_64qam_llr(LTE_DL_FRAME_PARMS *frame_parms,
 }
 
 
+/* 64QAM LLR generation for one OFDM symbol (variant added for the SIC receiver).
+ * Reads rxdataF_comp[0], dl_ch_mag[0] and dl_ch_magb[0] at the offset of `symbol`
+ * and writes 6 int16 LLRs per RE. Output starts at dlsch_llr when
+ * first_symbol_flag==1, otherwise at *llr_save; on return *llr_save points just
+ * past the 6*len LLRs of this symbol. len depends on symbol_mod (0 and 4-Ncp get
+ * the shorter lengths), mode1_flag, nb_rb and pbch_pss_sss_adjust.
+ * REs are handled four at a time; a partial last group is rounded up, so up to
+ * three extra REs worth of LLRs may be written past the 6*len boundary. */
+void dlsch_64qam_llr_SIC(LTE_DL_FRAME_PARMS *frame_parms,
+			  int32_t **rxdataF_comp,
+			  int16_t *dlsch_llr,
+			  int32_t **dl_ch_mag,
+			  int32_t **dl_ch_magb,
+			  uint8_t symbol,
+			  uint8_t first_symbol_flag,
+			  uint16_t nb_rb,
+			  uint16_t pbch_pss_sss_adjust,
+			  int16_t **llr_save)
+{
+#if defined(__x86_64__) || defined(__i386__)
+  __m128i *rxF = (__m128i*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)];
+  __m128i *ch_mag,*ch_magb;
+#elif defined(__arm__)
+  int16x8_t *rxF = (int16x8_t*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)];
+  int16x8_t *ch_mag,*ch_magb,xmm1,xmm2;
+#endif
+  int i,len,len2;
+  unsigned char symbol_mod,len_mod4;
+  short *llr;
+  int16_t *llr2;
+
+  if (first_symbol_flag==1)
+    llr = dlsch_llr;
+  else
+    llr = *llr_save;
+
+  symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? symbol-(7-frame_parms->Ncp) : symbol;
+
+#if defined(__x86_64__) || defined(__i386__)
+  ch_mag = (__m128i*)&dl_ch_mag[0][(symbol*frame_parms->N_RB_DL*12)];
+  ch_magb = (__m128i*)&dl_ch_magb[0][(symbol*frame_parms->N_RB_DL*12)];
+#elif defined(__arm__)
+  ch_mag = (int16x8_t*)&dl_ch_mag[0][(symbol*frame_parms->N_RB_DL*12)];
+  ch_magb = (int16x8_t*)&dl_ch_magb[0][(symbol*frame_parms->N_RB_DL*12)];
+#endif
+  if ((symbol_mod==0) || (symbol_mod==(4-frame_parms->Ncp))) {
+    if (frame_parms->mode1_flag==0)
+      len = nb_rb*8 - (2*pbch_pss_sss_adjust/3);
+    else
+      len = nb_rb*10 - (5*pbch_pss_sss_adjust/6);
+  } else {
+    len = nb_rb*12 - pbch_pss_sss_adjust;
+  }
+
+  llr2 = llr;
+  llr += (len*6);
+
+  len_mod4 =len&3;
+  len2=len>>2;  // length in quad words (4 REs)
+  len2+=((len_mod4==0)?0:1);  // one more quad word only when a partial group of REs remains
+
+  for (i=0; i<len2; i++) {
+
+#if defined(__x86_64__) || defined(__i386__)
+    xmm1 = _mm_abs_epi16(rxF[i]);
+    xmm1 = _mm_subs_epi16(ch_mag[i],xmm1);
+    xmm2 = _mm_abs_epi16(xmm1);
+    xmm2 = _mm_subs_epi16(ch_magb[i],xmm2);
+#elif defined(__arm__)
+    xmm1 = vabsq_s16(rxF[i]);
+    xmm1 = vsubq_s16(ch_mag[i],xmm1);
+    xmm2 = vabsq_s16(xmm1);
+    xmm2 = vsubq_s16(ch_magb[i],xmm2);
+#endif
+    // Unpack the quad word into 24 LLRs (4 REs x 6 values). For RE k:
+    //   llr2[0..1] : lanes 2k,2k+1 of rxF[i]
+    //   llr2[2..3] : lanes 2k,2k+1 of xmm1 (ch_mag  - |rxF|)
+    //   llr2[4..5] : lanes 2k,2k+1 of xmm2 (ch_magb - |xmm1|)
+    // The four REs are written out explicitly below.
+    llr2[0] = ((short *)&rxF[i])[0];
+    llr2[1] = ((short *)&rxF[i])[1];
+#if defined(__x86_64__) || defined(__i386__)
+    llr2[2] = _mm_extract_epi16(xmm1,0);
+    llr2[3] = _mm_extract_epi16(xmm1,1);
+    llr2[4] = _mm_extract_epi16(xmm2,0);
+    llr2[5] = _mm_extract_epi16(xmm2,1);
+#elif defined(__arm__)
+    llr2[2] = vgetq_lane_s16(xmm1,0);
+    llr2[3] = vgetq_lane_s16(xmm1,1);
+    llr2[4] = vgetq_lane_s16(xmm2,0);
+    llr2[5] = vgetq_lane_s16(xmm2,1);
+#endif
+
+    llr2+=6;
+    llr2[0] = ((short *)&rxF[i])[2];
+    llr2[1] = ((short *)&rxF[i])[3];
+#if defined(__x86_64__) || defined(__i386__)
+    llr2[2] = _mm_extract_epi16(xmm1,2);
+    llr2[3] = _mm_extract_epi16(xmm1,3);
+    llr2[4] = _mm_extract_epi16(xmm2,2);
+    llr2[5] = _mm_extract_epi16(xmm2,3);
+#elif defined(__arm__)
+    llr2[2] = vgetq_lane_s16(xmm1,2);
+    llr2[3] = vgetq_lane_s16(xmm1,3);
+    llr2[4] = vgetq_lane_s16(xmm2,2);
+    llr2[5] = vgetq_lane_s16(xmm2,3);
+#endif
+
+    llr2+=6;
+    llr2[0] = ((short *)&rxF[i])[4];
+    llr2[1] = ((short *)&rxF[i])[5];
+#if defined(__x86_64__) || defined(__i386__)
+    llr2[2] = _mm_extract_epi16(xmm1,4);
+    llr2[3] = _mm_extract_epi16(xmm1,5);
+    llr2[4] = _mm_extract_epi16(xmm2,4);
+    llr2[5] = _mm_extract_epi16(xmm2,5);
+#elif defined(__arm__)
+    llr2[2] = vgetq_lane_s16(xmm1,4);
+    llr2[3] = vgetq_lane_s16(xmm1,5);
+    llr2[4] = vgetq_lane_s16(xmm2,4);
+    llr2[5] = vgetq_lane_s16(xmm2,5);
+#endif
+    llr2+=6;
+    llr2[0] = ((short *)&rxF[i])[6];
+    llr2[1] = ((short *)&rxF[i])[7];
+#if defined(__x86_64__) || defined(__i386__)
+    llr2[2] = _mm_extract_epi16(xmm1,6);
+    llr2[3] = _mm_extract_epi16(xmm1,7);
+    llr2[4] = _mm_extract_epi16(xmm2,6);
+    llr2[5] = _mm_extract_epi16(xmm2,7);
+#elif defined(__arm__)
+    llr2[2] = vgetq_lane_s16(xmm1,6);
+    llr2[3] = vgetq_lane_s16(xmm1,7);
+    llr2[4] = vgetq_lane_s16(xmm2,6);
+    llr2[5] = vgetq_lane_s16(xmm2,7);
+#endif
+    llr2+=6;
+
+  }
+
+  *llr_save = llr;
+#if defined(__x86_64__) || defined(__i386__)
+  _mm_empty();
+  _m_empty();
+#endif
+}
+
 //==============================================================================================
 // DUAL-STREAM
 //==============================================================================================
diff --git a/openair1/PHY/LTE_TRANSPORT/dlsch_modulation.c b/openair1/PHY/LTE_TRANSPORT/dlsch_modulation.c
index 6718c8d8e4..4fd659186d 100644
--- a/openair1/PHY/LTE_TRANSPORT/dlsch_modulation.c
+++ b/openair1/PHY/LTE_TRANSPORT/dlsch_modulation.c
@@ -1092,6 +1092,8 @@ int allocate_REs_in_RB(LTE_DL_FRAME_PARMS *frame_parms,
   return(0);
 }
 
+
+
 int allocate_REs_in_RB_MCH(mod_sym_t **txdataF,
                            uint32_t *jj,
                            uint16_t re_offset,
@@ -1373,7 +1375,7 @@ int dlsch_modulation(mod_sym_t **txdataF,
         mod_order0,
         mod_order1,
         rb_alloc[0],
-        l);
+        len);
 #endif
 
     if (frame_parms->Ncp==0) { // normal prefix
@@ -1559,7 +1561,8 @@ int dlsch_modulation(mod_sym_t **txdataF,
 
       if (rb_alloc_ind > 0) {
         //    printf("Allocated rb %d/symbol %d, skip_half %d, subframe_offset %d, symbol_offset %d, re_offset %d, jj %d\n",rb,l,skip_half,subframe_offset,symbol_offset,re_offset,jj);
-        allocate_REs_in_RB(frame_parms,
+ 
+	  	allocate_REs_in_RB(frame_parms,
                            txdataF,
                            &jj,
                            &jj2,
@@ -1603,6 +1606,159 @@ int dlsch_modulation(mod_sym_t **txdataF,
   return (re_allocated);
 }
 
+
+/* Re-modulate the coded bits of dlsch0 into sic_buffer[0] for the SIC receiver.
+ * The G bits in dlsch0_harq->e (current HARQ process) are mapped to QPSK, 16QAM
+ * or 64QAM symbols according to get_Qm(mcs) and stored back to back from index
+ * 0, without resource-element mapping. Symbols are accumulated with +=, so any
+ * previous content of sic_buffer[0] is added to the result.
+ * amp is overwritten with 1; subframe_offset and frame_parms are only used by
+ * the DEBUG_DLSCH_MODULATION trace; num_pdcch_symbols and dlsch1 are unused.
+ * Returns the number of symbols written. */
+int dlsch_modulation_SIC(mod_sym_t **sic_buffer,
+                     int16_t amp,
+                     uint32_t subframe_offset,
+                     LTE_DL_FRAME_PARMS *frame_parms,
+                     uint8_t num_pdcch_symbols,
+                     LTE_eNB_DLSCH_t *dlsch0,
+                     LTE_eNB_DLSCH_t *dlsch1,
+		     int G)
+{
+  uint8_t harq_pid = dlsch0->current_harq_pid;
+  LTE_DL_eNB_HARQ_t *dlsch0_harq = dlsch0->harq_processes[harq_pid];
+  uint32_t i,jj;
+  uint32_t re_allocated = 0;                   // returned count, must start at 0
+  uint32_t nbits = (G > 0) ? (uint32_t)G : 0;  // G<=0 means nothing to modulate
+  uint8_t mod_order0 = get_Qm(dlsch0_harq->mcs);
+  uint8_t *x0  = dlsch0_harq->e;
+  uint8_t qam64_table_offset_re = 0;
+  uint8_t qam64_table_offset_im = 0;
+  uint8_t qam16_table_offset_re = 0;
+  uint8_t qam16_table_offset_im = 0;
+  int16_t gain_lin_QPSK;
+  int16_t *sym;                                // re ([0]) / im ([1]) of the symbol being written
+#ifdef DEBUG_DLSCH_MODULATION
+  uint8_t Nl0 = dlsch0_harq->Nl;
+#endif
+
+  VCD_SIGNAL_DUMPER_DUMP_FUNCTION_BY_NAME(VCD_SIGNAL_DUMPER_FUNCTIONS_ENB_DLSCH_MODULATION, VCD_FUNCTION_IN);
+
+  amp=1; //we do full scale here for SIC
+  gain_lin_QPSK = (int16_t)((amp*ONE_OVER_SQRT2_Q15));
+
+  // An unsupported modulation order would never advance jj: modulate nothing.
+  if ((mod_order0 != 2) && (mod_order0 != 4) && (mod_order0 != 6))
+    nbits = 0;
+
+  jj = 0;
+  i = 0;
+  // Stop as soon as fewer than mod_order0 bits remain, so x0 is never read past G.
+  while (jj + mod_order0 <= nbits) {
+    sym = (int16_t *)&sic_buffer[0][i];
+    re_allocated = re_allocated + 1;
+
+    switch (mod_order0) {
+    case 2:  //QPSK
+      /* TODO: handle more than 1 antenna */
+      sym[0] += (x0[jj]==1) ? (-gain_lin_QPSK) : gain_lin_QPSK; //I //b_i
+
+      jj = jj + 1;
+
+      sym[1] += (x0[jj]==1) ? (-gain_lin_QPSK) : gain_lin_QPSK; //Q //b_{i+1}
+
+      jj = jj + 1;
+
+      i++;
+
+      break;
+
+    case 4:  //16QAM
+
+      qam16_table_offset_re = 0;
+      qam16_table_offset_im = 0;
+
+      if (x0[jj] == 1)
+        qam16_table_offset_re+=2;
+
+      jj=jj+1;
+
+      if (x0[jj] == 1)
+        qam16_table_offset_im+=2;
+
+      jj=jj+1;
+
+      if (x0[jj] == 1)
+        qam16_table_offset_re+=1;
+
+      jj=jj+1;
+
+      if (x0[jj] == 1)
+        qam16_table_offset_im+=1;
+
+      jj=jj+1;
+
+      sym[0]+=qam16_table[qam16_table_offset_re];
+      sym[1]+=qam16_table[qam16_table_offset_im];   // imaginary part goes to [1], not [0]
+
+      i++;
+
+      break;
+
+    case 6:  //64QAM
+
+      qam64_table_offset_re = 0;
+      qam64_table_offset_im = 0;
+
+      if (x0[jj] == 1)
+        qam64_table_offset_re+=4;
+
+      jj=jj+1;
+
+      if (x0[jj] == 1)
+        qam64_table_offset_im+=4;
+
+      jj=jj+1;
+
+      if (x0[jj] == 1)
+        qam64_table_offset_re+=2;
+
+      jj=jj+1;
+
+      if (x0[jj] == 1)
+        qam64_table_offset_im+=2;
+
+      jj=jj+1;
+
+      if (x0[jj] == 1)
+        qam64_table_offset_re+=1;
+
+      jj=jj+1;
+
+      if (x0[jj] == 1)
+        qam64_table_offset_im+=1;
+
+      jj=jj+1;
+
+      sym[0]+=qam64_table[qam64_table_offset_re];
+      sym[1]+=qam64_table[qam64_table_offset_im];   // imaginary part goes to [1], not [0]
+
+      i++;
+
+      break;
+    }
+
+  }
+#ifdef DEBUG_DLSCH_MODULATION
+  msg("generate_dlsch : jj = %d,re_allocated = %d (G %d)\n",jj,re_allocated,get_G(frame_parms,dlsch0_harq->nb_rb,dlsch0_harq->rb_alloc,mod_order0,Nl0,2,0,subframe_offset));
+#endif
+
+  VCD_SIGNAL_DUMPER_DUMP_FUNCTION_BY_NAME(VCD_SIGNAL_DUMPER_FUNCTIONS_ENB_DLSCH_MODULATION, VCD_FUNCTION_OUT);
+
+  return (re_allocated);
+}
+
+
+
 int mch_modulation(mod_sym_t **txdataF,
                    int16_t amp,
                    uint32_t subframe_offset,
@@ -1647,7 +1803,7 @@ int mch_modulation(mod_sym_t **txdataF,
 
       if ((frame_parms->N_RB_DL&1) == 1) { // ODD N_RB_DL
 
-        if (rb==(frame_parms->N_RB_DL>>1))
+	  if (rb==(frame_parms->N_RB_DL>>1))
           skip_dc = 1;
         else
           skip_dc = 0;
diff --git a/openair1/PHY/LTE_TRANSPORT/power_control.c b/openair1/PHY/LTE_TRANSPORT/power_control.c
index 492f15c218..d0d2742443 100644
--- a/openair1/PHY/LTE_TRANSPORT/power_control.c
+++ b/openair1/PHY/LTE_TRANSPORT/power_control.c
@@ -29,8 +29,7 @@
 #include "PHY/defs.h"
 #include "PHY/impl_defs_lte.h"
 
-//#define DEBUG_PC 
-
+#define DEBUG_PC 0
 /*
 double ratioPB[2][4]={{ 1.0,4.0/5.0,3.0/5.0,2.0/5.0},
           { 5.0/4.0,1.0,3.0/4.0,1.0/2.0}};
diff --git a/openair1/PHY/LTE_TRANSPORT/proto.h b/openair1/PHY/LTE_TRANSPORT/proto.h
index 91f3503904..17436bf869 100644
--- a/openair1/PHY/LTE_TRANSPORT/proto.h
+++ b/openair1/PHY/LTE_TRANSPORT/proto.h
@@ -185,7 +185,6 @@ int32_t allocate_REs_in_RB(LTE_DL_FRAME_PARMS *frame_parms,
                            uint8_t skip_dc,
                            uint8_t skip_half);
 
-
 /** \fn int32_t dlsch_modulation(mod_sym_t **txdataF,
     int16_t amp,
     uint32_t sub_frame_offset,
@@ -210,6 +209,15 @@ int32_t dlsch_modulation(mod_sym_t **txdataF,
                          uint8_t num_pdcch_symbols,
                          LTE_eNB_DLSCH_t *dlsch0,
                          LTE_eNB_DLSCH_t *dlsch1);
+
+int32_t dlsch_modulation_SIC(mod_sym_t **sic_buffer,
+                         int16_t amp,
+                         uint32_t sub_frame_offset,
+                         LTE_DL_FRAME_PARMS *frame_parms,
+                         uint8_t num_pdcch_symbols,
+                         LTE_eNB_DLSCH_t *dlsch0,
+                         LTE_eNB_DLSCH_t *dlsch1,
+			 int G);
 /*
   \brief This function is the top-level routine for generation of the sub-frame signal (frequency-domain) for MCH.
   @param txdataF Table of pointers for frequency-domain TX signals
@@ -710,6 +718,16 @@ int32_t dlsch_qpsk_llr(LTE_DL_FRAME_PARMS *frame_parms,
    @param llr128p pointer to pointer to symbol in dlsch_llr
 */
 
+int32_t dlsch_qpsk_llr_SIC(LTE_DL_FRAME_PARMS *frame_parms,
+                       int **rxdataF_comp,
+		       mod_sym_t **sic_buffer,
+		       int **rho_i,
+                       short *dlsch_llr,
+		       uint8_t num_pdcch_symbols,
+                       uint16_t nb_rb,
+                       uint16_t pbch_pss_sss_adjust,
+		       LTE_UE_DLSCH_t *dlsch0);
+
 void dlsch_16qam_llr(LTE_DL_FRAME_PARMS *frame_parms,
                      int32_t **rxdataF_comp,
                      int16_t *dlsch_llr,
@@ -719,7 +737,6 @@ void dlsch_16qam_llr(LTE_DL_FRAME_PARMS *frame_parms,
                      uint16_t nb_rb,
                      uint16_t pbch_pss_sss_adjust,
                      int16_t **llr128p);
-
 /**
    \brief This function generates log-likelihood ratios (decoder input) for single-stream 16QAM received waveforms
    @param frame_parms Frame descriptor structure
@@ -732,6 +749,17 @@ void dlsch_16qam_llr(LTE_DL_FRAME_PARMS *frame_parms,
    @param nb_rb number of RBs for this allocation
    @param pbch_pss_sss_adjust PBCH/PSS/SSS RE adjustment (in REs)
 */
+void dlsch_16qam_llr_SIC(LTE_DL_FRAME_PARMS *frame_parms,
+                         int32_t **rxdataF_comp,
+                         int16_t *dlsch_llr,
+                         int32_t **dl_ch_mag,
+                         uint8_t symbol,
+                         uint8_t first_symbol_flag,
+                         uint16_t nb_rb,
+                         uint16_t pbch_pss_sss_adjust,
+                         int16_t **llr128p);
+
+
 void dlsch_64qam_llr(LTE_DL_FRAME_PARMS *frame_parms,
                      int32_t **rxdataF_comp,
                      int16_t *dlsch_llr,
@@ -743,6 +771,17 @@ void dlsch_64qam_llr(LTE_DL_FRAME_PARMS *frame_parms,
                      uint16_t pbch_pss_sss_adjust,
                      short **llr_save);
 
+void dlsch_64qam_llr_SIC(LTE_DL_FRAME_PARMS *frame_parms,
+                         int32_t **rxdataF_comp,
+                         int16_t *dlsch_llr,
+                         int32_t **dl_ch_mag,
+                         int32_t **dl_ch_magb,
+                         uint8_t symbol,
+                         uint8_t first_symbol_flag,
+                         uint16_t nb_rb,
+                         uint16_t pbch_pss_sss_adjust,
+                         short **llr_save);
+
 /** \fn dlsch_siso(LTE_DL_FRAME_PARMS *frame_parms,
     int32_t **rxdataF_comp,
     int32_t **rxdataF_comp_i,
diff --git a/openair1/PHY/TOOLS/cadd_sv.c b/openair1/PHY/TOOLS/cadd_sv.c
index 2fa052f6c9..2902f2017a 100644
--- a/openair1/PHY/TOOLS/cadd_sv.c
+++ b/openair1/PHY/TOOLS/cadd_sv.c
@@ -65,7 +65,7 @@ int add_cpx_vector(short *x,
 int add_vector32_scalar(short *x,
                         int alpha,
                         short *y,
-                        unsigned int N)
+                        unsigned int N)
 {
   unsigned int i;                 // loop counter
 
diff --git a/openair1/PHY/TOOLS/cadd_vv.c b/openair1/PHY/TOOLS/cadd_vv.c
index 3bc349463e..8ab8f3a8e1 100644
--- a/openair1/PHY/TOOLS/cadd_vv.c
+++ b/openair1/PHY/TOOLS/cadd_vv.c
@@ -9,7 +9,7 @@
 
 
     OpenAirInterface is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
 
@@ -176,6 +176,37 @@ int add_cpx_vector32(short *x,
   return(0);
 }
 
+int32_t sub_cpx_vector16(int16_t *x,
+                     int16_t *y,
+                     int16_t *z,
+                     uint32_t N)
+{
+  // Saturating 16-bit subtraction z = x - y, done eight int16 values (one
+  // 128-bit word) per iteration. The loop runs N>>3 times, so a remainder
+  // of N mod 8 values is left untouched. Always returns 0.
+  // N therefore counts int16 values, not complex samples.
+  // NOTE(review): the buffers are dereferenced as __m128i; presumably they
+  // must be 16-byte aligned -- confirm with the callers.
+
+  const __m128i *minuend    = (const __m128i *)&x[0];
+  const __m128i *subtrahend = (const __m128i *)&y[0];
+  __m128i *difference       = (__m128i *)&z[0];
+  const unsigned int n_words = N>>3;
+  unsigned int k;
+
+  for (k=0; k<n_words; k++) {
+    difference[k] = _mm_subs_epi16(minuend[k],subtrahend[k]);
+  }
+
+  // reset the MMX state (both spellings, as elsewhere in this patch)
+  _mm_empty();
+  _m_empty();
+
+  return(0);
+}
+
+
+
 int add_real_vector64(short *x,
                       short* y,
                       short *z,
diff --git a/openair1/PHY/TOOLS/cmult_sv.c b/openair1/PHY/TOOLS/cmult_sv.c
index 31ad77be82..a30bbcd321 100644
--- a/openair1/PHY/TOOLS/cmult_sv.c
+++ b/openair1/PHY/TOOLS/cmult_sv.c
@@ -66,18 +66,23 @@ void multadd_complex_vector_real_scalar(int16_t *x,
 
   if (zero_flag == 1)
     for (n=0; n<N>>2; n++) {
+      //print_shorts("x_128[n]=", &x_128[n]);
+      //print_shorts("alpha_128", &alpha_128);
       y_128[n] = mulhi_int16(x_128[n],alpha_128);
+      //print_shorts("y_128[n]=", &y_128[n]); //Q2.13
     }
 
   else
     for (n=0; n<N>>2; n++) {
       y_128[n] = adds_int16(y_128[n],mulhi_int16(x_128[n],alpha_128));
     }
-
+ 
   _mm_empty();
   _m_empty();
 
 }
+
+
 void multadd_real_vector_complex_scalar(int16_t *x,
                                         int16_t *alpha,
                                         int16_t *y,
@@ -387,7 +392,8 @@ int rotate_cpx_vector(int16_t *x,
     m3 = _mm_sra_epi32(m3,shift);        // shift right by shift in order to  compensate for the input amplitude
 
     y_128[0] = _mm_packs_epi32(m2,m3);        // pack in 16bit integers with saturation [re im re im re im re im]
-#elif defined(__arm__)
+    //print_ints("y_128[0]=", &y_128[0]);
+#elif defined(__arm__)
 
   ab_re0 = vmull_s16(((int16x4_t*)xd)[0],((int16x4_t*)&bconj)[0]);
   ab_re1 = vmull_s16(((int16x4_t*)xd)[1],((int16x4_t*)&bconj)[1]);
diff --git a/openair1/PHY/TOOLS/cmult_vv.c b/openair1/PHY/TOOLS/cmult_vv.c
index f970d33224..895bb4f166 100755
--- a/openair1/PHY/TOOLS/cmult_vv.c
+++ b/openair1/PHY/TOOLS/cmult_vv.c
@@ -34,8 +34,12 @@
 
 #if defined(__x86_64__) || defined(__i386__)
 int16_t conjug[8]__attribute__((aligned(16))) = {-1,1,-1,1,-1,1,-1,1} ;
+int16_t conjug2[8]__attribute__((aligned(16))) = {1,-1,1,-1,1,-1,1,-1} ; // sign pattern applied to x1 via _mm_sign_epi16 in mult_cpx_vector (negates the odd, i.e. Im, lanes)
+
 #define simd_q15_t __m128i
 #define simdshort_q15_t __m64
+#define set1_int16(a) _mm_set1_epi16(a)
+#define setr_int16(a0, a1, a2, a3, a4, a5, a6, a7) _mm_setr_epi16(a0, a1, a2, a3, a4, a5, a6, a7 )
 #elif defined(__arm__)
 int16_t conjug[4]__attribute__((aligned(16))) = {-1,1,-1,1} ;
 #define simd_q15_t int16x8_t
@@ -71,6 +75,7 @@ int mult_cpx_conj_vector(int16_t *x1,
 #if defined(__x86_64__) || defined(__i386__)
   simd_q15_t tmp_re,tmp_im;
   simd_q15_t tmpy0,tmpy1;
+  
 #elif defined(__arm__)
   int32x4_t tmp_re,tmp_im;
   int32x4_t tmp_re1,tmp_im1;
@@ -82,7 +87,7 @@ int mult_cpx_conj_vector(int16_t *x1,
   x2_128 = (simd_q15_t *)&x2[0];
   y_128  = (simd_q15_t *)&y[0];
 
-
+ 
   // we compute 4 cpx multiply for each loop
   for(i=0; i<(N>>2); i++) {
   #if defined(__x86_64__) || defined(__i386__)
@@ -131,3 +136,74 @@ int mult_cpx_conj_vector(int16_t *x1,
   return(0);
 }
 
+
+/*!\brief Element-wise complex multiplication y[k] = x1[k] * x2[k] (no conjugate).
+ *
+ * x1, x2 and y all use the interleaved int16 layout
+ * |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)| and are accessed
+ * through 128-bit SIMD pointers, four complex samples per iteration.
+ *
+ * NOTE(review): presumably all three buffers must be 16-byte aligned -- confirm.
+ * NOTE(review): only x86 intrinsics are used and conjug2 is only defined in the
+ * x86 branch at the top of this file; there is no __arm__ variant here.
+ * NOTE(review): the original author's note mentions a right shift of 13 for
+ * "p_a * x0" and 15 for a second, unnamed case -- confirm the intended values.
+ *
+ * \param x1           input 1, assumed to have at most 15 bits of dynamic range
+ * \param x2           input 2, assumed to have at most 14 bits of dynamic range
+ * \param y            output
+ * \param N            number of complex multiplications (WARNING: N>=4); the
+ *                     loop runs N>>2 times, so a remainder of N mod 4 is skipped
+ * \param output_shift arithmetic right shift applied to the 32-bit results
+ *                     before they are packed to int16 with saturation
+ * \return always 0
+ */
+int mult_cpx_vector(int16_t *x1,
+                    int16_t *x2,
+                    int16_t *y,
+                    uint32_t N,
+                    int output_shift)
+{
+  const simd_q15_t *in1 = (const simd_q15_t *)&x1[0];
+  const simd_q15_t *in2 = (const simd_q15_t *)&x2[0];
+  simd_q15_t *out = (simd_q15_t *)&y[0];
+  const uint32_t n_groups = N>>2;   // 4 complex samples per 128-bit word
+  uint32_t k;
+
+  for (k=0; k<n_groups; k++) {
+    const simd_q15_t a = in1[k];
+    const simd_q15_t b = in2[k];
+    simd_q15_t prod_re;
+    simd_q15_t prod_im;
+    simd_q15_t a_swapped;
+    simd_q15_t pairs_lo;
+    simd_q15_t pairs_hi;
+
+    // real part: flip the sign of the imaginary lanes of x1, then the
+    // multiply-add with x2 gives re1*re2 - im1*im2 per 32-bit lane
+    prod_re = _mm_sign_epi16(a,*(__m128i*)&conjug2[0]);
+    prod_re = _mm_madd_epi16(prod_re,b);
+
+    // imaginary part: swap re and im inside every sample of x1, then the
+    // multiply-add with x2 gives im1*re2 + re1*im2 per 32-bit lane
+    a_swapped = _mm_shufflelo_epi16(a,_MM_SHUFFLE(2,3,0,1));
+    a_swapped = _mm_shufflehi_epi16(a_swapped,_MM_SHUFFLE(2,3,0,1));
+    prod_im = _mm_madd_epi16(a_swapped,b);
+
+    // apply the caller-supplied scaling to both parts
+    prod_re = _mm_srai_epi32(prod_re,output_shift);
+    prod_im = _mm_srai_epi32(prod_im,output_shift);
+
+    // re-interleave into (re,im) pairs and narrow to int16 with saturation
+    pairs_lo = _mm_unpacklo_epi32(prod_re,prod_im);
+    pairs_hi = _mm_unpackhi_epi32(prod_re,prod_im);
+    out[k] = _mm_packs_epi32(pairs_lo,pairs_hi);
+  }
+
+  // NOTE(review): both spellings of the MMX-state reset are called, as in the
+  // other vector routines of this patch -- presumably one would suffice.
+  _mm_empty();
+  _m_empty();
+
+  return(0);
+}
diff --git a/openair1/PHY/TOOLS/defs.h b/openair1/PHY/TOOLS/defs.h
index 963c1384b7..959469e382 100644
--- a/openair1/PHY/TOOLS/defs.h
+++ b/openair1/PHY/TOOLS/defs.h
@@ -131,6 +131,12 @@ int mult_cpx_conj_vector(int16_t *x1,
                          uint32_t N,
                          int output_shift);
 
+int mult_cpx_vector(int16_t *x1,
+                    int16_t  *x2,
+                    int16_t *y,
+                    uint32_t N,
+                    int output_shift);  // element-wise y = x1*x2 (no conjugate), defined in cmult_vv.c
+
 // lte_dfts.c
 void init_fft(uint16_t size,
               uint8_t logsize,
@@ -214,6 +220,11 @@ int32_t add_cpx_vector(int16_t *x,
                        int16_t *y,
                        uint32_t N);
 
+int32_t sub_cpx_vector16(int16_t *x,
+			  int16_t *y,
+			  int16_t *z,
+			  uint32_t N);  // saturating z = x - y on int16 values, defined in cadd_vv.c
+
 int32_t add_cpx_vector32(int16_t *x,
                          int16_t *y,
                          int16_t *z,
diff --git a/openair1/PHY/TOOLS/lte_phy_scope.c b/openair1/PHY/TOOLS/lte_phy_scope.c
index fc6af36776..d5f3f3f5e7 100644
--- a/openair1/PHY/TOOLS/lte_phy_scope.c
+++ b/openair1/PHY/TOOLS/lte_phy_scope.c
@@ -768,13 +768,13 @@ void phy_scope_UE(FD_lte_phy_scope_ue *form,
 
     // PDSCH LLRs CW0
     if (pdsch_llr != NULL) {
-        for (i=0; i<coded_bits_per_codeword0/mod0; i++) {
-            llr[i] = (float) pdsch_llr[mod0*i];
+        for (i=0; i<coded_bits_per_codeword0; i++) {
+            llr[i] = (float) pdsch_llr[i];
             bit[i] = (float) i;
         }
 
-        fl_set_xyplot_xbounds(form->pdsch_llr,0,coded_bits_per_codeword0/mod0);        
-        fl_set_xyplot_data(form->pdsch_llr,bit,llr,coded_bits_per_codeword0/mod0,"","","");
+        fl_set_xyplot_xbounds(form->pdsch_llr,0,coded_bits_per_codeword0);        
+        fl_set_xyplot_data(form->pdsch_llr,bit,llr,coded_bits_per_codeword0,"","","");
     }
 
   // PDSCH I/Q of MF Output
@@ -836,13 +836,13 @@ void phy_scope_UE(FD_lte_phy_scope_ue *form,
 
     // PDSCH LLRs CW1
     if (pdsch_llr1 != NULL) {
-        for (i=0; i<coded_bits_per_codeword1/mod1; i++) {
-            llr[i] = (float) pdsch_llr1[mod1*i];
+        for (i=0; i<coded_bits_per_codeword1; i++) {
+            llr[i] = (float) pdsch_llr1[i];
             bit[i] = (float) i;
         }
 
-        fl_set_xyplot_xbounds(form->pdsch_llr1,0,coded_bits_per_codeword1/mod1);        
-        fl_set_xyplot_data(form->pdsch_llr1,bit,llr,coded_bits_per_codeword1/mod1,"","","");
+        fl_set_xyplot_xbounds(form->pdsch_llr1,0,coded_bits_per_codeword1);        
+        fl_set_xyplot_data(form->pdsch_llr1,bit,llr,coded_bits_per_codeword1,"","","");
     }
     
     // PDSCH I/Q of MF Output
diff --git a/openair1/SCHED/phy_procedures_lte_eNb.c b/openair1/SCHED/phy_procedures_lte_eNb.c
index 375c0dcd1f..99122c4ea1 100755
--- a/openair1/SCHED/phy_procedures_lte_eNb.c
+++ b/openair1/SCHED/phy_procedures_lte_eNb.c
@@ -3873,4 +3873,3 @@ void phy_procedures_eNB_lte(unsigned char subframe,PHY_VARS_eNB **phy_vars_eNB,u
   VCD_SIGNAL_DUMPER_DUMP_FUNCTION_BY_NAME(VCD_SIGNAL_DUMPER_FUNCTIONS_PHY_PROCEDURES_ENB_LTE,0);
   stop_meas(&phy_vars_eNB[0]->phy_proc);
 }
-
diff --git a/openair1/SIMULATION/LTE_PHY/dlsim.c b/openair1/SIMULATION/LTE_PHY/dlsim.c
index 1e6a5a3e5b..3efbdb6994 100644
--- a/openair1/SIMULATION/LTE_PHY/dlsim.c
+++ b/openair1/SIMULATION/LTE_PHY/dlsim.c
@@ -253,14 +253,17 @@ int main(int argc, char **argv)
   int TB0_active = 1;
   uint32_t perfect_ce = 0;
 
-  //  LTE_DL_UE_HARQ_t *dlsch0_ue_harq;
-  //  LTE_DL_eNB_HARQ_t *dlsch0_eNB_harq;
+ LTE_DL_UE_HARQ_t *dlsch0_ue_harq;
+ LTE_DL_eNB_HARQ_t *dlsch0_eNB_harq;
   uint8_t Kmimo;
 
   mod_sym_t **sic_buffer;
+  int8_t cw_to_decode_interf;
+  int8_t cw_to_decode_interf_free;
+  int8_t  cw_non_sic;
+  int8_t  cw_sic;
   FILE    *proc_fd = NULL;
   char buf[64];
-
   uint8_t ue_category=4;
   uint32_t Nsoft;
 
@@ -537,7 +540,7 @@ int main(int argc, char **argv)
 	break;	
       case 'u':
 	rx_type = (RX_type_t) atoi(optarg);
-	if (rx_type<rx_standard || rx_type>rx_IC_dual_stream) {
+	if (rx_type<rx_standard || rx_type>rx_SIC_dual_stream) {
 	  printf("Unsupported rx type %d\n",rx_type);
 	  exit(-1);
 	}
@@ -639,7 +642,7 @@ int main(int argc, char **argv)
     printf("only standard rx available for TM1 and TM2\n");
     exit(-1);
   }
-  if (((transmission_mode==5) || (transmission_mode==6)) && (rx_type == rx_IC_dual_stream)) {
+  if (((transmission_mode==5) || (transmission_mode==6)) && (rx_type > rx_IC_single_stream)) {
     printf("only standard rx or single stream IC available for TM5 and TM6\n");
     exit(-1);
   }
@@ -2816,7 +2819,8 @@ n(tikz_fname,"w");
 					      &PHY_vars_eNB->lte_frame_parms,
 					      num_pdcch_symbols,
 					      PHY_vars_eNB->dlsch_eNB[k][0],
-					      PHY_vars_eNB->dlsch_eNB[k][1]);
+					      PHY_vars_eNB->dlsch_eNB[k][1]
+ 					      );
 	      stop_meas(&PHY_vars_eNB->dlsch_modulation_stats);	      
 	      /*
 	      if (trials==0 && round==0)
@@ -3438,34 +3442,42 @@ n(tikz_fname,"w");
 	      }
 	    }
 	  }
-
-	  for (int cw=0; cw<Kmimo;cw++){
-	    PHY_vars_UE->dlsch_ue[0][cw]->rnti = (common_flag==0) ? n_rnti: SI_RNTI;
+	  
+	  if (rx_type==rx_SIC_dual_stream){
+	    cw_to_decode_interf=1;
+	    cw_to_decode_interf_free=1;
+	  }
+	    else {
+	    cw_to_decode_interf=Kmimo; 
+	    } 
+	    
+	  for (cw_non_sic=0; cw_non_sic<cw_to_decode_interf; cw_non_sic++){
+	    PHY_vars_UE->dlsch_ue[0][cw_non_sic]->rnti = (common_flag==0) ? n_rnti: SI_RNTI;
 	    coded_bits_per_codeword = get_G(&PHY_vars_eNB->lte_frame_parms,
-					    PHY_vars_eNB->dlsch_eNB[0][cw]->harq_processes[0]->nb_rb,
-					    PHY_vars_eNB->dlsch_eNB[0][cw]->harq_processes[0]->rb_alloc,
-					    get_Qm(PHY_vars_eNB->dlsch_eNB[0][cw]->harq_processes[0]->mcs),
-					    PHY_vars_eNB->dlsch_eNB[0][cw]->harq_processes[0]->Nl,
+					    PHY_vars_eNB->dlsch_eNB[0][cw_non_sic]->harq_processes[0]->nb_rb,
+					    PHY_vars_eNB->dlsch_eNB[0][cw_non_sic]->harq_processes[0]->rb_alloc,
+					    get_Qm(PHY_vars_eNB->dlsch_eNB[0][cw_non_sic]->harq_processes[0]->mcs),
+					    PHY_vars_eNB->dlsch_eNB[0][cw_non_sic]->harq_processes[0]->Nl,
 					    num_pdcch_symbols,
 					    0,subframe);
-	    PHY_vars_UE->dlsch_ue[0][cw]->harq_processes[PHY_vars_UE->dlsch_ue[0][cw]->current_harq_pid]->G = coded_bits_per_codeword;
-	    PHY_vars_UE->dlsch_ue[0][cw]->harq_processes[PHY_vars_UE->dlsch_ue[0][cw]->current_harq_pid]->Qm = get_Qm(PHY_vars_eNB->dlsch_eNB[0][cw]->harq_processes[0]->mcs);
+	    PHY_vars_UE->dlsch_ue[0][cw_non_sic]->harq_processes[PHY_vars_UE->dlsch_ue[0][cw_non_sic]->current_harq_pid]->G = coded_bits_per_codeword;
+	    PHY_vars_UE->dlsch_ue[0][cw_non_sic]->harq_processes[PHY_vars_UE->dlsch_ue[0][cw_non_sic]->current_harq_pid]->Qm = get_Qm(PHY_vars_eNB->dlsch_eNB[0][cw_non_sic]->harq_processes[0]->mcs);
 	    if (n_frames==1) {
-	      printf("Kmimo=%d, cw=%d, G=%d, TBS=%d\n",Kmimo,cw,coded_bits_per_codeword,
-		     PHY_vars_UE->dlsch_ue[0][cw]->harq_processes[PHY_vars_UE->dlsch_ue[0][cw]->current_harq_pid]->TBS);
+	      printf("Kmimo=%d, cw=%d, G=%d, TBS=%d\n",Kmimo,cw_non_sic,coded_bits_per_codeword,
+		     PHY_vars_UE->dlsch_ue[0][cw_non_sic]->harq_processes[PHY_vars_UE->dlsch_ue[0][cw_non_sic]->current_harq_pid]->TBS);
 	    
 	      // calculate uncoded BER
 	      uncoded_ber_bit = (short*) malloc(sizeof(short)*coded_bits_per_codeword);
 	      AssertFatal(uncoded_ber_bit, "uncoded_ber_bit==NULL");
-	      sprintf(fname,"dlsch%d_rxF_r%d_cw%d_llr.m",eNB_id,round, cw);
-	      sprintf(vname,"dl%d_r%d_cw%d_llr",eNB_id,round, cw);
-	      write_output(fname,vname, PHY_vars_UE->lte_ue_pdsch_vars[0]->llr[cw],coded_bits_per_codeword,1,0);
-	      sprintf(fname,"dlsch_cw%d_e.m", cw);
-	      sprintf(vname,"dlsch_cw%d_e", cw);
-	       write_output(fname, vname,PHY_vars_eNB->dlsch_eNB[0][cw]->harq_processes[0]->e,coded_bits_per_codeword,1,4);
+	      sprintf(fname,"dlsch%d_rxF_r%d_cw%d_llr.m",eNB_id,round, cw_non_sic);
+	      sprintf(vname,"dl%d_r%d_cw%d_llr",eNB_id,round, cw_non_sic);
+	      write_output(fname,vname, PHY_vars_UE->lte_ue_pdsch_vars[0]->llr[cw_non_sic],coded_bits_per_codeword,1,0);
+	      sprintf(fname,"dlsch_cw%d_e.m", cw_non_sic);
+	      sprintf(vname,"dlsch_cw%d_e", cw_non_sic);
+	       write_output(fname, vname,PHY_vars_eNB->dlsch_eNB[0][cw_non_sic]->harq_processes[0]->e,coded_bits_per_codeword,1,4);
 	      uncoded_ber=0;
 	      for (i=0;i<coded_bits_per_codeword;i++) 
-		if (PHY_vars_eNB->dlsch_eNB[0][cw]->harq_processes[0]->e[i] != (PHY_vars_UE->lte_ue_pdsch_vars[0]->llr[cw][i]<0)) {
+		if (PHY_vars_eNB->dlsch_eNB[0][cw_non_sic]->harq_processes[0]->e[i] != (PHY_vars_UE->lte_ue_pdsch_vars[0]->llr[cw_non_sic][i]<0)) {
 		  uncoded_ber_bit[i] = 1;
 		  uncoded_ber++;
 		}
@@ -3474,10 +3486,10 @@ n(tikz_fname,"w");
 	      
 	      uncoded_ber/=coded_bits_per_codeword;
 	      avg_ber += uncoded_ber;
-	      sprintf(fname,"cw%d_uncoded_ber_bit.m", cw);
-	      sprintf(vname,"uncoded_ber_bit_cw%d", cw);
+	      sprintf(fname,"cw%d_uncoded_ber_bit.m", cw_non_sic);
+	      sprintf(vname,"uncoded_ber_bit_cw%d", cw_non_sic);
 	      write_output(fname, vname,uncoded_ber_bit,coded_bits_per_codeword,1,0);
-	      printf("cw %d, uncoded ber %f\n",cw,uncoded_ber);
+	      printf("cw %d, uncoded ber %f\n",cw_non_sic,uncoded_ber);
 	      
 
 	      free(uncoded_ber_bit);
@@ -3488,59 +3500,61 @@ n(tikz_fname,"w");
 	    start_meas(&PHY_vars_UE->dlsch_unscrambling_stats);	      
 	    dlsch_unscrambling(&PHY_vars_UE->lte_frame_parms,
 			       0,
-			       PHY_vars_UE->dlsch_ue[0][cw],
+			       PHY_vars_UE->dlsch_ue[0][cw_non_sic],
 			       coded_bits_per_codeword,
-			       PHY_vars_UE->lte_ue_pdsch_vars[eNB_id]->llr[cw],
-			       cw,
+			       PHY_vars_UE->lte_ue_pdsch_vars[eNB_id]->llr[cw_non_sic],
+			       cw_non_sic,
 			       subframe<<1);
 	    stop_meas(&PHY_vars_UE->dlsch_unscrambling_stats);	      
 
 	    start_meas(&PHY_vars_UE->dlsch_decoding_stats);
 	    ret = dlsch_decoding(PHY_vars_UE,
-				 PHY_vars_UE->lte_ue_pdsch_vars[eNB_id]->llr[cw],		 
+				 PHY_vars_UE->lte_ue_pdsch_vars[eNB_id]->llr[cw_non_sic],		 
 				 &PHY_vars_UE->lte_frame_parms,
-				 PHY_vars_UE->dlsch_ue[0][cw],
-				 PHY_vars_UE->dlsch_ue[0][cw]->harq_processes[PHY_vars_UE->dlsch_ue[0][cw]->current_harq_pid],
+				 PHY_vars_UE->dlsch_ue[0][cw_non_sic],
+				 PHY_vars_UE->dlsch_ue[0][cw_non_sic]->harq_processes[PHY_vars_UE->dlsch_ue[0][cw_non_sic]->current_harq_pid],
 				 subframe,
-				 PHY_vars_UE->dlsch_ue[0][cw]->current_harq_pid,
+				 PHY_vars_UE->dlsch_ue[0][cw_non_sic]->current_harq_pid,
 				 1,llr8_flag);
 	    stop_meas(&PHY_vars_UE->dlsch_decoding_stats); 
-	    
-	    
-	    if (ret <= PHY_vars_UE->dlsch_ue[0][cw]->max_turbo_iterations ) { 
-
-	      if (cw==0) {		
+	   
+	     
+	    if (ret <= PHY_vars_UE->dlsch_ue[0][cw_non_sic]->max_turbo_iterations ) { 
+                 if (n_frames==1) printf("ret=%d\n", ret); // debug output only in single-frame runs, like the prints below
+	      if (cw_non_sic==0) {		
 		avg_iter += ret;
 		iter_trials++;
 	      }
 	      
 	      if (n_frames==1) {
-		printf("cw %d, round %d: No DLSCH errors found, uncoded ber %f\n",cw,round,uncoded_ber);
+		printf("cw %d, round %d: No DLSCH errors found, uncoded ber %f\n",cw_non_sic,round,uncoded_ber);
 #ifdef PRINT_BYTES
-		for (s=0;s<PHY_vars_UE->dlsch_ue[0][cw]->harq_processes[0]->C;s++) {
-		  if (s<PHY_vars_UE->dlsch_ue[0][cw]->harq_processes[0]->Cminus)
-		    Kr = PHY_vars_UE->dlsch_ue[0][cw]->harq_processes[0]->Kminus;
+		for (s=0;s<PHY_vars_UE->dlsch_ue[0][cw_non_sic]->harq_processes[0]->C;s++) {
+		  if (s<PHY_vars_UE->dlsch_ue[0][cw_non_sic]->harq_processes[0]->Cminus)
+		    Kr = PHY_vars_UE->dlsch_ue[0][cw_non_sic]->harq_processes[0]->Kminus;
 		  else
-		    Kr = PHY_vars_UE->dlsch_ue[0][cw]->harq_processes[0]->Kplus;
+		    Kr = PHY_vars_UE->dlsch_ue[0][cw_non_sic]->harq_processes[0]->Kplus;
 		  
 		  Kr_bytes = Kr>>3;
 		  
 		  printf("Decoded_output (Segment %d):\n",s);
 		  for (i=0;i<Kr_bytes;i++)
-		    printf("%d : %x (%x)\n",i,PHY_vars_UE->dlsch_ue[0][cw]->harq_processes[0]->c[s][i],
-			   PHY_vars_UE->dlsch_ue[0][cw]->harq_processes[0]->c[s][i]^PHY_vars_eNB->dlsch_eNB[0][cw]->harq_processes[0]->c[s][i]);
+		    printf("%d : %x (%x)\n",i,PHY_vars_UE->dlsch_ue[0][cw_non_sic]->harq_processes[0]->c[s][i],
+			   PHY_vars_UE->dlsch_ue[0][cw_non_sic]->harq_processes[0]->c[s][i]^PHY_vars_eNB->dlsch_eNB[0][cw_non_sic]->harq_processes[0]->c[s][i]);
 		}
 #endif
 	      }
 	      
-	      PHY_vars_UE->total_TBS[eNB_id] =  PHY_vars_UE->total_TBS[eNB_id] + PHY_vars_UE->dlsch_ue[eNB_id][cw]->harq_processes[PHY_vars_UE->dlsch_ue[eNB_id][cw]->current_harq_pid]->TBS;
+	      PHY_vars_UE->total_TBS[eNB_id] =  PHY_vars_UE->total_TBS[eNB_id] + PHY_vars_UE->dlsch_ue[eNB_id][cw_non_sic]->harq_processes[PHY_vars_UE->dlsch_ue[eNB_id][cw_non_sic]->current_harq_pid]->TBS;
 
+	      // If the  receiver is NOT SIC, Here we are done with both CW, now only to calculate BLER  
+	      //If the receiver IS SIC, we are done only with CW0, CW1 was only compensated by this moment (y1' obtained)
 	      if (PHY_vars_UE->dlsch_ue[eNB_id][0]->harq_processes[PHY_vars_UE->dlsch_ue[eNB_id][0]->current_harq_pid]->mimo_mode == LARGE_CDD) {   //try to decode second stream using SIC
 	      /*
 	      //for (round = 0 ; round < PHY_vars_UE->dlsch_ue[eNB_id][0]->harq_processes[PHY_vars_UE->dlsch_ue[eNB_id][0]->current_harq_pid]->round ; round++) {
 	      // we assume here that the second stream has a lower MCS and is thus more likely to be decoded
 	      // re-encoding of second stream
-	      dlsch0_ue_harq = PHY_vars_UE->dlsch_ue[eNB_id][1]->harq_processes[PHY_vars_UE->dlsch_ue[eNB_id][0]->current_harq_pid];
+	      dlsch0_ue_harq = PHY_vars_UE->dlsch_ue[eNB_id][0]->harq_processes[PHY_vars_UE->dlsch_ue[eNB_id][0]->current_harq_pid];
 	      dlsch0_eNB_harq = PHY_vars_UE->dlsch_eNB[eNB_id]->harq_processes[PHY_vars_UE->dlsch_ue[eNB_id][0]->current_harq_pid];
 	      dlsch0_eNB_harq->mimo_mode    = LARGE_CDD;
 	      dlsch0_eNB_harq->rb_alloc[0]  = dlsch0_ue_harq->rb_alloc[0];
@@ -3554,11 +3568,11 @@ n(tikz_fname,"w");
 	      dlsch0_eNB_harq->dl_power_off = dlsch0_ue_harq->dl_power_off;
 	      dlsch0_eNB_harq->status       = dlsch0_ue_harq->status;
 	      
-	      PHY_vars_UE->dlsch_eNB[eNB_id]->active       = PHY_vars_UE->dlsch_ue[eNB_id][1]->active;
-	      PHY_vars_UE->dlsch_eNB[eNB_id]->rnti         = PHY_vars_UE->dlsch_ue[eNB_id][1]->rnti;
-	      PHY_vars_UE->dlsch_eNB[eNB_id]->current_harq_pid         = PHY_vars_UE->dlsch_ue[eNB_id][1]->current_harq_pid;
+	      PHY_vars_UE->dlsch_eNB[eNB_id]->active       = PHY_vars_UE->dlsch_ue[eNB_id][0]->active;
+	      PHY_vars_UE->dlsch_eNB[eNB_id]->rnti         = PHY_vars_UE->dlsch_ue[eNB_id][0]->rnti;
+	      PHY_vars_UE->dlsch_eNB[eNB_id]->current_harq_pid         = PHY_vars_UE->dlsch_ue[eNB_id][0]->current_harq_pid;
 	      
-	      dlsch_encoding(PHY_vars_UE->dlsch_ue[eNB_id][1]->harq_processes[PHY_vars_UE->dlsch_ue[eNB_id][1]->current_harq_pid]->b,
+	      dlsch_encoding(PHY_vars_UE->dlsch_ue[eNB_id][0]->harq_processes[PHY_vars_UE->dlsch_ue[eNB_id][0]->current_harq_pid]->b,
 			     &PHY_vars_UE->lte_frame_parms,
 			     num_pdcch_symbols,
 			     PHY_vars_UE->dlsch_eNB[eNB_id],
@@ -3630,36 +3644,242 @@ n(tikz_fname,"w");
 	      //}
 	      
 	      }
-	    }	
+	      
+	if ((PHY_vars_UE->dlsch_ue[eNB_id][0]->harq_processes[PHY_vars_UE->dlsch_ue[eNB_id][0]->current_harq_pid]->mimo_mode >=DUALSTREAM_UNIFORM_PRECODING1) &&
+			(PHY_vars_UE->dlsch_ue[eNB_id][0]->harq_processes[PHY_vars_UE->dlsch_ue[eNB_id][0]->current_harq_pid]->mimo_mode <=DUALSTREAM_PUSCH_PRECODING) &&
+			rx_type==rx_SIC_dual_stream) {
+	  
+	  for (int round_sic = 0 ; round_sic < 1 ; round_sic++) { // own counter: 'round' is still needed below for errs[][round] and output names
+	    dlsch0_ue_harq = PHY_vars_UE->dlsch_ue[eNB_id][0]->harq_processes[PHY_vars_UE->dlsch_ue[eNB_id][0]->current_harq_pid];
+	    dlsch0_eNB_harq = PHY_vars_UE->dlsch_eNB[eNB_id]->harq_processes[PHY_vars_UE->dlsch_ue[eNB_id][0]->current_harq_pid];
+	    // copy the UE-side CW0 HARQ parameters into the UE's local dlsch_eNB descriptor used for re-encoding
+	    dlsch0_eNB_harq->mimo_mode    = PHY_vars_UE->dlsch_ue[eNB_id][0]->harq_processes[PHY_vars_UE->dlsch_ue[eNB_id][0]->current_harq_pid]->mimo_mode;
+	    dlsch0_eNB_harq->rb_alloc[0]  = dlsch0_ue_harq->rb_alloc_even[0];
+	    dlsch0_eNB_harq->nb_rb        = dlsch0_ue_harq->nb_rb;
+	    dlsch0_eNB_harq->mcs          = dlsch0_ue_harq->mcs;
+	    dlsch0_eNB_harq->rvidx        = dlsch0_ue_harq->rvidx;
+	    dlsch0_eNB_harq->Nl           = dlsch0_ue_harq->Nl;
+	    dlsch0_eNB_harq->round        = dlsch0_ue_harq->round;
+	    dlsch0_eNB_harq->TBS          = dlsch0_ue_harq->TBS;
+	    dlsch0_eNB_harq->dl_power_off = dlsch0_ue_harq->dl_power_off;
+	    dlsch0_eNB_harq->status       = dlsch0_ue_harq->status;
+	      
+	    PHY_vars_UE->dlsch_eNB[eNB_id]->active                   = PHY_vars_UE->dlsch_ue[eNB_id][0]->active;
+	    PHY_vars_UE->dlsch_eNB[eNB_id]->rnti                     = PHY_vars_UE->dlsch_ue[eNB_id][0]->rnti;
+	    PHY_vars_UE->dlsch_eNB[eNB_id]->current_harq_pid         = PHY_vars_UE->dlsch_ue[eNB_id][0]->current_harq_pid;
+	    dlsch_encoding(PHY_vars_UE->dlsch_ue[eNB_id][0]->harq_processes[PHY_vars_UE->dlsch_ue[eNB_id][0]->current_harq_pid]->b,
+			   &PHY_vars_UE->lte_frame_parms,
+			   num_pdcch_symbols,
+			   PHY_vars_UE->dlsch_eNB[eNB_id],
+			   0,
+			   subframe,
+			   &PHY_vars_UE->dlsch_rate_matching_stats,
+			   &PHY_vars_UE->dlsch_turbo_encoding_stats,
+			   &PHY_vars_UE->dlsch_interleaving_stats);
+	      
+	    coded_bits_per_codeword = get_G(&PHY_vars_UE->lte_frame_parms,
+					    PHY_vars_UE->dlsch_eNB[eNB_id]->harq_processes[PHY_vars_UE->dlsch_eNB[eNB_id]->current_harq_pid]->nb_rb,
+					    PHY_vars_UE->dlsch_eNB[eNB_id]->harq_processes[PHY_vars_UE->dlsch_eNB[eNB_id]->current_harq_pid]->rb_alloc,
+					    get_Qm(PHY_vars_UE->dlsch_eNB[eNB_id]->harq_processes[PHY_vars_UE->dlsch_eNB[eNB_id]->current_harq_pid]->mcs),
+					    PHY_vars_UE->dlsch_eNB[eNB_id]->harq_processes[PHY_vars_UE->dlsch_eNB[eNB_id]->current_harq_pid]->Nl,
+					    num_pdcch_symbols,
+					    0,
+				            subframe);
+	    // scramble the re-encoded bits, then modulate them into sic_buffer
+	    dlsch_scrambling(&PHY_vars_UE->lte_frame_parms,
+			     0,
+			     PHY_vars_UE->dlsch_eNB[eNB_id],
+			     coded_bits_per_codeword,
+			     0,
+			     subframe<<1);
+
+	    re_allocated = dlsch_modulation_SIC(sic_buffer,
+					        AMP,
+					        subframe,
+					        &PHY_vars_UE->lte_frame_parms,
+					        num_pdcch_symbols,
+					        &PHY_vars_UE->dlsch_eNB[0][0],
+					        NULL,
+					        coded_bits_per_codeword);
+	      
+	    write_output("sic_buffer.m","sic", *sic_buffer,re_allocated,1,1);
+	    write_output("rxdataF_comp1.m","rxF_comp1", *PHY_vars_UE->lte_ue_pdsch_vars[eNB_id]->rxdataF_comp1[PHY_vars_UE->dlsch_ue[0][0]->current_harq_pid][round_sic],14*12*25,1,1);
+	    write_output("rxdataF_rho.m","rho", *PHY_vars_UE->lte_ue_pdsch_vars[eNB_id]->dl_ch_rho_ext[PHY_vars_UE->dlsch_ue[0][0]->current_harq_pid][round_sic],14*12*25,1,1);
+	    // NOTE(review): presumably computes llr[1] from rxdataF_comp1, sic_buffer and dl_ch_rho_ext -- confirm against dlsch_qpsk_llr_SIC
+	          
+	    dlsch_qpsk_llr_SIC(&PHY_vars_UE->lte_frame_parms,
+			       PHY_vars_UE->lte_ue_pdsch_vars[eNB_id]->rxdataF_comp1[PHY_vars_UE->dlsch_ue[0][0]->current_harq_pid][round_sic],
+			       sic_buffer,
+		               PHY_vars_UE->lte_ue_pdsch_vars[eNB_id]->dl_ch_rho_ext[PHY_vars_UE->dlsch_ue[0][0]->current_harq_pid][round_sic],
+                               PHY_vars_UE->lte_ue_pdsch_vars[eNB_id]->llr[1],
+		               num_pdcch_symbols,
+                               dlsch0_eNB_harq->nb_rb,
+                               adjust_G2(&PHY_vars_UE->lte_frame_parms,&dlsch0_eNB_harq->rb_alloc[0],2,subframe,num_pdcch_symbols),
+		               PHY_vars_UE->dlsch_ue[eNB_id][0]);
+	        }// round_sic
+	
+            write_output("rxdata_llr1.m","llr1", PHY_vars_UE->lte_ue_pdsch_vars[eNB_id]->llr[1],re_allocated*2,1,0);
+	
+	    for (cw_sic=cw_to_decode_interf_free; cw_sic<cw_to_decode_interf_free+1;cw_sic++){
+	      PHY_vars_UE->dlsch_ue[0][cw_sic]->rnti = (common_flag==0) ? n_rnti: SI_RNTI;
+	      coded_bits_per_codeword = get_G(&PHY_vars_eNB->lte_frame_parms,
+					      PHY_vars_eNB->dlsch_eNB[0][cw_sic]->harq_processes[0]->nb_rb,
+					      PHY_vars_eNB->dlsch_eNB[0][cw_sic]->harq_processes[0]->rb_alloc,
+					      get_Qm(PHY_vars_eNB->dlsch_eNB[0][cw_sic]->harq_processes[0]->mcs),
+					      PHY_vars_eNB->dlsch_eNB[0][cw_sic]->harq_processes[0]->Nl,
+					      num_pdcch_symbols,
+					      0,
+				              subframe);
+	      
+	      PHY_vars_UE->dlsch_ue[0][cw_sic]->harq_processes[PHY_vars_UE->dlsch_ue[0][cw_sic]->current_harq_pid]->G = coded_bits_per_codeword;
+	      PHY_vars_UE->dlsch_ue[0][cw_sic]->harq_processes[PHY_vars_UE->dlsch_ue[0][cw_sic]->current_harq_pid]->Qm = get_Qm(PHY_vars_eNB->dlsch_eNB[0][cw_sic]->harq_processes[0]->mcs);
+	    
+	      if (n_frames==1) {
+	        printf("Kmimo=%d, cw=%d, G=%d, TBS=%d\n",Kmimo,cw_sic,coded_bits_per_codeword,
+		        PHY_vars_UE->dlsch_ue[0][cw_sic]->harq_processes[PHY_vars_UE->dlsch_ue[0][cw_sic]->current_harq_pid]->TBS);
+	    
+	      // calculate uncoded BER
+	        uncoded_ber_bit = (short*) malloc(sizeof(short)*coded_bits_per_codeword);
+	        AssertFatal(uncoded_ber_bit, "uncoded_ber_bit==NULL");
+	        sprintf(fname,"dlsch%d_rxF_r%d_cw%d_llr.m",eNB_id,round, cw_sic);
+	        sprintf(vname,"dl%d_r%d_cw%d_llr",eNB_id,round, cw_sic);
+	        write_output(fname,vname, PHY_vars_UE->lte_ue_pdsch_vars[0]->llr[cw_sic],coded_bits_per_codeword,1,0);
+	        sprintf(fname,"dlsch_cw%d_e.m", cw_sic);
+	        sprintf(vname,"dlsch_cw%d_e", cw_sic);
+	        write_output(fname, vname,PHY_vars_eNB->dlsch_eNB[0][cw_sic]->harq_processes[0]->e,coded_bits_per_codeword,1,4);
+	        uncoded_ber=0;
+	          for (i=0;i<coded_bits_per_codeword;i++) 
+		    if (PHY_vars_eNB->dlsch_eNB[0][cw_sic]->harq_processes[0]->e[i] != (PHY_vars_UE->lte_ue_pdsch_vars[0]->llr[cw_sic][i]<0)) {
+		      uncoded_ber_bit[i] = 1;
+		      uncoded_ber++;
+		    } else
+		      uncoded_ber_bit[i] = 0;
+	      
+	       uncoded_ber/=coded_bits_per_codeword;
+	       avg_ber += uncoded_ber;
+	       sprintf(fname,"cw%d_uncoded_ber_bit.m", cw_sic);
+	       sprintf(vname,"uncoded_ber_bit_cw%d", cw_sic);
+	       write_output(fname, vname,uncoded_ber_bit,coded_bits_per_codeword,1,0);
+	       printf("cw %d, uncoded ber %f\n",cw_sic,uncoded_ber);     
+               free(uncoded_ber_bit);
+	       uncoded_ber_bit = NULL;  
+	    }
+
+	    start_meas(&PHY_vars_UE->dlsch_unscrambling_stats);	      
+	    dlsch_unscrambling(&PHY_vars_UE->lte_frame_parms,
+			       0,
+			       PHY_vars_UE->dlsch_ue[0][cw_sic],
+			       coded_bits_per_codeword,
+			       PHY_vars_UE->lte_ue_pdsch_vars[eNB_id]->llr[cw_sic],
+			       cw_sic,
+			       subframe<<1);
+	    stop_meas(&PHY_vars_UE->dlsch_unscrambling_stats);	      
+
+	    start_meas(&PHY_vars_UE->dlsch_decoding_stats);
+	    ret = dlsch_decoding(PHY_vars_UE,
+				 PHY_vars_UE->lte_ue_pdsch_vars[eNB_id]->llr[cw_sic],		 
+				 &PHY_vars_UE->lte_frame_parms,
+				 PHY_vars_UE->dlsch_ue[0][cw_sic],
+				 PHY_vars_UE->dlsch_ue[0][cw_sic]->harq_processes[PHY_vars_UE->dlsch_ue[0][cw_sic]->current_harq_pid],
+				 subframe,
+				 PHY_vars_UE->dlsch_ue[0][cw_sic]->current_harq_pid,
+				 1,llr8_flag);
+	    stop_meas(&PHY_vars_UE->dlsch_decoding_stats); 
+	   
+	     
+	    if (ret <= PHY_vars_UE->dlsch_ue[0][cw_sic]->max_turbo_iterations ) { 
+	      if (cw_sic==1) {		
+	        avg_iter += ret;
+		iter_trials++;
+	      }
+	      
+	      if (n_frames==1) {
+		printf("cw %d, round %d: No DLSCH errors found, uncoded ber %f\n",cw_sic,round,uncoded_ber);
+#ifdef PRINT_BYTES
+		for (s=0;s<PHY_vars_UE->dlsch_ue[0][cw_sic]->harq_processes[0]->C;s++) {
+		  if (s<PHY_vars_UE->dlsch_ue[0][cw_sic]->harq_processes[0]->Cminus)
+		    Kr = PHY_vars_UE->dlsch_ue[0][cw_sic]->harq_processes[0]->Kminus;
+		  else
+		    Kr = PHY_vars_UE->dlsch_ue[0][cw_sic]->harq_processes[0]->Kplus;
+		  
+		  Kr_bytes = Kr>>3;
+		  
+		  printf("Decoded_output (Segment %d):\n",s);
+		  for (i=0;i<Kr_bytes;i++)
+		    printf("%d : %x (%x)\n",i,PHY_vars_UE->dlsch_ue[0][cw_sic]->harq_processes[0]->c[s][i],
+			   PHY_vars_UE->dlsch_ue[0][cw_sic]->harq_processes[0]->c[s][i]^PHY_vars_eNB->dlsch_eNB[0][cw_sic]->harq_processes[0]->c[s][i]);
+		}
+#endif
+	      }
+	      
+	 //     PHY_vars_UE->total_TBS[eNB_id] =  PHY_vars_UE->total_TBS[eNB_id] + PHY_vars_UE->dlsch_ue[eNB_id][cw_sic]->harq_processes[PHY_vars_UE->dlsch_ue[eNB_id][cw_sic]->current_harq_pid]->TBS;
+
+	    } //if (ret <= PHY_vars_UE->dlsch_ue[0][cw_sic]->max_turbo_iterations ) 
+	   
+	    else {
+	      errs[cw_sic][round]++;
+	      
+	      if (cw_sic==1) { // this loop only visits cw_sic==1; the ==0 test could never be true here
+		avg_iter += ret-1;
+		iter_trials++;
+	      }
+	      
+	      if (n_frames==1) {
+		//if ((n_frames==1) || (SNR>=30)) {
+		printf("cw %d, round %d: DLSCH errors found, uncoded ber %f\n",cw_sic,round,uncoded_ber);
+#ifdef PRINT_BYTES
+		for (s=0;s<PHY_vars_UE->dlsch_ue[0][cw_sic]->harq_processes[0]->C;s++) {
+		  if (s<PHY_vars_UE->dlsch_ue[0][cw_sic]->harq_processes[0]->Cminus)
+		    Kr = PHY_vars_UE->dlsch_ue[0][cw_sic]->harq_processes[0]->Kminus;
+		  else
+		    Kr = PHY_vars_UE->dlsch_ue[0][cw_sic]->harq_processes[0]->Kplus;
+		  
+		  Kr_bytes = Kr>>3;
+		  
+		  printf("Decoded_output (Segment %d):\n",s);
+		  for (i=0;i<Kr_bytes;i++)
+		    printf("%d : %x (%x)\n",i,PHY_vars_UE->dlsch_ue[0][cw_sic]->harq_processes[0]->c[s][i],
+			   PHY_vars_UE->dlsch_ue[0][cw_sic]->harq_processes[0]->c[s][i]^PHY_vars_eNB->dlsch_eNB[0][cw_sic]->harq_processes[0]->c[s][i]);
+		}
+#endif
+	      } //n_frames==1
+	    } //if (ret > PHY_vars_UE->dlsch_ue[0][cw_sic]->max_turbo_iterations ) 
+	  } //for (int cw_1=cw_to_decode_interf_free; cw_1<cw_to_decode_interf_free+1;cw_1++)
+	        
+	    
+	} //if SIC
+	    
+	    
+      } //if (ret <= PHY_vars_UE->dlsch_ue[0][cw_non_sic]->max_turbo_iterations )
 	    else {
-	      errs[cw][round]++;
+	      errs[cw_non_sic][round]++;
 	      
-	      if (cw==0) {
+	      if (cw_non_sic==0) {
 		avg_iter += ret-1;
 		iter_trials++;
 	      }
 	      
-	        if (cw==1) {
+	        if (cw_non_sic==1) {
 		avg_iter += ret-1;
 		iter_trials++;
 	      }
 	      
 	      if (n_frames==1) {
 		//if ((n_frames==1) || (SNR>=30)) {
-		printf("cw %d, round %d: DLSCH errors found, uncoded ber %f\n",cw,round,uncoded_ber);
+		printf("cw %d, round %d: DLSCH errors found, uncoded ber %f\n",cw_non_sic,round,uncoded_ber);
 #ifdef PRINT_BYTES
-		for (s=0;s<PHY_vars_UE->dlsch_ue[0][cw]->harq_processes[0]->C;s++) {
-		  if (s<PHY_vars_UE->dlsch_ue[0][cw]->harq_processes[0]->Cminus)
-		    Kr = PHY_vars_UE->dlsch_ue[0][cw]->harq_processes[0]->Kminus;
+		for (s=0;s<PHY_vars_UE->dlsch_ue[0][cw_non_sic]->harq_processes[0]->C;s++) {
+		  if (s<PHY_vars_UE->dlsch_ue[0][cw_non_sic]->harq_processes[0]->Cminus)
+		    Kr = PHY_vars_UE->dlsch_ue[0][cw_non_sic]->harq_processes[0]->Kminus;
 		  else
-		    Kr = PHY_vars_UE->dlsch_ue[0][cw]->harq_processes[0]->Kplus;
+		    Kr = PHY_vars_UE->dlsch_ue[0][cw_non_sic]->harq_processes[0]->Kplus;
 		  
 		  Kr_bytes = Kr>>3;
 		  
 		  printf("Decoded_output (Segment %d):\n",s);
 		  for (i=0;i<Kr_bytes;i++)
-		    printf("%d : %x (%x)\n",i,PHY_vars_UE->dlsch_ue[0][cw]->harq_processes[0]->c[s][i],
-			   PHY_vars_UE->dlsch_ue[0][cw]->harq_processes[0]->c[s][i]^PHY_vars_eNB->dlsch_eNB[0][cw]->harq_processes[0]->c[s][i]);
+		    printf("%d : %x (%x)\n",i,PHY_vars_UE->dlsch_ue[0][cw_non_sic]->harq_processes[0]->c[s][i],
+			   PHY_vars_UE->dlsch_ue[0][cw_non_sic]->harq_processes[0]->c[s][i]^PHY_vars_eNB->dlsch_eNB[0][cw_non_sic]->harq_processes[0]->c[s][i]);
 		}
 #endif
 	      }
-- 
GitLab