diff --git a/openair1/PHY/CODING/3gpplte_turbo_decoder_sse_8bit.c b/openair1/PHY/CODING/3gpplte_turbo_decoder_sse_8bit.c
index 0227a45a3de8d3e32fa5e0ba290240342cf2c391..99ac8cbf38e71e55749d95385e170f5fd5290805 100644
--- a/openair1/PHY/CODING/3gpplte_turbo_decoder_sse_8bit.c
+++ b/openair1/PHY/CODING/3gpplte_turbo_decoder_sse_8bit.c
@@ -132,18 +132,18 @@ void log_map8(llr_t* systematic,
   msg("log_map, frame_length %d\n",frame_length);
 #endif
 
-  start_meas(gamma_stats) ;
+  if (gamma_stats) start_meas(gamma_stats) ;
   compute_gamma8(m11,m10,systematic,y_parity,frame_length,term_flag) ;
-  stop_meas(gamma_stats);
-  start_meas(alpha_stats) ;
+  if (gamma_stats) stop_meas(gamma_stats);
+  if (alpha_stats) start_meas(alpha_stats) ;
   compute_alpha8(alpha,beta,m11,m10,frame_length,F)                  ;
-  stop_meas(alpha_stats);
-  start_meas(beta_stats)  ;
+  if (alpha_stats) stop_meas(alpha_stats);
+  if (beta_stats) start_meas(beta_stats)  ;
   compute_beta8(alpha,beta,m11,m10,frame_length,F,offset8_flag)      ;
-  stop_meas(beta_stats);
-  start_meas(ext_stats)   ;
+  if (beta_stats) stop_meas(beta_stats);
+  if (ext_stats) start_meas(ext_stats)   ;
   compute_ext8(alpha,beta,m11,m10,ext,systematic,frame_length)       ;
-  stop_meas(ext_stats);
+  if (ext_stats) stop_meas(ext_stats);
 
 
 }
@@ -963,7 +963,7 @@ unsigned char phy_threegpplte_turbo_decoder8(short *y,
   }
 
 
-  start_meas(init_stats);
+  if (init_stats) start_meas(init_stats);
 
 
   if ((n&15)>0) {
@@ -1326,7 +1326,7 @@ unsigned char phy_threegpplte_turbo_decoder8(short *y,
   msg("\n");
 #endif //DEBUG_LOGMAP
 
-  stop_meas(init_stats);
+  if (init_stats) stop_meas(init_stats);
 
   // do log_map from first parity bit
 
@@ -1338,7 +1338,7 @@ unsigned char phy_threegpplte_turbo_decoder8(short *y,
     printf("\n*******************ITERATION %d (n %d, n2 %d), ext %p\n\n",iteration_cnt,n,n2,ext);
 #endif //DEBUG_LOGMAP
 
-    start_meas(intl1_stats);
+    if (intl1_stats) start_meas(intl1_stats);
     pi4_p=pi4tab8[iind];
 
     for (i=0; i<(n2>>4); i++) { // steady-state portion
@@ -1379,7 +1379,7 @@ unsigned char phy_threegpplte_turbo_decoder8(short *y,
 #endif
     }
 
-    stop_meas(intl1_stats);
+    if (intl1_stats) stop_meas(intl1_stats);
 
     // do log_map from second parity bit
 
@@ -1484,7 +1484,7 @@ unsigned char phy_threegpplte_turbo_decoder8(short *y,
 
     // Check if we decoded the block
     if (iteration_cnt>1) {
-      start_meas(intl2_stats);
+      if (intl2_stats) start_meas(intl2_stats);
 
       if ((n2&0x7f) == 0) {  // n2 is a multiple of 128 bits
 
@@ -1623,7 +1623,7 @@ unsigned char phy_threegpplte_turbo_decoder8(short *y,
         break;
       }
 
-      stop_meas(intl2_stats);
+      if (intl2_stats) stop_meas(intl2_stats);
 
       if ((crc == oldcrc) && (crc!=0)) {
         return(iteration_cnt);
diff --git a/openair1/PHY/INIT/lte_init.c b/openair1/PHY/INIT/lte_init.c
index 67d30bbb156e67039cf1668c27e715b31c278cce..52414f24b4fbb11184751d4d582e79fbb26eecb7 100644
--- a/openair1/PHY/INIT/lte_init.c
+++ b/openair1/PHY/INIT/lte_init.c
@@ -864,7 +864,8 @@ void phy_init_lte_top(LTE_DL_FRAME_PARMS *frame_parms)
   generate_16qam_table();
   generate_RIV_tables();
 
-
+  init_unscrambling_lut();
+  init_scrambling_lut();
   //set_taus_seed(1328);
 
 }
@@ -1107,6 +1108,7 @@ int phy_init_lte_ue(PHY_VARS_UE *ue,
 
   init_prach_tables(839);
 
+
   return 0;
 }
 
diff --git a/openair1/PHY/LTE_TRANSPORT/defs.h b/openair1/PHY/LTE_TRANSPORT/defs.h
index 090ae4637f5e756f9a90f67e5c662ad97768a046..26e2d40460ee0dc395ded546c09f6d24d8330945 100644
--- a/openair1/PHY/LTE_TRANSPORT/defs.h
+++ b/openair1/PHY/LTE_TRANSPORT/defs.h
@@ -143,7 +143,7 @@ typedef struct {
   /// downlink power offset field
   uint8_t dl_power_off;
   /// Concatenated "e"-sequences (for definition see 36-212 V8.6 2009-03, p.17-18)
-  uint8_t e[MAX_NUM_CHANNEL_BITS];
+  uint8_t e[MAX_NUM_CHANNEL_BITS] __attribute__((aligned(32)));
   /// Turbo-code outputs (36-212 V8.6 2009-03, p.12
   uint8_t *d[MAX_NUM_DLSCH_SEGMENTS];//[(96+3+(3*6144))];
   /// Sub-block interleaver outputs (36-212 V8.6 2009-03, p.16-17)
@@ -407,7 +407,7 @@ typedef struct {
   /// coded RI bits
   int16_t q_RI[MAX_RI_PAYLOAD];
   /// Concatenated "e"-sequences (for definition see 36-212 V8.6 2009-03, p.17-18)
-  int16_t e[MAX_NUM_CHANNEL_BITS];
+  int16_t e[MAX_NUM_CHANNEL_BITS] __attribute__((aligned(32)));
   /// Temporary h sequence to flag PUSCH_x/PUSCH_y symbols which are not scrambled
   uint8_t h[MAX_NUM_CHANNEL_BITS];
   /// Pointer to the payload
diff --git a/openair1/PHY/LTE_TRANSPORT/dlsch_coding.c b/openair1/PHY/LTE_TRANSPORT/dlsch_coding.c
index 366b93e71da8582be093edbc68ee21d0a9809bae..3e0af70deb9a9a145a8b33922698a68ca6e3aeb1 100644
--- a/openair1/PHY/LTE_TRANSPORT/dlsch_coding.c
+++ b/openair1/PHY/LTE_TRANSPORT/dlsch_coding.c
@@ -67,35 +67,35 @@ void free_eNB_dlsch(LTE_eNB_DLSCH_t *dlsch)
 
   if (dlsch) {
 #ifdef DEBUG_DLSCH_FREE
-    msg("Freeing dlsch %p\n",dlsch);
+    printf("Freeing dlsch %p\n",dlsch);
 #endif
 
     for (i=0; i<dlsch->Mdlharq; i++) {
 #ifdef DEBUG_DLSCH_FREE
-      msg("Freeing dlsch process %d\n",i);
+      printf("Freeing dlsch process %d\n",i);
 #endif
 
       if (dlsch->harq_processes[i]) {
 #ifdef DEBUG_DLSCH_FREE
-        msg("Freeing dlsch process %d (%p)\n",i,dlsch->harq_processes[i]);
+        printf("Freeing dlsch process %d (%p)\n",i,dlsch->harq_processes[i]);
 #endif
 
         if (dlsch->harq_processes[i]->b) {
           free16(dlsch->harq_processes[i]->b,MAX_DLSCH_PAYLOAD_BYTES);
           dlsch->harq_processes[i]->b = NULL;
 #ifdef DEBUG_DLSCH_FREE
-          msg("Freeing dlsch process %d b (%p)\n",i,dlsch->harq_processes[i]->b);
+          printf("Freeing dlsch process %d b (%p)\n",i,dlsch->harq_processes[i]->b);
 #endif
         }
 
 #ifdef DEBUG_DLSCH_FREE
-        msg("Freeing dlsch process %d c (%p)\n",i,dlsch->harq_processes[i]->c);
+        printf("Freeing dlsch process %d c (%p)\n",i,dlsch->harq_processes[i]->c);
 #endif
 
         for (r=0; r<MAX_NUM_DLSCH_SEGMENTS; r++) {
 	  
 #ifdef DEBUG_DLSCH_FREE
-          msg("Freeing dlsch process %d c[%d] (%p)\n",i,r,dlsch->harq_processes[i]->c[r]);
+          printf("Freeing dlsch process %d c[%d] (%p)\n",i,r,dlsch->harq_processes[i]->c[r]);
 #endif
 	  
           if (dlsch->harq_processes[i]->c[r]) {
@@ -169,7 +169,7 @@ LTE_eNB_DLSCH_t *new_eNB_dlsch(unsigned char Kmimo,unsigned char Mdlharq,uint32_
         if (dlsch->harq_processes[i]->b) {
           bzero(dlsch->harq_processes[i]->b,MAX_DLSCH_PAYLOAD_BYTES/bw_scaling);
         } else {
-          msg("Can't get b\n");
+          printf("Can't get b\n");
           exit_flag=1;
         }
 
@@ -181,19 +181,19 @@ LTE_eNB_DLSCH_t *new_eNB_dlsch(unsigned char Kmimo,unsigned char Mdlharq,uint32_
             if (dlsch->harq_processes[i]->c[r]) {
               bzero(dlsch->harq_processes[i]->c[r],((r==0)?8:0) + 3+ 768);
             } else {
-              msg("Can't get c\n");
+              printf("Can't get c\n");
               exit_flag=2;
             }
             if (dlsch->harq_processes[i]->d[r]) {
               bzero(dlsch->harq_processes[i]->d[r],(96+12+3+(3*6144)));
             } else {
-              msg("Can't get d\n");
+              printf("Can't get d\n");
               exit_flag=2;
             }
           }
         }
       } else {
-        msg("Can't get harq_p %d\n",i);
+        printf("Can't get harq_p %d\n",i);
         exit_flag=3;
       }
     }
@@ -254,8 +254,301 @@ void clean_eNb_dlsch(LTE_eNB_DLSCH_t *dlsch)
 }
 
 
-int dlsch_encoding(unsigned char *a,
-                   LTE_DL_FRAME_PARMS *frame_parms,
+int dlsch_encoding_2threads0(te_params *tep) {
+
+  LTE_eNB_DLSCH_t *dlsch          = tep->dlsch;
+  unsigned int G                  = tep->G;
+
+  unsigned short iind;
+  unsigned char harq_pid = dlsch->current_harq_pid;
+  unsigned short nb_rb = dlsch->harq_processes[harq_pid]->nb_rb;
+  unsigned int Kr=0,Kr_bytes,r,r_offset=0;
+  unsigned short m=dlsch->harq_processes[harq_pid]->mcs;
+
+
+  VCD_SIGNAL_DUMPER_DUMP_FUNCTION_BY_NAME(VCD_SIGNAL_DUMPER_FUNCTIONS_ENB_DLSCH_ENCODING, VCD_FUNCTION_IN);
+
+
+
+
+  if (dlsch->harq_processes[harq_pid]->round == 0) {  // this is a new packet
+
+    for (r=0; r<dlsch->harq_processes[harq_pid]->C>>1; r++) {
+
+      if (r<dlsch->harq_processes[harq_pid]->Cminus)
+        Kr = dlsch->harq_processes[harq_pid]->Kminus;
+      else
+        Kr = dlsch->harq_processes[harq_pid]->Kplus;
+
+      Kr_bytes = Kr>>3;
+
+      // get interleaver index for Turbo code (lookup in Table 5.1.3-3 36-212, V8.6 2009-03, p. 13-14)
+      if (Kr_bytes<=64)
+        iind = (Kr_bytes-5);
+      else if (Kr_bytes <=128)
+        iind = 59 + ((Kr_bytes-64)>>1);
+      else if (Kr_bytes <= 256)
+        iind = 91 + ((Kr_bytes-128)>>2);
+      else if (Kr_bytes <= 768)
+        iind = 123 + ((Kr_bytes-256)>>3);
+      else {
+        printf("dlsch_coding: Illegal codeword size %d!!!\n",Kr_bytes);
+        return(-1);
+      }
+
+
+
+      threegpplte_turbo_encoder(dlsch->harq_processes[harq_pid]->c[r],
+                                Kr>>3,
+                                &dlsch->harq_processes[harq_pid]->d[r][96],
+                                (r==0) ? dlsch->harq_processes[harq_pid]->F : 0,
+                                f1f2mat_old[iind*2],   // f1 (see 36121-820, page 14)
+                                f1f2mat_old[(iind*2)+1]  // f2 (see 36121-820, page 14)
+                               );
+      dlsch->harq_processes[harq_pid]->RTC[r] =
+        sub_block_interleaving_turbo(4+(Kr_bytes*8),
+                                     &dlsch->harq_processes[harq_pid]->d[r][96],
+                                     dlsch->harq_processes[harq_pid]->w[r]);
+    }
+
+  }
+
+  // Fill in the "e"-sequence from 36-212, V8.6 2009-03, p. 16-17 (for each "e") and concatenate the
+  // outputs for each code segment, see Section 5.1.5 p.20
+
+  for (r=0; r<dlsch->harq_processes[harq_pid]->C>>1; r++) {
+    r_offset += lte_rate_matching_turbo(dlsch->harq_processes[harq_pid]->RTC[r],
+                                        G,  //G
+                                        dlsch->harq_processes[harq_pid]->w[r],
+                                        dlsch->harq_processes[harq_pid]->e+r_offset,
+                                        dlsch->harq_processes[harq_pid]->C, // C
+                                        dlsch->Nsoft,                    // Nsoft,
+                                        dlsch->Mdlharq,
+                                        dlsch->Kmimo,
+                                        dlsch->harq_processes[harq_pid]->rvidx,
+                                        get_Qm(dlsch->harq_processes[harq_pid]->mcs),
+                                        dlsch->harq_processes[harq_pid]->Nl,
+                                        r,
+                                        nb_rb,
+                                        m);                       // r
+  }
+
+  VCD_SIGNAL_DUMPER_DUMP_FUNCTION_BY_NAME(VCD_SIGNAL_DUMPER_FUNCTIONS_ENB_DLSCH_ENCODING, VCD_FUNCTION_OUT);
+
+  return(0);
+}
+
+extern int oai_exit;
+void *te_thread(void *param) {
+
+  eNB_proc_t *proc = &((te_params *)param)->eNB->proc;
+  while (!oai_exit) {
+
+    if (wait_on_condition(&proc->mutex_te,&proc->cond_te,&proc->instance_cnt_te,"te thread")<0) break;  
+
+    dlsch_encoding_2threads0((te_params*)param);
+
+    if (release_thread(&proc->mutex_te,&proc->instance_cnt_te,"te thread")<0) break;
+
+    if (pthread_cond_signal(&proc->cond_te) != 0) {
+      printf("[eNB] ERROR pthread_cond_signal for te thread exit\n");
+      exit_fun( "ERROR pthread_cond_signal" );
+      return(NULL);
+    }
+  }
+
+  return(NULL);
+}
+
+int dlsch_encoding_2threads(PHY_VARS_eNB *eNB,
+			    unsigned char *a,
+			    uint8_t num_pdcch_symbols,
+			    LTE_eNB_DLSCH_t *dlsch,
+			    int frame,
+			    uint8_t subframe,
+			    time_stats_t *rm_stats,
+			    time_stats_t *te_stats,
+			    time_stats_t *i_stats)
+{
+
+  LTE_DL_FRAME_PARMS *frame_parms = &eNB->frame_parms;
+  eNB_proc_t *proc = &eNB->proc;
+  unsigned int G;
+  unsigned int crc=1;
+  unsigned short iind;
+
+  unsigned char harq_pid = dlsch->current_harq_pid;
+  unsigned short nb_rb = dlsch->harq_processes[harq_pid]->nb_rb;
+  unsigned int A;
+  unsigned char mod_order;
+  unsigned int Kr=0,Kr_bytes,r,r_offset=0;
+  unsigned short m=dlsch->harq_processes[harq_pid]->mcs;
+
+  struct timespec wait;
+
+  wait.tv_sec=0;
+  wait.tv_nsec=5000000L;
+
+  VCD_SIGNAL_DUMPER_DUMP_FUNCTION_BY_NAME(VCD_SIGNAL_DUMPER_FUNCTIONS_ENB_DLSCH_ENCODING, VCD_FUNCTION_IN);
+
+  A = dlsch->harq_processes[harq_pid]->TBS; //6228
+  mod_order = get_Qm(dlsch->harq_processes[harq_pid]->mcs);
+  G = get_G(frame_parms,nb_rb,dlsch->harq_processes[harq_pid]->rb_alloc,mod_order,dlsch->harq_processes[harq_pid]->Nl,num_pdcch_symbols,frame,subframe);
+
+
+  if (dlsch->harq_processes[harq_pid]->round == 0) {  // this is a new packet
+
+    // Add 24-bit crc (polynomial A) to payload
+    crc = crc24a(a,
+                 A)>>8;
+    a[A>>3] = ((uint8_t*)&crc)[2];
+    a[1+(A>>3)] = ((uint8_t*)&crc)[1];
+    a[2+(A>>3)] = ((uint8_t*)&crc)[0];
+
+    dlsch->harq_processes[harq_pid]->B = A+24;
+    memcpy(dlsch->harq_processes[harq_pid]->b,a,(A/8)+4);
+
+    if (lte_segmentation(dlsch->harq_processes[harq_pid]->b,
+                         dlsch->harq_processes[harq_pid]->c,
+                         dlsch->harq_processes[harq_pid]->B,
+                         &dlsch->harq_processes[harq_pid]->C,
+                         &dlsch->harq_processes[harq_pid]->Cplus,
+                         &dlsch->harq_processes[harq_pid]->Cminus,
+                         &dlsch->harq_processes[harq_pid]->Kplus,
+                         &dlsch->harq_processes[harq_pid]->Kminus,
+                         &dlsch->harq_processes[harq_pid]->F)<0)
+      return(-1);
+
+
+
+    if (proc->instance_cnt_te==0) {
+      printf("[eNB] TE thread busy\n");
+      exit_fun("TE thread busy");
+      pthread_mutex_unlock( &proc->mutex_te );
+      return(-1);
+    }
+  
+    ++proc->instance_cnt_te;
+
+    proc->tep.eNB               = eNB;
+    proc->tep.dlsch             = dlsch;
+    proc->tep.G                 = G;
+
+    // wakeup worker to do second half segments 
+    if (pthread_cond_signal(&proc->cond_te) != 0) {
+      printf("[eNB] ERROR pthread_cond_signal for te thread exit\n");
+      exit_fun( "ERROR pthread_cond_signal" );
+      return (-1);
+    }
+
+    pthread_mutex_unlock( &proc->mutex_te );
+
+    for (r=dlsch->harq_processes[harq_pid]->C>>1; r<dlsch->harq_processes[harq_pid]->C; r++) {
+
+      if (r<dlsch->harq_processes[harq_pid]->Cminus)
+        Kr = dlsch->harq_processes[harq_pid]->Kminus;
+      else
+        Kr = dlsch->harq_processes[harq_pid]->Kplus;
+
+      Kr_bytes = Kr>>3;
+
+      // get interleaver index for Turbo code (lookup in Table 5.1.3-3 36-212, V8.6 2009-03, p. 13-14)
+      if (Kr_bytes<=64)
+        iind = (Kr_bytes-5);
+      else if (Kr_bytes <=128)
+        iind = 59 + ((Kr_bytes-64)>>1);
+      else if (Kr_bytes <= 256)
+        iind = 91 + ((Kr_bytes-128)>>2);
+      else if (Kr_bytes <= 768)
+        iind = 123 + ((Kr_bytes-256)>>3);
+      else {
+        printf("dlsch_coding: Illegal codeword size %d!!!\n",Kr_bytes);
+        return(-1);
+      }
+
+
+      start_meas(te_stats);
+      threegpplte_turbo_encoder(dlsch->harq_processes[harq_pid]->c[r],
+                                Kr>>3,
+                                &dlsch->harq_processes[harq_pid]->d[r][96],
+                                (r==0) ? dlsch->harq_processes[harq_pid]->F : 0,
+                                f1f2mat_old[iind*2],   // f1 (see 36121-820, page 14)
+                                f1f2mat_old[(iind*2)+1]  // f2 (see 36121-820, page 14)
+                               );
+      stop_meas(te_stats);
+
+      start_meas(i_stats);
+      dlsch->harq_processes[harq_pid]->RTC[r] =
+        sub_block_interleaving_turbo(4+(Kr_bytes*8),
+                                     &dlsch->harq_processes[harq_pid]->d[r][96],
+                                     dlsch->harq_processes[harq_pid]->w[r]);
+      stop_meas(i_stats);
+    }
+
+  }
+  else {
+
+    proc->tep.eNB          = eNB;
+    proc->tep.dlsch        = dlsch;
+    proc->tep.G            = G;
+    
+    // wakeup worker to do second half segments 
+    if (pthread_cond_signal(&proc->cond_te) != 0) {
+      printf("[eNB] ERROR pthread_cond_signal for te thread exit\n");
+      exit_fun( "ERROR pthread_cond_signal" );
+      return (-1);
+    }
+  }
+
+  // Fill in the "e"-sequence from 36-212, V8.6 2009-03, p. 16-17 (for each "e") and concatenate the
+  // outputs for each code segment, see Section 5.1.5 p.20
+
+  for (r=0; r<dlsch->harq_processes[harq_pid]->C; r++) {
+
+    // get information for E for the segments that are handled by the worker thread
+    if (r<(dlsch->harq_processes[harq_pid]->C>>1)) {
+      int Nl=dlsch->harq_processes[harq_pid]->Nl;
+      int Qm=get_Qm(dlsch->harq_processes[harq_pid]->mcs);
+      int C = dlsch->harq_processes[harq_pid]->C;
+      int Gp = G/Nl/Qm;
+      int GpmodC = Gp%C;
+      if (r < (C-(GpmodC)))
+	r_offset += Nl*Qm * (Gp/C);
+      else
+	r_offset += Nl*Qm * ((GpmodC==0?0:1) + (Gp/C));
+    }
+    else  {
+      start_meas(rm_stats);
+      r_offset += lte_rate_matching_turbo(dlsch->harq_processes[harq_pid]->RTC[r],
+					  G,  //G
+					  dlsch->harq_processes[harq_pid]->w[r],
+					  dlsch->harq_processes[harq_pid]->e+r_offset,
+					  dlsch->harq_processes[harq_pid]->C, // C
+					  dlsch->Nsoft,                    // Nsoft,
+					  dlsch->Mdlharq,
+					  dlsch->Kmimo,
+					  dlsch->harq_processes[harq_pid]->rvidx,
+					  get_Qm(dlsch->harq_processes[harq_pid]->mcs),
+					  dlsch->harq_processes[harq_pid]->Nl,
+					  r,
+					  nb_rb,
+					  m);                       // r
+      stop_meas(rm_stats);
+    }
+  }
+
+  // wait for worker to finish
+
+  wait_on_busy_condition(&proc->mutex_te,&proc->cond_te,&proc->instance_cnt_te,"te thread");  
+
+  
+  VCD_SIGNAL_DUMPER_DUMP_FUNCTION_BY_NAME(VCD_SIGNAL_DUMPER_FUNCTIONS_ENB_DLSCH_ENCODING, VCD_FUNCTION_OUT);
+
+  return(0);
+}
+
+int dlsch_encoding(PHY_VARS_eNB *eNB,
+		   unsigned char *a,
                    uint8_t num_pdcch_symbols,
                    LTE_eNB_DLSCH_t *dlsch,
                    int frame,
@@ -269,6 +562,7 @@ int dlsch_encoding(unsigned char *a,
   unsigned int crc=1;
   unsigned short iind;
 
+  LTE_DL_FRAME_PARMS *frame_parms = &eNB->frame_parms;
   unsigned char harq_pid = dlsch->current_harq_pid;
   unsigned short nb_rb = dlsch->harq_processes[harq_pid]->nb_rb;
   unsigned int A;
@@ -319,6 +613,7 @@ int dlsch_encoding(unsigned char *a,
       return(-1);
 
     for (r=0; r<dlsch->harq_processes[harq_pid]->C; r++) {
+
       if (r<dlsch->harq_processes[harq_pid]->Cminus)
         Kr = dlsch->harq_processes[harq_pid]->Kminus;
       else
@@ -336,7 +631,7 @@ int dlsch_encoding(unsigned char *a,
       else if (Kr_bytes <= 768)
         iind = 123 + ((Kr_bytes-256)>>3);
       else {
-        msg("dlsch_coding: Illegal codeword size %d!!!\n",Kr_bytes);
+        printf("dlsch_coding: Illegal codeword size %d!!!\n",Kr_bytes);
         return(-1);
       }
 
@@ -345,15 +640,15 @@ int dlsch_encoding(unsigned char *a,
       printf("Generating Code Segment %d (%d bits)\n",r,Kr);
       // generate codewords
 
-      msg("bits_per_codeword (Kr)= %d, A %d\n",Kr,A);
-      msg("N_RB = %d\n",nb_rb);
-      msg("Ncp %d\n",frame_parms->Ncp);
-      msg("mod_order %d\n",mod_order);
+      printf("bits_per_codeword (Kr)= %d, A %d\n",Kr,A);
+      printf("N_RB = %d\n",nb_rb);
+      printf("Ncp %d\n",frame_parms->Ncp);
+      printf("mod_order %d\n",mod_order);
 #endif
 
 
 #ifdef DEBUG_DLSCH_CODING
-      msg("Encoding ... iind %d f1 %d, f2 %d\n",iind,f1f2mat_old[iind*2],f1f2mat_old[(iind*2)+1]);
+      printf("Encoding ... iind %d f1 %d, f2 %d\n",iind,f1f2mat_old[iind*2],f1f2mat_old[(iind*2)+1]);
 #endif
       start_meas(te_stats);
       threegpplte_turbo_encoder(dlsch->harq_processes[harq_pid]->c[r],
@@ -385,7 +680,7 @@ int dlsch_encoding(unsigned char *a,
 
   for (r=0; r<dlsch->harq_processes[harq_pid]->C; r++) {
 #ifdef DEBUG_DLSCH_CODING
-    msg("Rate Matching, Code segment %d (coded bits (G) %d,unpunctured/repeated bits per code segment %d,mod_order %d, nb_rb %d)...\n",
+    printf("Rate Matching, Code segment %d (coded bits (G) %d,unpunctured/repeated bits per code segment %d,mod_order %d, nb_rb %d)...\n",
         r,
         G,
         Kr*3,
diff --git a/openair1/PHY/LTE_TRANSPORT/dlsch_modulation.c b/openair1/PHY/LTE_TRANSPORT/dlsch_modulation.c
index 48b3f53be4bd265ae7cb6a8023d7cda4e09b7b2e..e80a86b7cdf7f10626c12c360ed4c0b5f0d5cfc0 100644
--- a/openair1/PHY/LTE_TRANSPORT/dlsch_modulation.c
+++ b/openair1/PHY/LTE_TRANSPORT/dlsch_modulation.c
@@ -325,47 +325,74 @@ int allocate_REs_in_RB_no_pilots_64QAM_siso(LTE_DL_FRAME_PARMS *frame_parms,
 
   if (skip_dc == 0) {
 
+    x0p=&x0[*jj],tti_offset=symbol_offset+re_offset;
 
-    for (x0p=&x0[*jj],tti_offset=symbol_offset+re_offset,re=0; 
+    /*    for (x0p=&x0[*jj],tti_offset=symbol_offset+re_offset,re=0; 
 	 re<12; 
-	 re+=4,x0p+=24,tti_offset+=4) {
+	 re+=4,x0p+=24,tti_offset+=4) {*/
       
-      qam64_table_offset_re=FOUR[x0p[0]];
-      qam64_table_offset_im=FOUR[x0p[1]];
-      qam64_table_offset_re+=TWO[x0p[2]];
-      qam64_table_offset_im+=TWO[x0p[3]];
-      qam64_table_offset_re+=x0p[4];
-      qam64_table_offset_im+=x0p[5];
+      qam64_table_offset_re=(x0p[0]<<2)|(x0p[2]<<1)|x0p[4];
+      qam64_table_offset_im=(x0p[1]<<2)|(x0p[3]<<1)|x0p[5];
       ((int16_t *)&txdataF[0][tti_offset])[0]=qam_table_s0[qam64_table_offset_re];
       ((int16_t *)&txdataF[0][tti_offset])[1]=qam_table_s0[qam64_table_offset_im];
 
-      qam64_table_offset_re=FOUR[x0p[6]];
-      qam64_table_offset_im=FOUR[x0p[7]];
-      qam64_table_offset_re+=TWO[x0p[8]];
-      qam64_table_offset_im+=TWO[x0p[9]];
-      qam64_table_offset_re+=x0p[10];
-      qam64_table_offset_im+=x0p[11];
+      qam64_table_offset_re=(x0p[6]<<2)|(x0p[8]<<1)|x0p[10];
+      qam64_table_offset_im=(x0p[7]<<2)|(x0p[9]<<1)|x0p[11];
       ((int16_t *)&txdataF[0][tti_offset])[2]=qam_table_s0[qam64_table_offset_re];
       ((int16_t *)&txdataF[0][tti_offset])[3]=qam_table_s0[qam64_table_offset_im];
 
-      qam64_table_offset_re=FOUR[x0p[12]];
-      qam64_table_offset_im=FOUR[x0p[13]];
-      qam64_table_offset_re+=TWO[x0p[14]];
-      qam64_table_offset_im+=TWO[x0p[15]];
-      qam64_table_offset_re+=x0p[16];
-      qam64_table_offset_im+=x0p[17];
+      qam64_table_offset_re=(x0p[12]<<2)|(x0p[14]<<1)|x0p[16];
+      qam64_table_offset_im=(x0p[13]<<2)|(x0p[15]<<1)|x0p[17];
       ((int16_t *)&txdataF[0][tti_offset])[4]=qam_table_s0[qam64_table_offset_re];
       ((int16_t *)&txdataF[0][tti_offset])[5]=qam_table_s0[qam64_table_offset_im];
 
-      qam64_table_offset_re=FOUR[x0p[18]];
-      qam64_table_offset_im=FOUR[x0p[19]];
-      qam64_table_offset_re+=TWO[x0p[20]];
-      qam64_table_offset_im+=TWO[x0p[21]];
-      qam64_table_offset_re+=x0p[22];
-      qam64_table_offset_im+=x0p[23];
+      qam64_table_offset_re=(x0p[18]<<2)|(x0p[20]<<1)|x0p[22];
+      qam64_table_offset_im=(x0p[19]<<2)|(x0p[21]<<1)|x0p[23];
       ((int16_t *)&txdataF[0][tti_offset])[6]=qam_table_s0[qam64_table_offset_re];
       ((int16_t *)&txdataF[0][tti_offset])[7]=qam_table_s0[qam64_table_offset_im];
-    }
+
+      qam64_table_offset_re=(x0p[24]<<2)|(x0p[26]<<1)|x0p[28];
+      qam64_table_offset_im=(x0p[25]<<2)|(x0p[27]<<1)|x0p[29];
+      ((int16_t *)&txdataF[0][tti_offset])[8]=qam_table_s0[qam64_table_offset_re];
+      ((int16_t *)&txdataF[0][tti_offset])[9]=qam_table_s0[qam64_table_offset_im];
+
+      qam64_table_offset_re=(x0p[30]<<2)|(x0p[32]<<1)|x0p[34];
+      qam64_table_offset_im=(x0p[31]<<2)|(x0p[33]<<1)|x0p[35];
+      ((int16_t *)&txdataF[0][tti_offset])[10]=qam_table_s0[qam64_table_offset_re];
+      ((int16_t *)&txdataF[0][tti_offset])[11]=qam_table_s0[qam64_table_offset_im];
+
+      qam64_table_offset_re=(x0p[36]<<2)|(x0p[38]<<1)|x0p[40];
+      qam64_table_offset_im=(x0p[37]<<2)|(x0p[39]<<1)|x0p[41];
+      ((int16_t *)&txdataF[0][tti_offset])[12]=qam_table_s0[qam64_table_offset_re];
+      ((int16_t *)&txdataF[0][tti_offset])[13]=qam_table_s0[qam64_table_offset_im];
+
+      qam64_table_offset_re=(x0p[42]<<2)|(x0p[44]<<1)|x0p[46];
+      qam64_table_offset_im=(x0p[43]<<2)|(x0p[45]<<1)|x0p[47];
+      ((int16_t *)&txdataF[0][tti_offset])[14]=qam_table_s0[qam64_table_offset_re];
+      ((int16_t *)&txdataF[0][tti_offset])[15]=qam_table_s0[qam64_table_offset_im];
+
+      qam64_table_offset_re=(x0p[48]<<2)|(x0p[50]<<1)|x0p[52];
+      qam64_table_offset_im=(x0p[49]<<2)|(x0p[51]<<1)|x0p[53];
+      ((int16_t *)&txdataF[0][tti_offset])[16]=qam_table_s0[qam64_table_offset_re];
+      ((int16_t *)&txdataF[0][tti_offset])[17]=qam_table_s0[qam64_table_offset_im];
+
+      qam64_table_offset_re=(x0p[54]<<2)|(x0p[56]<<1)|x0p[58];
+      qam64_table_offset_im=(x0p[55]<<2)|(x0p[57]<<1)|x0p[59];
+      ((int16_t *)&txdataF[0][tti_offset])[18]=qam_table_s0[qam64_table_offset_re];
+      ((int16_t *)&txdataF[0][tti_offset])[19]=qam_table_s0[qam64_table_offset_im];
+
+      qam64_table_offset_re=(x0p[60]<<2)|(x0p[62]<<1)|x0p[64];
+      qam64_table_offset_im=(x0p[61]<<2)|(x0p[63]<<1)|x0p[65];
+      ((int16_t *)&txdataF[0][tti_offset])[20]=qam_table_s0[qam64_table_offset_re];
+      ((int16_t *)&txdataF[0][tti_offset])[21]=qam_table_s0[qam64_table_offset_im];
+
+      qam64_table_offset_re=(x0p[66]<<2)|(x0p[68]<<1)|x0p[70];
+      qam64_table_offset_im=(x0p[67]<<2)|(x0p[69]<<1)|x0p[71];
+      ((int16_t *)&txdataF[0][tti_offset])[22]=qam_table_s0[qam64_table_offset_re];
+      ((int16_t *)&txdataF[0][tti_offset])[23]=qam_table_s0[qam64_table_offset_im];
+
+
+      //    }
   }
   else {
     for (x0p=&x0[*jj],tti_offset=symbol_offset+re_offset,re=0; 
@@ -1668,6 +1695,69 @@ int dlsch_modulation(int32_t **txdataF,
     re_offset = frame_parms->first_carrier_offset;
     symbol_offset = (uint32_t)frame_parms->ofdm_symbol_size*(l+(subframe_offset*nsymb));
 
+    allocate_REs = allocate_REs_in_RB;
+    
+    switch (mod_order0) {
+    case 2:
+      qam_table_s0 = NULL;
+      break;
+    case 4:
+      if (pilots) {
+	qam_table_s0 = qam16_table_b0; 
+	allocate_REs = (dlsch0->harq_processes[harq_pid]->mimo_mode == SISO) ? 
+	  allocate_REs_in_RB_pilots_16QAM_siso :
+	  allocate_REs_in_RB;
+      }
+      else {
+	qam_table_s0 = qam16_table_a0;
+	allocate_REs = (dlsch0->harq_processes[harq_pid]->mimo_mode == SISO) ? 
+	  allocate_REs_in_RB_no_pilots_16QAM_siso :
+	  allocate_REs_in_RB;
+	
+      }
+      break;
+      
+    case 6:
+      if (pilots) {
+	qam_table_s0 = qam64_table_b0; 
+	allocate_REs = (dlsch0->harq_processes[harq_pid]->mimo_mode == SISO) ? 
+	  allocate_REs_in_RB_pilots_64QAM_siso :
+	  allocate_REs_in_RB;
+      }
+      else {
+	qam_table_s0 = qam64_table_a0;
+	allocate_REs = (dlsch0->harq_processes[harq_pid]->mimo_mode == SISO) ? 
+	  allocate_REs_in_RB_no_pilots_64QAM_siso :
+	  allocate_REs_in_RB;
+      }
+      break;
+      
+    }
+    
+    switch (mod_order1) {
+    case 2:
+      qam_table_s1 = NULL;
+      allocate_REs = allocate_REs_in_RB;
+      break;
+    case 4:
+      if (pilots) {
+	qam_table_s1 = qam16_table_b1; 
+      }
+      else {
+	qam_table_s1 = qam16_table_a1;
+      }
+      break;
+    case 6:
+      if (pilots) {
+	qam_table_s1 = qam64_table_b1; 
+      }
+      else {
+	qam_table_s1 = qam64_table_a1;
+      }
+      break;
+      
+    }
+
     //for (aa=0;aa<frame_parms->nb_antennas_tx;aa++)
     //  memset(&txdataF[aa][symbol_offset],0,frame_parms->ofdm_symbol_size<<2);
     //printf("symbol_offset %d,subframe offset %d : pilots %d\n",symbol_offset,subframe_offset,pilots);
@@ -1816,68 +1906,7 @@ int dlsch_modulation(int32_t **txdataF,
         }
       }
 
-      allocate_REs = allocate_REs_in_RB;
-
-      switch (mod_order0) {
-      case 2:
-	qam_table_s0 = NULL;
-	break;
-      case 4:
-	if (pilots) {
-	  qam_table_s0 = qam16_table_b0; 
-	  allocate_REs = (dlsch0->harq_processes[harq_pid]->mimo_mode == SISO) ? 
-	    allocate_REs_in_RB_pilots_16QAM_siso :
-	    allocate_REs_in_RB;
-	}
-	else {
-	  qam_table_s0 = qam16_table_a0;
-	  allocate_REs = (dlsch0->harq_processes[harq_pid]->mimo_mode == SISO) ? 
-	    allocate_REs_in_RB_no_pilots_16QAM_siso :
-	    allocate_REs_in_RB;
-	  
-	}
-	break;
-
-      case 6:
-	if (pilots) {
-	  qam_table_s0 = qam64_table_b0; 
-	  allocate_REs = (dlsch0->harq_processes[harq_pid]->mimo_mode == SISO) ? 
-	                 allocate_REs_in_RB_pilots_64QAM_siso :
-	                 allocate_REs_in_RB;
-	}
-	else {
-	  qam_table_s0 = qam64_table_a0;
-	  allocate_REs = (dlsch0->harq_processes[harq_pid]->mimo_mode == SISO) ? 
-	                 allocate_REs_in_RB_no_pilots_64QAM_siso :
-	                 allocate_REs_in_RB;
-	}
-	break;
-      
-      }
 
-      switch (mod_order1) {
-      case 2:
-	qam_table_s1 = NULL;
-	allocate_REs = allocate_REs_in_RB;
-	break;
-      case 4:
-	if (pilots) {
-	  qam_table_s1 = qam16_table_b1; 
-	}
-	else {
-	  qam_table_s1 = qam16_table_a1;
-	}
-	break;
-      case 6:
-	if (pilots) {
-	  qam_table_s1 = qam64_table_b1; 
-	}
-	else {
-	  qam_table_s1 = qam64_table_a1;
-	}
-	break;
-      
-      }
 
       if (rb_alloc_ind > 0) {
 	//	printf("Allocated rb %d/symbol %d, skip_half %d, subframe_offset %d, symbol_offset %d, re_offset %d, jj %d\n",rb,l,skip_half,subframe_offset,symbol_offset,re_offset,jj);
diff --git a/openair1/PHY/LTE_TRANSPORT/dlsch_scrambling.c b/openair1/PHY/LTE_TRANSPORT/dlsch_scrambling.c
index a5dd4bf76470ed30542892307ea4fad2fdd49455..804c8bcbe7f84d877aedc5065985b932344ab73e 100644
--- a/openair1/PHY/LTE_TRANSPORT/dlsch_scrambling.c
+++ b/openair1/PHY/LTE_TRANSPORT/dlsch_scrambling.c
@@ -48,6 +48,34 @@
 #include "PHY/extern.h"
 #include "UTIL/LOG/vcd_signal_dumper.h"
 
+static inline unsigned int lte_gold_scram(unsigned int *x1, unsigned int *x2, unsigned char reset) __attribute__((always_inline));
+static inline unsigned int lte_gold_scram(unsigned int *x1, unsigned int *x2, unsigned char reset)
+{
+  int n;
+
+  if (reset) {
+    *x1 = 1+ (1<<31);
+    *x2=*x2 ^ ((*x2 ^ (*x2>>1) ^ (*x2>>2) ^ (*x2>>3))<<31);
+
+    // skip first 50 double words (1600 bits)
+    //      printf("n=0 : x1 %x, x2 %x\n",x1,x2);
+    for (n=1; n<50; n++) {
+      *x1 = (*x1>>1) ^ (*x1>>4);
+      *x1 = *x1 ^ (*x1<<31) ^ (*x1<<28);
+      *x2 = (*x2>>1) ^ (*x2>>2) ^ (*x2>>3) ^ (*x2>>4);
+      *x2 = *x2 ^ (*x2<<31) ^ (*x2<<30) ^ (*x2<<29) ^ (*x2<<28);
+    }
+  }
+
+  *x1 = (*x1>>1) ^ (*x1>>4);
+  *x1 = *x1 ^ (*x1<<31) ^ (*x1<<28);
+  *x2 = (*x2>>1) ^ (*x2>>2) ^ (*x2>>3) ^ (*x2>>4);
+  *x2 = *x2 ^ (*x2<<31) ^ (*x2<<30) ^ (*x2<<29) ^ (*x2<<28);
+  return(*x1^*x2);
+  //  printf("n=%d : c %x\n",n,x1^x2);
+
+}
+
 void dlsch_scrambling(LTE_DL_FRAME_PARMS *frame_parms,
                       int mbsfn_flag,
                       LTE_eNB_DLSCH_t *dlsch,
@@ -75,53 +103,57 @@ void dlsch_scrambling(LTE_DL_FRAME_PARMS *frame_parms,
 #ifdef DEBUG_SCRAMBLING
   printf("scrambling: rnti %x, q %d, Ns %d, Nid_cell %d, length %d\n",dlsch->rnti,q,Ns,frame_parms->Nid_cell, G);
 #endif
-  s = lte_gold_generic(&x1, &x2, 1);
+  s = lte_gold_scram(&x1, &x2, 1);
 
   for (i=0; i<(1+(G>>5)); i++) {
 
 #ifdef DEBUG_SCRAMBLING
     printf("scrambling %d : %d => ",k,e[k]);
 #endif
+
+                
+    e[0] = (e[0]) ^ (s&1);      
+    e[1] = (e[1]) ^ ((s>>1)&1);      
+    e[2] = (e[2]) ^ ((s>>2)&1);      
+    e[3] = (e[3]) ^ ((s>>3)&1);      
+    e[4] = (e[4]) ^ ((s>>4)&1);      
+    e[5] = (e[5]) ^ ((s>>5)&1);      
+    e[6] = (e[6]) ^ ((s>>6)&1);      
+    e[7] = (e[7]) ^ ((s>>7)&1);      
+    e[8] = (e[8]) ^ ((s>>8)&1);      
+    e[9] = (e[9]) ^ ((s>>9)&1);      
+    e[10] = (e[10]) ^ ((s>>10)&1);      
+    e[11] = (e[11]) ^ ((s>>11)&1);      
+    e[12] = (e[12]) ^ ((s>>12)&1);      
+    e[13] = (e[13]) ^ ((s>>13)&1);      
+    e[14] = (e[14]) ^ ((s>>14)&1);      
+    e[15] = (e[15]) ^ ((s>>15)&1);      
+    e[16] = (e[16]) ^ ((s>>16)&1);      
+    e[17] = (e[17]) ^ ((s>>17)&1);      
+    e[18] = (e[18]) ^ ((s>>18)&1);      
+    e[19] = (e[19]) ^ ((s>>19)&1);      
+    e[20] = (e[20]) ^ ((s>>20)&1);      
+    e[21] = (e[21]) ^ ((s>>21)&1);      
+    e[22] = (e[22]) ^ ((s>>22)&1);      
+    e[23] = (e[23]) ^ ((s>>23)&1);      
+    e[24] = (e[24]) ^ ((s>>24)&1);      
+    e[25] = (e[25]) ^ ((s>>25)&1);      
+    e[26] = (e[26]) ^ ((s>>26)&1);      
+    e[27] = (e[27]) ^ ((s>>27)&1);      
+    e[28] = (e[28]) ^ ((s>>28)&1);      
+    e[29] = (e[29]) ^ ((s>>29)&1);      
+    e[30] = (e[30]) ^ ((s>>30)&1);      
+    e[31] = (e[31]) ^ ((s>>31)&1);      
     
-    e[0] = (e[0]&1) ^ (s&1);      
-    e[1] = (e[1]&1) ^ ((s>>1)&1);      
-    e[2] = (e[2]&1) ^ ((s>>2)&1);      
-    e[3] = (e[3]&1) ^ ((s>>3)&1);      
-    e[4] = (e[4]&1) ^ ((s>>4)&1);      
-    e[5] = (e[5]&1) ^ ((s>>5)&1);      
-    e[6] = (e[6]&1) ^ ((s>>6)&1);      
-    e[7] = (e[7]&1) ^ ((s>>7)&1);      
-    e[8] = (e[8]&1) ^ ((s>>8)&1);      
-    e[9] = (e[9]&1) ^ ((s>>9)&1);      
-    e[10] = (e[10]&1) ^ ((s>>10)&1);      
-    e[11] = (e[11]&1) ^ ((s>>11)&1);      
-    e[12] = (e[12]&1) ^ ((s>>12)&1);      
-    e[13] = (e[13]&1) ^ ((s>>13)&1);      
-    e[14] = (e[14]&1) ^ ((s>>14)&1);      
-    e[15] = (e[15]&1) ^ ((s>>15)&1);      
-    e[16] = (e[16]&1) ^ ((s>>16)&1);      
-    e[17] = (e[17]&1) ^ ((s>>17)&1);      
-    e[18] = (e[18]&1) ^ ((s>>18)&1);      
-    e[19] = (e[19]&1) ^ ((s>>19)&1);      
-    e[20] = (e[20]&1) ^ ((s>>20)&1);      
-    e[21] = (e[21]&1) ^ ((s>>21)&1);      
-    e[22] = (e[22]&1) ^ ((s>>22)&1);      
-    e[23] = (e[23]&1) ^ ((s>>23)&1);      
-    e[24] = (e[24]&1) ^ ((s>>24)&1);      
-    e[25] = (e[25]&1) ^ ((s>>25)&1);      
-    e[26] = (e[26]&1) ^ ((s>>26)&1);      
-    e[27] = (e[27]&1) ^ ((s>>27)&1);      
-    e[28] = (e[28]&1) ^ ((s>>28)&1);      
-    e[29] = (e[29]&1) ^ ((s>>29)&1);      
-    e[30] = (e[30]&1) ^ ((s>>30)&1);      
-    e[31] = (e[31]&1) ^ ((s>>31)&1);      
-    
+    // This is not faster for some unknown reason
+    //    ((__m128i *)e)[0] = _mm_xor_si128(((__m128i *)e)[0],((__m128i *)scrambling_lut)[s&65535]);
+    //    ((__m128i *)e)[1] = _mm_xor_si128(((__m128i *)e)[1],((__m128i *)scrambling_lut)[s>>16]);
 #ifdef DEBUG_SCRAMBLING
     printf("%d\n",e[k]);
 #endif
     
     
-    s = lte_gold_generic(&x1, &x2, 0);
+    s = lte_gold_scram(&x1, &x2, 0);
     e += 32;
   }
 
@@ -153,7 +185,7 @@ void dlsch_unscrambling(LTE_DL_FRAME_PARMS *frame_parms,
 #ifdef DEBUG_SCRAMBLING
   printf("unscrambling: rnti %x, q %d, Ns %d, Nid_cell %d length %d\n",dlsch->rnti,q,Ns,frame_parms->Nid_cell,G);
 #endif
-  s = lte_gold_generic(&x1, &x2, 1);
+  s = lte_gold_scram(&x1, &x2, 1);
 
   for (i=0; i<(1+(G>>5)); i++) {
     for (j=0; j<32; j++,k++) {
@@ -166,6 +198,30 @@ void dlsch_unscrambling(LTE_DL_FRAME_PARMS *frame_parms,
 #endif
     }
 
-    s = lte_gold_generic(&x1, &x2, 0);
+    s = lte_gold_scram(&x1, &x2, 0);
+  }
+}
+
+void init_unscrambling_lut() {
+
+  uint32_t s;
+  int i=0,j;
+
+  for (s=0;s<=65535;s++) {
+    for (j=0;j<16;j++) {
+      unscrambling_lut[i++] = (int16_t)((((s>>j)&1)<<1)-1);
+    }
+  }
+}
+
+void init_scrambling_lut() {
+
+  uint32_t s;
+  int i=0,j;
+
+  for (s=0;s<=65535;s++) {
+    for (j=0;j<16;j++) {
+      scrambling_lut[i++] = (uint8_t)((s>>j)&1);
+    }
   }
 }
diff --git a/openair1/PHY/LTE_TRANSPORT/proto.h b/openair1/PHY/LTE_TRANSPORT/proto.h
index d0d38b34a62c36ab0a623cac0551e33f4a6655da..f37e4be4801c18f618ae801b417f843dfa8d3c86 100644
--- a/openair1/PHY/LTE_TRANSPORT/proto.h
+++ b/openair1/PHY/LTE_TRANSPORT/proto.h
@@ -93,7 +93,8 @@ LTE_eNB_ULSCH_t *new_eNB_ulsch(uint8_t max_turbo_iterations,uint8_t N_RB_UL, uin
 
 LTE_UE_ULSCH_t *new_ue_ulsch(unsigned char N_RB_UL, uint8_t abstraction_flag);
 
-/** \fn dlsch_encoding(uint8_t *input_buffer,
+/** \fn dlsch_encoding(PHY_VARS_eNB *eNB,
+    uint8_t *input_buffer,
     LTE_DL_FRAME_PARMS *frame_parms,
     uint8_t num_pdcch_symbols,
     LTE_eNB_DLSCH_t *dlsch,
@@ -105,6 +106,7 @@ LTE_UE_ULSCH_t *new_ue_ulsch(unsigned char N_RB_UL, uint8_t abstraction_flag);
     - Channel coding (Turbo coding)
     - Rate matching (sub-block interleaving, bit collection, selection and transmission
     - Code block concatenation
+    @param eNB Pointer to eNB PHY context
     @param input_buffer Pointer to input buffer for sub-frame
     @param frame_parms Pointer to frame descriptor structure
     @param num_pdcch_symbols Number of PDCCH symbols in this subframe
@@ -116,8 +118,8 @@ LTE_UE_ULSCH_t *new_ue_ulsch(unsigned char N_RB_UL, uint8_t abstraction_flag);
     @param i_stats Time statistics for interleaving
     @returns status
 */
-int32_t dlsch_encoding(uint8_t *a,
-                       LTE_DL_FRAME_PARMS *frame_parms,
+int32_t dlsch_encoding(PHY_VARS_eNB *eNB,
+		       uint8_t *a,
                        uint8_t num_pdcch_symbols,
                        LTE_eNB_DLSCH_t *dlsch,
                        int frame,
@@ -126,6 +128,39 @@ int32_t dlsch_encoding(uint8_t *a,
                        time_stats_t *te_stats,
                        time_stats_t *i_stats);
 
+/** \fn dlsch_encoding_2threads(PHY_VARS_eNB *eNB,
+    uint8_t *input_buffer,
+    uint8_t num_pdcch_symbols,
+    LTE_eNB_DLSCH_t *dlsch,
+    int frame,
+    uint8_t subframe)
+    \brief This function performs a subset of the bit-coding functions for LTE as described in 36-212, Release 8.Support is limited to turbo-coded channels (DLSCH/ULSCH). This version spawns 1 worker thread. The implemented functions are:
+    - CRC computation and addition
+    - Code block segmentation and sub-block CRC addition
+    - Channel coding (Turbo coding)
+    - Rate matching (sub-block interleaving, bit collection, selection and transmission
+    - Code block concatenation
+    @param eNB Pointer to eNB PHY context
+    @param input_buffer Pointer to input buffer for sub-frame
+    @param num_pdcch_symbols Number of PDCCH symbols in this subframe
+    @param dlsch Pointer to dlsch to be encoded
+    @param frame Frame number
+    @param subframe Subframe number
+    @param rm_stats Time statistics for rate-matching
+    @param te_stats Time statistics for turbo-encoding
+    @param i_stats Time statistics for interleaving
+    @returns status
+*/
+int32_t dlsch_encoding_2threads(PHY_VARS_eNB *eNB,
+				uint8_t *a,
+				uint8_t num_pdcch_symbols,
+				LTE_eNB_DLSCH_t *dlsch,
+				int frame,
+				uint8_t subframe,
+				time_stats_t *rm_stats,
+				time_stats_t *te_stats,
+				time_stats_t *i_stats);
+
 void dlsch_encoding_emul(PHY_VARS_eNB *phy_vars_eNB,
                          uint8_t *DLSCH_pdu,
                          LTE_eNB_DLSCH_t *dlsch);
@@ -1544,6 +1579,32 @@ unsigned int  ulsch_decoding(PHY_VARS_eNB *phy_vars_eNB,
                              uint8_t Nbundled,
                              uint8_t llr8_flag);
 
+/*!
+  \brief Decoding of ULSCH data component from 36-212. This one spawns 1 worker thread in parallel,half of the segments in each thread.
+  @param phy_vars_eNB Pointer to eNB top-level descriptor
+  @param UE_id ID of UE transmitting this PUSCH
+  @param harq_pid HARQ process ID
+  @param llr8_flag If 1, indicate that the 8-bit turbo decoder should be used
+  @returns 0 on success
+*/
+int ulsch_decoding_data_2thread(PHY_VARS_eNB *eNB,
+				int UE_id,
+				int harq_pid,
+				int llr8_flag);
+
+/*!
+  \brief Decoding of ULSCH data component from 36-212. This one is single thread.
+  @param phy_vars_eNB Pointer to eNB top-level descriptor
+  @param UE_id ID of UE transmitting this PUSCH
+  @param harq_pid HARQ process ID
+  @param llr8_flag If 1, indicate that the 8-bit turbo decoder should be used
+  @returns 0 on success
+*/
+int ulsch_decoding_data(PHY_VARS_eNB *eNB,
+			int UE_id,
+			int harq_pid,
+			int llr8_flag);
+
 uint32_t ulsch_decoding_emul(PHY_VARS_eNB *phy_vars_eNB,
                              eNB_rxtx_proc_t *proc,
 			     uint8_t UE_index,
@@ -1755,6 +1816,9 @@ void compute_prach_seq(PRACH_CONFIG_COMMON *prach_config_common,
 
 void init_prach_tables(int N_ZC);
 
+void init_unscrambling_lut(void);
+void init_scrambling_lut(void);
+
 /*!
   \brief Return the status of MBSFN in this frame/subframe
   @param frame Frame index
diff --git a/openair1/PHY/LTE_TRANSPORT/ulsch_decoding.c b/openair1/PHY/LTE_TRANSPORT/ulsch_decoding.c
index 1d0c8c25e324c54d50c2ab9ec0a8114d9058a004..d194439f3b137b4f93a535a36149bee762adbbfa 100644
--- a/openair1/PHY/LTE_TRANSPORT/ulsch_decoding.c
+++ b/openair1/PHY/LTE_TRANSPORT/ulsch_decoding.c
@@ -217,20 +217,553 @@ uint8_t extract_cqi_crc(uint8_t *cqi,uint8_t CQI_LENGTH)
 }
 
 
-typedef struct {
-  PHY_VARS_eNB *eNB;
+
+
+
+
+int ulsch_decoding_data_2thread0(td_params* tdp) {
+
+  PHY_VARS_eNB *eNB = tdp->eNB;
+  int UE_id         = tdp->UE_id;
+  int harq_pid      = tdp->harq_pid;
+  int llr8_flag     = tdp->llr8_flag;
+
+  unsigned int r,r_offset=0,Kr,Kr_bytes,iind;
+  uint8_t crc_type;
+  int offset = 0;
+  int ret = 1;
+  int16_t dummy_w[MAX_NUM_ULSCH_SEGMENTS][3*(6144+64)];
+  LTE_eNB_ULSCH_t *ulsch = eNB->ulsch[UE_id];
+  LTE_UL_eNB_HARQ_t *ulsch_harq = ulsch->harq_processes[harq_pid];
+  int Q_m = get_Qm_ul(ulsch_harq->mcs);
+  int G = ulsch_harq->nb_rb * (12 * Q_m) * ulsch_harq->Nsymb_pusch;
+  uint32_t E;
+  uint32_t Gp,GpmodC,Nl=1;
+  uint32_t C = ulsch_harq->C;
+
+  uint8_t (*tc)(int16_t *y,
+                uint8_t *,
+                uint16_t,
+                uint16_t,
+                uint16_t,
+                uint8_t,
+                uint8_t,
+                uint8_t,
+                time_stats_t *,
+                time_stats_t *,
+                time_stats_t *,
+                time_stats_t *,
+                time_stats_t *,
+                time_stats_t *,
+                time_stats_t *);
+
+  if (llr8_flag == 0)
+    tc = phy_threegpplte_turbo_decoder16;
+  else
+    tc = phy_threegpplte_turbo_decoder8;
+
+
+
+  // go through first half of segments to get r_offset
+  for (r=0; r<(ulsch_harq->C/2); r++) {
+
+    // Get Turbo interleaver parameters
+    if (r<ulsch_harq->Cminus)
+      Kr = ulsch_harq->Kminus;
+    else
+      Kr = ulsch_harq->Kplus;
+
+    Kr_bytes = Kr>>3;
+
+    if (Kr_bytes<=64)
+      iind = (Kr_bytes-5);
+    else if (Kr_bytes <=128)
+      iind = 59 + ((Kr_bytes-64)>>1);
+    else if (Kr_bytes <= 256)
+      iind = 91 + ((Kr_bytes-128)>>2);
+    else if (Kr_bytes <= 768)
+      iind = 123 + ((Kr_bytes-256)>>3);
+    else {
+      LOG_E(PHY,"ulsch_decoding: Illegal codeword size %d!!!\n",Kr_bytes);
+      return(-1);
+    }
+
+    // This is stolen from rate-matching algorithm to get the value of E
+    
+    Gp = G/Nl/Q_m;
+    GpmodC = Gp%C;
+    
+    if (r < (C-(GpmodC)))
+      E = Nl*Q_m * (Gp/C);
+    else
+      E = Nl*Q_m * ((GpmodC==0?0:1) + (Gp/C));
+    
+    r_offset += E;
+  }
+
+  // go through second half of segments
+  for (; r<(ulsch_harq->C/2); r++) {
+
+
+    //    printf("before subblock deinterleaving c[%d] = %p\n",r,ulsch_harq->c[r]);
+    // Get Turbo interleaver parameters
+    if (r<ulsch_harq->Cminus)
+      Kr = ulsch_harq->Kminus;
+    else
+      Kr = ulsch_harq->Kplus;
+
+    Kr_bytes = Kr>>3;
+
+    if (Kr_bytes<=64)
+      iind = (Kr_bytes-5);
+    else if (Kr_bytes <=128)
+      iind = 59 + ((Kr_bytes-64)>>1);
+    else if (Kr_bytes <= 256)
+      iind = 91 + ((Kr_bytes-128)>>2);
+    else if (Kr_bytes <= 768)
+      iind = 123 + ((Kr_bytes-256)>>3);
+    else {
+      LOG_E(PHY,"ulsch_decoding: Illegal codeword size %d!!!\n",Kr_bytes);
+      return(-1);
+    }
+
+#ifdef DEBUG_ULSCH_DECODING
+    printf("f1 %d, f2 %d, F %d\n",f1f2mat_old[2*iind],f1f2mat_old[1+(2*iind)],(r==0) ? ulsch_harq->F : 0);
+#endif
+
+    memset(&dummy_w[r][0],0,3*(6144+64)*sizeof(short));
+    ulsch_harq->RTC[r] = generate_dummy_w(4+(Kr_bytes*8),
+                                          (uint8_t*)&dummy_w[r][0],
+                                          (r==0) ? ulsch_harq->F : 0);
+
+#ifdef DEBUG_ULSCH_DECODING
+    printf("Rate Matching Segment %d (coded bits (G) %d,unpunctured/repeated bits %d, Q_m %d, nb_rb %d, Nl %d)...\n",
+        r, G,
+        Kr*3,
+        Q_m,
+        nb_rb,
+        ulsch_harq->Nl);
+#endif
+
+
+    if (lte_rate_matching_turbo_rx(ulsch_harq->RTC[r],
+                                   G,
+                                   ulsch_harq->w[r],
+                                   (uint8_t*) &dummy_w[r][0],
+                                   ulsch_harq->e+r_offset,
+                                   ulsch_harq->C,
+                                   NSOFT,
+                                   0,   //Uplink
+                                   1,
+                                   ulsch_harq->rvidx,
+                                   (ulsch_harq->round==0)?1:0,  // clear
+                                   get_Qm_ul(ulsch_harq->mcs),
+                                   1,
+                                   r,
+                                   &E)==-1) {
+      LOG_E(PHY,"ulsch_decoding.c: Problem in rate matching\n");
+      return(-1);
+    }
+
+    r_offset += E;
+
+    sub_block_deinterleaving_turbo(4+Kr,
+                                   &ulsch_harq->d[r][96],
+                                   ulsch_harq->w[r]);
+
+    if (ulsch_harq->C == 1)
+      crc_type = CRC24_A;
+    else
+      crc_type = CRC24_B;
+    
+    
+    ret = tc(&ulsch_harq->d[r][96],
+	     ulsch_harq->c[r],
+	     Kr,
+	     f1f2mat_old[iind*2],
+	     f1f2mat_old[(iind*2)+1],
+	     ulsch->max_turbo_iterations,//MAX_TURBO_ITERATIONS,
+	     crc_type,
+	     (r==0) ? ulsch_harq->F : 0,
+	     &eNB->ulsch_tc_init_stats,
+	     &eNB->ulsch_tc_alpha_stats,
+	     &eNB->ulsch_tc_beta_stats,
+	     &eNB->ulsch_tc_gamma_stats,
+	     &eNB->ulsch_tc_ext_stats,
+	     &eNB->ulsch_tc_intl1_stats,
+	     &eNB->ulsch_tc_intl2_stats);
+    
+    
+  // Reassembly of Transport block here
+
+    if (ret != (1+ulsch->max_turbo_iterations)) {
+      if (r<ulsch_harq->Cminus)
+	Kr = ulsch_harq->Kminus;
+      else
+	Kr = ulsch_harq->Kplus;
+      
+      Kr_bytes = Kr>>3;
+      
+      if (r==0) {
+	memcpy(ulsch_harq->b,
+	       &ulsch_harq->c[0][(ulsch_harq->F>>3)],
+	       Kr_bytes - (ulsch_harq->F>>3) - ((ulsch_harq->C>1)?3:0));
+	offset = Kr_bytes - (ulsch_harq->F>>3) - ((ulsch_harq->C>1)?3:0);
+      } else {
+	memcpy(ulsch_harq->b+offset,
+	       ulsch_harq->c[r],
+	       Kr_bytes - ((ulsch_harq->C>1)?3:0));
+	offset += (Kr_bytes- ((ulsch_harq->C>1)?3:0));
+      }
+      
+    } else {
+      break;
+    }
+    
+  }
+
+  return(ret);
+}
+
+extern int oai_exit;
+void *td_thread(void *param) {
+  PHY_VARS_eNB *eNB = ((td_params*)param)->eNB;
+  eNB_proc_t *proc  = &eNB->proc;
+
+  while (!oai_exit) {
+
+    if (wait_on_condition(&proc->mutex_td,&proc->cond_td,&proc->instance_cnt_td,"td thread")<0) break;  
+
+    ((td_params*)param)->ret = ulsch_decoding_data_2thread0((td_params*)param);
+
+    if (release_thread(&proc->mutex_td,&proc->instance_cnt_td,"td thread")<0) break;
+
+    if (pthread_cond_signal(&proc->cond_td) != 0) {
+      printf("[eNB] ERROR pthread_cond_signal for td thread exit\n");
+      exit_fun( "ERROR pthread_cond_signal" );
+      return(NULL);
+    }
+  }
+
+  return(NULL);
+}
+
+int ulsch_decoding_data_2thread(PHY_VARS_eNB *eNB,int UE_id,int harq_pid,int llr8_flag) {
+
+  eNB_proc_t *proc = &eNB->proc;
+  unsigned int r,r_offset=0,Kr,Kr_bytes,iind;
+  uint8_t crc_type;
+  int offset = 0;
+  int ret = 1;
+  int16_t dummy_w[MAX_NUM_ULSCH_SEGMENTS][3*(6144+64)];
+  LTE_eNB_ULSCH_t *ulsch = eNB->ulsch[UE_id];
+  LTE_UL_eNB_HARQ_t *ulsch_harq = ulsch->harq_processes[harq_pid];
+  int Q_m = get_Qm_ul(ulsch_harq->mcs);
+  int G = ulsch_harq->nb_rb * (12 * Q_m) * ulsch_harq->Nsymb_pusch;
+  unsigned int E;
+
+  uint8_t (*tc)(int16_t *y,
+                uint8_t *,
+                uint16_t,
+                uint16_t,
+                uint16_t,
+                uint8_t,
+                uint8_t,
+                uint8_t,
+                time_stats_t *,
+                time_stats_t *,
+                time_stats_t *,
+                time_stats_t *,
+                time_stats_t *,
+                time_stats_t *,
+                time_stats_t *);
+
+  struct timespec wait;
+
+  wait.tv_sec=0;
+  wait.tv_nsec=5000000L;
+
+
+  if (llr8_flag == 0)
+    tc = phy_threegpplte_turbo_decoder16;
+  else
+    tc = phy_threegpplte_turbo_decoder8;
+
+  if (pthread_mutex_timedlock(&proc->mutex_td,&wait) != 0) {
+    printf("[eNB] ERROR pthread_mutex_lock for TD thread %d (IC %d)\n", proc->instance_cnt_td);
+    exit_fun( "error locking mutex_fep" );
+    return;
+  }
+
+  if (proc->instance_cnt_td==0) {
+    printf("[eNB] TD thread busy\n");
+    exit_fun("TD thread busy");
+    pthread_mutex_unlock( &proc->mutex_td );
+    return;
+  }
   
-} tc_param;
+  ++proc->instance_cnt_te;
+
+  proc->tdp.eNB       = eNB;
+  proc->tdp.UE_id     = UE_id;
+  proc->tdp.harq_pid  = harq_pid;
+  proc->tdp.llr8_flag = llr8_flag;
+
+  // wakeup worker to do second half segments 
+  if (pthread_cond_signal(&proc->cond_td) != 0) {
+    printf("[eNB] ERROR pthread_cond_signal for td thread exit\n");
+    exit_fun( "ERROR pthread_cond_signal" );
+    return (1+ulsch->max_turbo_iterations);
+  }
+
+  pthread_mutex_unlock( &proc->mutex_td );
+
+
+  // go through first half of segments in main thread
+  for (r=0; r<(ulsch_harq->C/2); r++) {
+
+    //    printf("before subblock deinterleaving c[%d] = %p\n",r,ulsch_harq->c[r]);
+    // Get Turbo interleaver parameters
+    if (r<ulsch_harq->Cminus)
+      Kr = ulsch_harq->Kminus;
+    else
+      Kr = ulsch_harq->Kplus;
+
+    Kr_bytes = Kr>>3;
+
+    if (Kr_bytes<=64)
+      iind = (Kr_bytes-5);
+    else if (Kr_bytes <=128)
+      iind = 59 + ((Kr_bytes-64)>>1);
+    else if (Kr_bytes <= 256)
+      iind = 91 + ((Kr_bytes-128)>>2);
+    else if (Kr_bytes <= 768)
+      iind = 123 + ((Kr_bytes-256)>>3);
+    else {
+      LOG_E(PHY,"ulsch_decoding: Illegal codeword size %d!!!\n",Kr_bytes);
+      return(-1);
+    }
+
+#ifdef DEBUG_ULSCH_DECODING
+    printf("f1 %d, f2 %d, F %d\n",f1f2mat_old[2*iind],f1f2mat_old[1+(2*iind)],(r==0) ? ulsch_harq->F : 0);
+#endif
+
+    memset(&dummy_w[r][0],0,3*(6144+64)*sizeof(short));
+    ulsch_harq->RTC[r] = generate_dummy_w(4+(Kr_bytes*8),
+                                          (uint8_t*)&dummy_w[r][0],
+                                          (r==0) ? ulsch_harq->F : 0);
+
+#ifdef DEBUG_ULSCH_DECODING
+    printf("Rate Matching Segment %d (coded bits (G) %d,unpunctured/repeated bits %d, Q_m %d, nb_rb %d, Nl %d)...\n",
+        r, G,
+        Kr*3,
+        Q_m,
+        nb_rb,
+        ulsch_harq->Nl);
+#endif
+
+    start_meas(&eNB->ulsch_rate_unmatching_stats);
+
+    if (lte_rate_matching_turbo_rx(ulsch_harq->RTC[r],
+                                   G,
+                                   ulsch_harq->w[r],
+                                   (uint8_t*) &dummy_w[r][0],
+                                   ulsch_harq->e+r_offset,
+                                   ulsch_harq->C,
+                                   NSOFT,
+                                   0,   //Uplink
+                                   1,
+                                   ulsch_harq->rvidx,
+                                   (ulsch_harq->round==0)?1:0,  // clear
+                                   get_Qm_ul(ulsch_harq->mcs),
+                                   1,
+                                   r,
+                                   &E)==-1) {
+      LOG_E(PHY,"ulsch_decoding.c: Problem in rate matching\n");
+      return(-1);
+    }
+
+    stop_meas(&eNB->ulsch_rate_unmatching_stats);
+    r_offset += E;
+
+    start_meas(&eNB->ulsch_deinterleaving_stats);
+    sub_block_deinterleaving_turbo(4+Kr,
+                                   &ulsch_harq->d[r][96],
+                                   ulsch_harq->w[r]);
+    stop_meas(&eNB->ulsch_deinterleaving_stats);
+
+    if (ulsch_harq->C == 1)
+      crc_type = CRC24_A;
+    else
+      crc_type = CRC24_B;
+
+    start_meas(&eNB->ulsch_turbo_decoding_stats);
+    
+    ret = tc(&ulsch_harq->d[r][96],
+	     ulsch_harq->c[r],
+	     Kr,
+	     f1f2mat_old[iind*2],
+	     f1f2mat_old[(iind*2)+1],
+	     ulsch->max_turbo_iterations,//MAX_TURBO_ITERATIONS,
+	     crc_type,
+	     (r==0) ? ulsch_harq->F : 0,
+	     &eNB->ulsch_tc_init_stats,
+	     &eNB->ulsch_tc_alpha_stats,
+	     &eNB->ulsch_tc_beta_stats,
+	     &eNB->ulsch_tc_gamma_stats,
+	     &eNB->ulsch_tc_ext_stats,
+	     &eNB->ulsch_tc_intl1_stats,
+	     &eNB->ulsch_tc_intl2_stats);
+    
+  // Reassembly of Transport block here
+
+    if (ret != (1+ulsch->max_turbo_iterations)) {
+      if (r<ulsch_harq->Cminus)
+	Kr = ulsch_harq->Kminus;
+      else
+	Kr = ulsch_harq->Kplus;
+      
+      Kr_bytes = Kr>>3;
+      
+      if (r==0) {
+	memcpy(ulsch_harq->b,
+	       &ulsch_harq->c[0][(ulsch_harq->F>>3)],
+	       Kr_bytes - (ulsch_harq->F>>3) - ((ulsch_harq->C>1)?3:0));
+	offset = Kr_bytes - (ulsch_harq->F>>3) - ((ulsch_harq->C>1)?3:0);
+      } else {
+	memcpy(ulsch_harq->b+offset,
+	       ulsch_harq->c[r],
+	       Kr_bytes - ((ulsch_harq->C>1)?3:0));
+	offset += (Kr_bytes- ((ulsch_harq->C>1)?3:0));
+      }
+      
+    } else {
+      break;
+    }
+    stop_meas(&eNB->ulsch_turbo_decoding_stats);    
+  }
+
+   // wait for worker to finish
+
+  wait_on_busy_condition(&proc->mutex_td,&proc->cond_td,&proc->instance_cnt_td,"td thread");  
+
+  return( (ret>proc->tdp.ret) ? ret : proc->tdp.ret );
+}
+
+int ulsch_decoding_data(PHY_VARS_eNB *eNB,int UE_id,int harq_pid,int llr8_flag) {
+
+  unsigned int r,r_offset=0,Kr,Kr_bytes,iind;
+  uint8_t crc_type;
+  int offset = 0;
+  int ret = 1;
+  int16_t dummy_w[MAX_NUM_ULSCH_SEGMENTS][3*(6144+64)];
+  LTE_eNB_ULSCH_t *ulsch = eNB->ulsch[UE_id];
+  LTE_UL_eNB_HARQ_t *ulsch_harq = ulsch->harq_processes[harq_pid];
+  int Q_m = get_Qm_ul(ulsch_harq->mcs);
+  int G = ulsch_harq->nb_rb * (12 * Q_m) * ulsch_harq->Nsymb_pusch;
+  unsigned int E;
+
+  uint8_t (*tc)(int16_t *y,
+                uint8_t *,
+                uint16_t,
+                uint16_t,
+                uint16_t,
+                uint8_t,
+                uint8_t,
+                uint8_t,
+                time_stats_t *,
+                time_stats_t *,
+                time_stats_t *,
+                time_stats_t *,
+                time_stats_t *,
+                time_stats_t *,
+                time_stats_t *);
+
+  if (llr8_flag == 0)
+    tc = phy_threegpplte_turbo_decoder16;
+  else
+    tc = phy_threegpplte_turbo_decoder8;
+
+
+  for (r=0; r<ulsch_harq->C; r++) {
+
+    //    printf("before subblock deinterleaving c[%d] = %p\n",r,ulsch_harq->c[r]);
+    // Get Turbo interleaver parameters
+    if (r<ulsch_harq->Cminus)
+      Kr = ulsch_harq->Kminus;
+    else
+      Kr = ulsch_harq->Kplus;
+
+    Kr_bytes = Kr>>3;
+
+    if (Kr_bytes<=64)
+      iind = (Kr_bytes-5);
+    else if (Kr_bytes <=128)
+      iind = 59 + ((Kr_bytes-64)>>1);
+    else if (Kr_bytes <= 256)
+      iind = 91 + ((Kr_bytes-128)>>2);
+    else if (Kr_bytes <= 768)
+      iind = 123 + ((Kr_bytes-256)>>3);
+    else {
+      LOG_E(PHY,"ulsch_decoding: Illegal codeword size %d!!!\n",Kr_bytes);
+      return(-1);
+    }
+
+#ifdef DEBUG_ULSCH_DECODING
+    printf("f1 %d, f2 %d, F %d\n",f1f2mat_old[2*iind],f1f2mat_old[1+(2*iind)],(r==0) ? ulsch_harq->F : 0);
+#endif
+
+    memset(&dummy_w[r][0],0,3*(6144+64)*sizeof(short));
+    ulsch_harq->RTC[r] = generate_dummy_w(4+(Kr_bytes*8),
+                                          (uint8_t*)&dummy_w[r][0],
+                                          (r==0) ? ulsch_harq->F : 0);
+
+#ifdef DEBUG_ULSCH_DECODING
+    printf("Rate Matching Segment %d (coded bits (G) %d,unpunctured/repeated bits %d, Q_m %d, nb_rb %d, Nl %d)...\n",
+        r, G,
+        Kr*3,
+        Q_m,
+        nb_rb,
+        ulsch_harq->Nl);
+#endif
+
+    start_meas(&eNB->ulsch_rate_unmatching_stats);
 
-static void *td_thread(void *param) {
+    if (lte_rate_matching_turbo_rx(ulsch_harq->RTC[r],
+                                   G,
+                                   ulsch_harq->w[r],
+                                   (uint8_t*) &dummy_w[r][0],
+                                   ulsch_harq->e+r_offset,
+                                   ulsch_harq->C,
+                                   NSOFT,
+                                   0,   //Uplink
+                                   1,
+                                   ulsch_harq->rvidx,
+                                   (ulsch_harq->round==0)?1:0,  // clear
+                                   get_Qm_ul(ulsch_harq->mcs),
+                                   1,
+                                   r,
+                                   &E)==-1) {
+      LOG_E(PHY,"ulsch_decoding.c: Problem in rate matching\n");
+      return(-1);
+    }
 
-  PHY_VARS_eNB *eNB = (tc_param*)param->eNB;
-  eNB_proc_t *proc  = &eNB->proc;
+    stop_meas(&eNB->ulsch_rate_unmatching_stats);
+    r_offset += E;
 
-  while (!oai_exit) {
+    start_meas(&eNB->ulsch_deinterleaving_stats);
+    sub_block_deinterleaving_turbo(4+Kr,
+                                   &ulsch_harq->d[r][96],
+                                   ulsch_harq->w[r]);
+    stop_meas(&eNB->ulsch_deinterleaving_stats);
 
-    if (wait_on_condition(&proc->mutex_td,&proc->cond_td,&proc->instance_cnt_td,"td thread")<0) break;  
-    // TD here
+    if (ulsch_harq->C == 1)
+      crc_type = CRC24_A;
+    else
+      crc_type = CRC24_B;
+   
+    start_meas(&eNB->ulsch_turbo_decoding_stats);
+    
     ret = tc(&ulsch_harq->d[r][96],
 	     ulsch_harq->c[r],
 	     Kr,
@@ -249,20 +782,63 @@ static void *td_thread(void *param) {
     
     stop_meas(&eNB->ulsch_turbo_decoding_stats);
     
-    status[r] = ret;
-
-    if (release_thread(&proc->mutex_td,&proc->instance_cnt_td,"td thread")<0) break;
+  // Reassembly of Transport block here
 
-    if (pthread_cond_signal(&proc->cond_td) != 0) {
-      printf("[eNB] ERROR pthread_cond_signal for td thread exit\n");
-      exit_fun( "ERROR pthread_cond_signal" );
-      return;
+    if (ret != (1+ulsch->max_turbo_iterations)) {
+      if (r<ulsch_harq->Cminus)
+	Kr = ulsch_harq->Kminus;
+      else
+	Kr = ulsch_harq->Kplus;
+      
+      Kr_bytes = Kr>>3;
+      
+      if (r==0) {
+	memcpy(ulsch_harq->b,
+	       &ulsch_harq->c[0][(ulsch_harq->F>>3)],
+	       Kr_bytes - (ulsch_harq->F>>3) - ((ulsch_harq->C>1)?3:0));
+	offset = Kr_bytes - (ulsch_harq->F>>3) - ((ulsch_harq->C>1)?3:0);
+      } else {
+	memcpy(ulsch_harq->b+offset,
+	       ulsch_harq->c[r],
+	       Kr_bytes - ((ulsch_harq->C>1)?3:0));
+	offset += (Kr_bytes- ((ulsch_harq->C>1)?3:0));
+      }
+      
+    } else {
+      break;
     }
+    
   }
 
+  return(ret);
+}
+
+static inline unsigned int lte_gold_unscram(unsigned int *x1, unsigned int *x2, unsigned char reset) __attribute__((always_inline));
+static inline unsigned int lte_gold_unscram(unsigned int *x1, unsigned int *x2, unsigned char reset)
+{
+  int n;
+
+  if (reset) {
+    *x1 = 1+ (1<<31);
+    *x2=*x2 ^ ((*x2 ^ (*x2>>1) ^ (*x2>>2) ^ (*x2>>3))<<31);
+
+    // skip first 50 double words (1600 bits)
+    //      printf("n=0 : x1 %x, x2 %x\n",x1,x2);
+    for (n=1; n<50; n++) {
+      *x1 = (*x1>>1) ^ (*x1>>4);
+      *x1 = *x1 ^ (*x1<<31) ^ (*x1<<28);
+      *x2 = (*x2>>1) ^ (*x2>>2) ^ (*x2>>3) ^ (*x2>>4);
+      *x2 = *x2 ^ (*x2<<31) ^ (*x2<<30) ^ (*x2<<29) ^ (*x2<<28);
+    }
+  }
 
+  *x1 = (*x1>>1) ^ (*x1>>4);
+  *x1 = *x1 ^ (*x1<<31) ^ (*x1<<28);
+  *x2 = (*x2>>1) ^ (*x2>>2) ^ (*x2>>3) ^ (*x2>>4);
+  *x2 = *x2 ^ (*x2<<31) ^ (*x2<<30) ^ (*x2<<29) ^ (*x2<<28);
+  return(*x1^*x2);
+  //  printf("n=%d : c %x\n",n,x1^x2);
 
-  return(NULL);
 }
   
 unsigned int  ulsch_decoding(PHY_VARS_eNB *eNB,eNB_rxtx_proc_t *proc,
@@ -278,16 +854,15 @@ unsigned int  ulsch_decoding(PHY_VARS_eNB *eNB,eNB_rxtx_proc_t *proc,
   LTE_eNB_ULSCH_t *ulsch = eNB->ulsch[UE_id];
   uint8_t harq_pid;
   unsigned short nb_rb;
-  unsigned int A,E;
+  unsigned int A;
   uint8_t Q_m;
   unsigned int i,i2,q,j,j2;
   int iprime;
-  unsigned int ret=0,offset;
-  unsigned short iind;
+  unsigned int ret=0;
+
   //  uint8_t dummy_channel_output[(3*8*block_length)+12];
+  int r,Kr;
 
-  unsigned int r,r_offset=0,Kr,Kr_bytes;
-  uint8_t crc_type;
   uint8_t *columnset;
   unsigned int sumKr=0;
   unsigned int Qprime,L,G,Q_CQI,Q_RI,H,Hprime,Hpp,Cmux,Rmux_prime,O_RCC;
@@ -298,32 +873,17 @@ unsigned int  ulsch_decoding(PHY_VARS_eNB *eNB,eNB_rxtx_proc_t *proc,
   uint32_t x1, x2, s=0;
   int16_t ys,c;
   uint32_t wACK_idx;
-  int16_t dummy_w[MAX_NUM_ULSCH_SEGMENTS][3*(6144+64)];
   uint8_t dummy_w_cc[3*(MAX_CQI_BITS+8+32)];
   int16_t y[6*14*1200];
   uint8_t ytag[14*1200];
   //  uint8_t ytag2[6*14*1200],*ytag2_ptr;
   int16_t cseq[6*14*1200];
   int off;
-  int status[20];
+
   int subframe = proc->subframe_rx;
   LTE_UL_eNB_HARQ_t *ulsch_harq;
 
-  uint8_t (*tc)(int16_t *y,
-                uint8_t *,
-                uint16_t,
-                uint16_t,
-                uint16_t,
-                uint8_t,
-                uint8_t,
-                uint8_t,
-                time_stats_t *,
-                time_stats_t *,
-                time_stats_t *,
-                time_stats_t *,
-                time_stats_t *,
-                time_stats_t *,
-                time_stats_t *);
+
 
   harq_pid = subframe2harq_pid(frame_parms,proc->frame_rx,subframe);
 
@@ -344,10 +904,7 @@ unsigned int  ulsch_decoding(PHY_VARS_eNB *eNB,eNB_rxtx_proc_t *proc,
       VCD_SIGNAL_DUMPER_DUMP_FUNCTION_BY_NAME(VCD_SIGNAL_DUMPER_FUNCTIONS_PHY_ENB_ULSCH_DECODING0+harq_pid,0); 
       return 1+ulsch->max_turbo_iterations;
   }
-  if (llr8_flag == 0)
-    tc = phy_threegpplte_turbo_decoder16;
-  else
-    tc = phy_threegpplte_turbo_decoder8;
+
 
   nb_rb = ulsch_harq->nb_rb;
 
@@ -490,38 +1047,57 @@ unsigned int  ulsch_decoding(PHY_VARS_eNB *eNB,eNB_rxtx_proc_t *proc,
   H = G + Q_CQI;
   Hprime = H/Q_m;
 
+
   // Demultiplexing/Deinterleaving of PUSCH/ACK/RI/CQI
+  start_meas(&eNB->ulsch_demultiplexing_stats);
   Hpp = Hprime + Qprime_RI;
 
   Cmux       = ulsch_harq->Nsymb_pusch;
-  //  Rmux       = Hpp*Q_m/Cmux;
   Rmux_prime = Hpp/Cmux;
 
-
   // Clear "tag" interleaving matrix to allow for CQI/DATA identification
   memset(ytag,0,Cmux*Rmux_prime);
 
-  start_meas(&eNB->ulsch_demultiplexing_stats);
+
 
   i=0;
   memset(y,LTE_NULL,Q_m*Hpp);
 
-  //  printf("before unscrambling c[%d] = %p\n",0,ulsch_harq->c[0]);
   // read in buffer and unscramble llrs for everything but placeholder bits
   // llrs stored per symbol correspond to columns of interleaving matrix
 
 
-  s = lte_gold_generic(&x1, &x2, 1);
+  s = lte_gold_unscram(&x1, &x2, 1);
   i2=0;
 
   for (i=0; i<((Hpp*Q_m)>>5); i++) {
+    /*
     for (j=0; j<32; j++) {
       cseq[i2++] = (int16_t)((((s>>j)&1)<<1)-1);
     }
-
-    s = lte_gold_generic(&x1, &x2, 0);
+    */
+#if defined(__x86_64__) || defined(__i386__)
+#ifndef __AVX2__
+    ((__m128i*)cseq)[i2++] = ((__m128i*)unscrambling_lut)[(s&65535)<<1];
+    ((__m128i*)cseq)[i2++] = ((__m128i*)unscrambling_lut)[1+((s&65535)<<1)];
+    s>>=16;
+    ((__m128i*)cseq)[i2++] = ((__m128i*)unscrambling_lut)[(s&65535)<<1];
+    ((__m128i*)cseq)[i2++] = ((__m128i*)unscrambling_lut)[1+((s&65535)<<1)];
+#else
+    ((__m256i*)cseq)[i2++] = ((__m256i*)unscrambling_lut)[s&65535];
+    ((__m256i*)cseq)[i2++] = ((__m256i*)unscrambling_lut)[(s>>16)&65535];
+#endif
+#elif defined(__arm__)
+    ((int16x8_t*)cseq)[i2++] = ((int16x8_t*)unscrambling_lut)[(s&65535)<<1];
+    ((int16x8_t*)cseq)[i2++] = ((int16x8_t*)unscrambling_lut)[1+((s&65535)<<1)];
+    s>>=16;
+    ((int16x8_t*)cseq)[i2++] = ((int16x8_t*)unscrambling_lut)[(s&65535)<<1];
+    ((int16x8_t*)cseq)[i2++] = ((int16x8_t*)unscrambling_lut)[1+((s&65535)<<1)];
+#endif
+    s = lte_gold_unscram(&x1, &x2, 0);
   }
 
+
   //  printf("after unscrambling c[%d] = %p\n",0,ulsch_harq->c[0]);
 
   if (frame_parms->Ncp == 0)
@@ -559,11 +1135,6 @@ unsigned int  ulsch_decoding(PHY_VARS_eNB *eNB,eNB_rxtx_proc_t *proc,
 
   for (i=0; i<Qprime_ACK; i++) {
     r = Rmux_prime - 1 - (i>>2);
-    /*
-    for (q=0;q<Q_m;q++) {
-      ytag2[q+(Q_m*((r*Cmux) + columnset[j]))]  = q_ACK[(q+(Q_m*i))%len_ACK];
-    }
-    */
     off =((Rmux_prime*Q_m*columnset[j])+(r*Q_m));
 
     if (ulsch_harq->O_ACK == 1) {
@@ -612,6 +1183,7 @@ unsigned int  ulsch_decoding(PHY_VARS_eNB *eNB,eNB_rxtx_proc_t *proc,
       i2=j<<2;
 
       for (r=0; r<Rmux_prime; r++) {
+	/*
         c = cseq[i];
         y[i2++] = c*ulsch_llr[i++];
         c = cseq[i];
@@ -621,6 +1193,10 @@ unsigned int  ulsch_decoding(PHY_VARS_eNB *eNB,eNB_rxtx_proc_t *proc,
         c = cseq[i];
         y[i2] = c*ulsch_llr[i++];
         i2=(i2+(Cmux<<2)-3);
+	*/
+	*(__m64 *)&y[i2] = _mm_sign_pi16(*(__m64*)&ulsch_llr[i],*(__m64*)&cseq[i]);i+=4;i2+=(Cmux<<2);
+
+
       }
     }
 
@@ -651,7 +1227,7 @@ unsigned int  ulsch_decoding(PHY_VARS_eNB *eNB,eNB_rxtx_proc_t *proc,
   }
 
 
-  stop_meas(&eNB->ulsch_demultiplexing_stats);
+
 
   if (i!=(H+Q_RI))
     LOG_D(PHY,"ulsch_decoding.c: Error in input buffer length (j %d, H+Q_RI %d)\n",i,H+Q_RI);
@@ -864,34 +1440,23 @@ unsigned int  ulsch_decoding(PHY_VARS_eNB *eNB,eNB_rxtx_proc_t *proc,
       
       j2+=Q_m;
     }
-    //    printf("after CQI0 c[%d] = %p\n",0,ulsch_harq->c[0]);
-    switch (Q_m) {
-    case 2:
-      for (iprime=0; iprime<G;) {
-	ulsch_harq->e[iprime++] = y[j2++];
-	ulsch_harq->e[iprime++] = y[j2++];
-      }
-      break;
-    case 4:
-      for (iprime=0; iprime<G;) {
-	ulsch_harq->e[iprime++] = y[j2++];
-	ulsch_harq->e[iprime++] = y[j2++];
-	ulsch_harq->e[iprime++] = y[j2++];
-	ulsch_harq->e[iprime++] = y[j2++];
-      }
-      break;
-    case 6:
-      for (iprime=0; iprime<G;) {
-	ulsch_harq->e[iprime++] = y[j2++];
-	ulsch_harq->e[iprime++] = y[j2++];
-	ulsch_harq->e[iprime++] = y[j2++];
-	ulsch_harq->e[iprime++] = y[j2++];
-	ulsch_harq->e[iprime++] = y[j2++];
-	ulsch_harq->e[iprime++] = y[j2++];
-      }
-      break;
-    }
+
+#if defined(__x86_64__)||defined(__i386__)
+#ifndef __AVX2
+    for (iprime=0; iprime<G;iprime+=8,j2+=8)
+      *((__m128i *)&ulsch_harq->e[iprime]) = *((__m128i *)&y[j2]);
+#else
+    for (iprime=0; iprime<G;iprime+=16,j2+=16)
+      *((__m256i *)&ulsch_harq->e[iprime]) = *((__m256i *)&y[j2]);
+#endif
+#elif defined(__arm__)
+    for (iprime=0; iprime<G;iprime+=8,j2+=8)
+      *((int16x8_t *)&ulsch_harq->e[iprime]) = *((int16x8_t *)&y[j2]);
+#endif 
   }
+
+  stop_meas(&eNB->ulsch_demultiplexing_stats);
+
   //  printf("after ACKNAK2 c[%d] = %p (iprime %d, G %d)\n",0,ulsch_harq->c[0],iprime,G);
 
   // Do CQI/RI/HARQ-ACK Decoding first and pass to MAC
@@ -1024,156 +1589,10 @@ unsigned int  ulsch_decoding(PHY_VARS_eNB *eNB,eNB_rxtx_proc_t *proc,
 #endif
   }
 
-  //  return(0);
-  // Do PUSCH Decoding
-
-  //  stop_meas(&eNB->ulsch_demultiplexing_stats);
-
-
-  r_offset = 0;
-
-  for (r=0; r<ulsch_harq->C; r++) {
-
-    //    printf("before subblock deinterleaving c[%d] = %p\n",r,ulsch_harq->c[r]);
-    // Get Turbo interleaver parameters
-    if (r<ulsch_harq->Cminus)
-      Kr = ulsch_harq->Kminus;
-    else
-      Kr = ulsch_harq->Kplus;
-
-    Kr_bytes = Kr>>3;
-
-    if (Kr_bytes<=64)
-      iind = (Kr_bytes-5);
-    else if (Kr_bytes <=128)
-      iind = 59 + ((Kr_bytes-64)>>1);
-    else if (Kr_bytes <= 256)
-      iind = 91 + ((Kr_bytes-128)>>2);
-    else if (Kr_bytes <= 768)
-      iind = 123 + ((Kr_bytes-256)>>3);
-    else {
-      LOG_E(PHY,"ulsch_decoding: Illegal codeword size %d!!!\n",Kr_bytes);
-      return(-1);
-    }
-
-#ifdef DEBUG_ULSCH_DECODING
-    printf("f1 %d, f2 %d, F %d\n",f1f2mat_old[2*iind],f1f2mat_old[1+(2*iind)],(r==0) ? ulsch_harq->F : 0);
-#endif
-
-    memset(&dummy_w[r][0],0,3*(6144+64)*sizeof(short));
-    ulsch_harq->RTC[r] = generate_dummy_w(4+(Kr_bytes*8),
-                                          (uint8_t*)&dummy_w[r][0],
-                                          (r==0) ? ulsch_harq->F : 0);
-
-#ifdef DEBUG_ULSCH_DECODING
-    printf("Rate Matching Segment %d (coded bits (G) %d,unpunctured/repeated bits %d, Q_m %d, nb_rb %d, Nl %d)...\n",
-        r, G,
-        Kr*3,
-        Q_m,
-        nb_rb,
-        ulsch_harq->Nl);
-#endif
-
-    start_meas(&eNB->ulsch_rate_unmatching_stats);
-
-    if (lte_rate_matching_turbo_rx(ulsch_harq->RTC[r],
-                                   G,
-                                   ulsch_harq->w[r],
-                                   (uint8_t*) &dummy_w[r][0],
-                                   ulsch_harq->e+r_offset,
-                                   ulsch_harq->C,
-                                   NSOFT,
-                                   0,   //Uplink
-                                   1,
-                                   ulsch_harq->rvidx,
-                                   (ulsch_harq->round==0)?1:0,  // clear
-                                   get_Qm_ul(ulsch_harq->mcs),
-                                   1,
-                                   r,
-                                   &E)==-1) {
-      LOG_E(PHY,"ulsch_decoding.c: Problem in rate matching\n");
-      return(-1);
-    }
-
-    stop_meas(&eNB->ulsch_rate_unmatching_stats);
-    r_offset += E;
-
-    start_meas(&eNB->ulsch_deinterleaving_stats);
-    sub_block_deinterleaving_turbo(4+Kr,
-                                   &ulsch_harq->d[r][96],
-                                   ulsch_harq->w[r]);
-    stop_meas(&eNB->ulsch_deinterleaving_stats);
-  }
-
-  for (r=0; r<ulsch_harq->C; r+=2) {
-    
-    /*      printf("c[%d] : %p\n",r,
-	    ulsch_harq->c[r]);
-    */
-    
-    if (ulsch_harq->C == 1)
-      crc_type = CRC24_A;
-    else
-      crc_type = CRC24_B;
-    
-    start_meas(&eNB->ulsch_turbo_decoding_stats);
-    
-    ret = tc(&ulsch_harq->d[r][96],
-	     ulsch_harq->c[r],
-	     Kr,
-	     f1f2mat_old[iind*2],
-	     f1f2mat_old[(iind*2)+1],
-	     ulsch->max_turbo_iterations,//MAX_TURBO_ITERATIONS,
-	     crc_type,
-	     (r==0) ? ulsch_harq->F : 0,
-	     &eNB->ulsch_tc_init_stats,
-	     &eNB->ulsch_tc_alpha_stats,
-	     &eNB->ulsch_tc_beta_stats,
-	     &eNB->ulsch_tc_gamma_stats,
-	     &eNB->ulsch_tc_ext_stats,
-	     &eNB->ulsch_tc_intl1_stats,
-	     &eNB->ulsch_tc_intl2_stats);
-    
-    stop_meas(&eNB->ulsch_turbo_decoding_stats);
-    
-    status[r] = ret;
-    if (ret==(1+ulsch->max_turbo_iterations))
-	break;
-  }
-
-  // Reassembly of Transport block here
-  offset = 0;
-
-  ret = 1;
-
-  for (r=0; r<ulsch_harq->C; r++) {
-    if (status[r] != (1+ulsch->max_turbo_iterations)) {
-      if (r<ulsch_harq->Cminus)
-        Kr = ulsch_harq->Kminus;
-      else
-        Kr = ulsch_harq->Kplus;
-
-      Kr_bytes = Kr>>3;
-
-      if (r==0) {
-        memcpy(ulsch_harq->b,
-               &ulsch_harq->c[0][(ulsch_harq->F>>3)],
-               Kr_bytes - (ulsch_harq->F>>3) - ((ulsch_harq->C>1)?3:0));
-        offset = Kr_bytes - (ulsch_harq->F>>3) - ((ulsch_harq->C>1)?3:0);
-      } else {
-        memcpy(ulsch_harq->b+offset,
-               ulsch_harq->c[r],
-               Kr_bytes - ((ulsch_harq->C>1)?3:0));
-        offset += (Kr_bytes- ((ulsch_harq->C>1)?3:0));
-      }
 
-      if (ret != (1+ulsch->max_turbo_iterations))
-        ret = status[r];
-    } else {
-      ret = 1+ulsch->max_turbo_iterations;
-    }
+  // Do ULSCH Decoding for data portion
 
-  }
+  ret = eNB->td(eNB,UE_id,harq_pid,llr8_flag);
 
   VCD_SIGNAL_DUMPER_DUMP_FUNCTION_BY_NAME(VCD_SIGNAL_DUMPER_FUNCTIONS_PHY_ENB_ULSCH_DECODING0+harq_pid,0);
 
diff --git a/openair1/PHY/defs.h b/openair1/PHY/defs.h
index abf6038543c51289b7ba0b9d62e5899aa644d2bb..7f73f98bd7285662fb1872fa3c39c94dd3064c87 100755
--- a/openair1/PHY/defs.h
+++ b/openair1/PHY/defs.h
@@ -211,6 +211,21 @@ typedef struct {
   /// scheduling parameters for RXn-TXnp4 thread
   struct sched_param sched_param_rxtx;
 } eNB_rxtx_proc_t;
+
+typedef struct {
+  struct PHY_VARS_eNB_s *eNB;
+  int UE_id;
+  int harq_pid;
+  int llr8_flag;
+  int ret;
+} td_params;
+
+typedef struct {
+  struct PHY_VARS_eNB_s *eNB;
+  LTE_eNB_DLSCH_t *dlsch;
+  int G;
+} te_params;
+
 /// Context data structure for eNB subframe processing
 typedef struct eNB_proc_t_s {
   /// Component Carrier index
@@ -229,6 +244,10 @@ typedef struct eNB_proc_t_s {
   int frame_prach;
   /// \internal This variable is protected by \ref mutex_fep.
   int instance_cnt_fep;
+  /// \internal This variable is protected by \ref mutex_td.
+  int instance_cnt_td;
+  /// \internal This variable is protected by \ref mutex_te.
+  int instance_cnt_te;
   /// \brief Instance count for FH processing thread.
   /// \internal This variable is protected by \ref mutex_FH.
   int instance_cnt_FH;
@@ -249,6 +268,10 @@ typedef struct eNB_proc_t_s {
   int first_tx;
   /// pthread attributes for parallel fep thread
   pthread_attr_t attr_fep;
+  /// pthread attributes for parallel turbo-decoder thread
+  pthread_attr_t attr_td;
+  /// pthread attributes for parallel turbo-encoder thread
+  pthread_attr_t attr_te;
   /// pthread attributes for FH processing thread
   pthread_attr_t attr_FH;
   /// pthread attributes for single eNB processing thread
@@ -259,6 +282,10 @@ typedef struct eNB_proc_t_s {
   pthread_attr_t attr_asynch_rxtx;
   /// scheduling parameters for parallel fep thread
   struct sched_param sched_param_fep;
+  /// scheduling parameters for parallel turbo-decoder thread
+  struct sched_param sched_param_td;
+  /// scheduling parameters for parallel turbo-encoder thread
+  struct sched_param sched_param_te;
   /// scheduling parameters for FH thread
   struct sched_param sched_param_FH;
   /// scheduling parameters for single eNB thread
@@ -269,10 +296,18 @@ typedef struct eNB_proc_t_s {
   struct sched_param sched_param_asynch_rxtx;
   /// pthread structure for parallel fep thread
   pthread_t pthread_fep;
+  /// pthread structure for parallel turbo-decoder thread
+  pthread_t pthread_td;
+  /// pthread structure for parallel turbo-encoder thread
+  pthread_t pthread_te;
   /// pthread structure for PRACH thread
   pthread_t pthread_prach;
   /// condition variable for parallel fep thread
   pthread_cond_t cond_fep;
+  /// condition variable for parallel turbo-decoder thread
+  pthread_cond_t cond_td;
+  /// condition variable for parallel turbo-encoder thread
+  pthread_cond_t cond_te;
   /// condition variable for FH thread
   pthread_cond_t cond_FH;
   /// condition variable for PRACH processing thread;
@@ -281,12 +316,20 @@ typedef struct eNB_proc_t_s {
   pthread_cond_t cond_asynch_rxtx;
   /// mutex for parallel fep thread
   pthread_mutex_t mutex_fep;
+  /// mutex for parallel turbo-decoder thread
+  pthread_mutex_t mutex_td;
+  /// mutex for parallel turbo-encoder thread
+  pthread_mutex_t mutex_te;
   /// mutex for FH
   pthread_mutex_t mutex_FH;
   /// mutex for PRACH thread
   pthread_mutex_t mutex_prach;
   /// mutex for asynch RX/TX thread
   pthread_mutex_t mutex_asynch_rxtx;
+  /// parameters for turbo-decoding worker thread
+  td_params tdp;
+  /// parameters for turbo-encoding worker thread
+  te_params tep;
   /// set of scheduling variables RXn-TXnp4 threads
   eNB_rxtx_proc_t proc_rxtx[2];
   /// number of slave threads
@@ -367,6 +410,8 @@ typedef struct PHY_VARS_eNB_s {
   int                  abstraction_flag;
   void                 (*do_prach)(struct PHY_VARS_eNB_s *eNB);
   void                 (*fep)(struct PHY_VARS_eNB_s *eNB);
+  int                  (*td)(struct PHY_VARS_eNB_s *eNB,int UE_id,int harq_pid,int llr8_flag);
+  int                  (*te)(struct PHY_VARS_eNB_s *,uint8_t *,uint8_t,LTE_eNB_DLSCH_t *,int,uint8_t,time_stats_t *,time_stats_t *,time_stats_t *);
   void                 (*proc_uespec_rx)(struct PHY_VARS_eNB_s *eNB,eNB_rxtx_proc_t *proc,const relaying_type_t r_type);
   void                 (*proc_tx)(struct PHY_VARS_eNB_s *eNB,eNB_rxtx_proc_t *proc,relaying_type_t r_type,PHY_VARS_RN *rn);
   void                 (*tx_fh)(struct PHY_VARS_eNB_s *eNB,eNB_rxtx_proc_t *proc);
@@ -828,6 +873,69 @@ typedef struct {
 
 } PHY_VARS_UE;
 
+void exit_fun(const char* s);
+
+inline int wait_on_condition(pthread_mutex_t *mutex,pthread_cond_t *cond,int *instance_cnt,char *name) {
+
+  if (pthread_mutex_lock(mutex) != 0) {
+    LOG_E( PHY, "[SCHED][eNB] error locking mutex for %s\n",name);
+    exit_fun("nothing to add");
+    return(-1);
+  }
+  
+  while (*instance_cnt < 0) {
+    // most of the time the thread is waiting here
+    // proc->instance_cnt_rxtx is -1
+    pthread_cond_wait(cond,mutex); // this unlocks mutex_rxtx while waiting and then locks it again
+  }
+
+  if (pthread_mutex_unlock(mutex) != 0) {
+    LOG_E(PHY,"[SCHED][eNB] error unlocking mutex for %s\n",name);
+    exit_fun("nothing to add");
+    return(-1);
+  }
+  return(0);
+}
+
+inline int wait_on_busy_condition(pthread_mutex_t *mutex,pthread_cond_t *cond,int *instance_cnt,char *name) {
+
+  if (pthread_mutex_lock(mutex) != 0) {
+    LOG_E( PHY, "[SCHED][eNB] error locking mutex for %s\n",name);
+    exit_fun("nothing to add");
+    return(-1);
+  }
+  
+  while (*instance_cnt == 0) {
+    // most of the time the thread will skip this
+    // waits only if proc->instance_cnt_rxtx is 0
+    pthread_cond_wait(cond,mutex); // this unlocks mutex_rxtx while waiting and then locks it again
+  }
+
+  if (pthread_mutex_unlock(mutex) != 0) {
+    LOG_E(PHY,"[SCHED][eNB] error unlocking mutex for %s\n",name);
+    exit_fun("nothing to add");
+    return(-1);
+  }
+  return(0);
+}
+
+inline int release_thread(pthread_mutex_t *mutex,int *instance_cnt,char *name) {
+
+  if (pthread_mutex_lock(mutex) != 0) {
+    LOG_E( PHY, "[SCHED][eNB] error locking mutex for %s\n",name);
+    exit_fun("nothing to add");
+    return(-1);
+  }
+  
+  *instance_cnt=*instance_cnt-1;
+  
+  if (pthread_mutex_unlock(mutex) != 0) {
+    LOG_E( PHY, "[SCHED][eNB] error unlocking mutex for %s\n",name);
+    exit_fun("nothing to add");
+    return(-1);
+  }
+  return(0);
+}
 
 
 #include "PHY/INIT/defs.h"
diff --git a/openair1/PHY/extern.h b/openair1/PHY/extern.h
index 8a5bf5e8f5a5bbb9bcc07b7c9d8caeda599b8521..867f980e5f5988bc4331f3bc5ef715d03f0b0d9e 100755
--- a/openair1/PHY/extern.h
+++ b/openair1/PHY/extern.h
@@ -115,5 +115,8 @@ extern char eNB_functions[6][20];
 extern char eNB_timing[2][20];
 
 
+extern int16_t unscrambling_lut[65536*16];
+extern uint8_t scrambling_lut[65536*16];
+
 #endif /*__PHY_EXTERN_H__ */
 
diff --git a/openair1/PHY/impl_defs_lte.h b/openair1/PHY/impl_defs_lte.h
index 7c38048a55ef71e6a014d7fd446c203b90e278ff..bed2057800275dc2797ea44b391fecb43497569a 100644
--- a/openair1/PHY/impl_defs_lte.h
+++ b/openair1/PHY/impl_defs_lte.h
@@ -607,34 +607,34 @@ typedef struct {
   int32_t **txdataF[3];
   /// \brief Holds the received data in time domain.
   /// Should point to the same memory as PHY_vars->rx_vars[a].RX_DMA_BUFFER.
-  /// - first index: eNB id [0..2] (hard coded)
+  /// - first index: sector id [0..2] (hard coded)
   /// - second index: rx antenna [0..nb_antennas_rx[
   /// - third index: sample [0..]
   int32_t **rxdata[3];
   /// \brief Holds the last subframe of received data in time domain after removal of 7.5kHz frequency offset.
-  /// - first index: eNB id [0..2] (hard coded)
+  /// - first index: secotr id [0..2] (hard coded)
   /// - second index: rx antenna [0..nb_antennas_rx[
   /// - third index: sample [0..samples_per_tti[
   int32_t **rxdata_7_5kHz[3];
   /// \brief Holds the received data in the frequency domain.
-  /// - first index: eNB id [0..2] (hard coded)
+  /// - first index: sector id [0..2] (hard coded)
   /// - second index: rx antenna [0..nb_antennas_rx[
   /// - third index: ? [0..2*ofdm_symbol_size*frame_parms->symbols_per_tti[
   int32_t **rxdataF[3];
   /// \brief Holds output of the sync correlator.
-  /// - first index: eNB id [0..2] (hard coded)
+  /// - first index: sector id [0..2] (hard coded)
   /// - second index: sample [0..samples_per_tti*10[
   uint32_t *sync_corr[3];
 } LTE_eNB_COMMON;
 
 typedef struct {
   /// \brief Hold the channel estimates in frequency domain based on SRS.
-  /// - first index: eNB id [0..2] (hard coded)
+  /// - first index: sector id [0..2] (hard coded)
   /// - second index: rx antenna id [0..nb_antennas_rx[
   /// - third index: ? [0..ofdm_symbol_size[
   int32_t **srs_ch_estimates[3];
   /// \brief Hold the channel estimates in time domain based on SRS.
-  /// - first index: eNB id [0..2] (hard coded)
+  /// - first index: sector id [0..2] (hard coded)
   /// - second index: rx antenna id [0..nb_antennas_rx[
   /// - third index: ? [0..2*ofdm_symbol_size[
   int32_t **srs_ch_estimates_time[3];
@@ -645,54 +645,54 @@ typedef struct {
 
 typedef struct {
   /// \brief Holds the received data in the frequency domain for the allocated RBs in repeated format.
-  /// - first index: eNB id [0..2] (hard coded)
+  /// - first index: sector id [0..2] (hard coded)
   /// - second index: rx antenna id [0..nb_antennas_rx[
   /// - third index: ? [0..2*ofdm_symbol_size[
   /// - third index (definition from phy_init_lte_eNB()): ? [0..24*N_RB_UL*frame_parms->symbols_per_tti[
   /// \warning inconsistent third index definition
   int32_t **rxdataF_ext[3];
   /// \brief Holds the received data in the frequency domain for the allocated RBs in normal format.
-  /// - first index: eNB id [0..2] (hard coded)
+  /// - first index: sector id [0..2] (hard coded)
   /// - second index: rx antenna id [0..nb_antennas_rx[
   /// - third index (definition from phy_init_lte_eNB()): ? [0..12*N_RB_UL*frame_parms->symbols_per_tti[
   int32_t **rxdataF_ext2[3];
   /// \brief Hold the channel estimates in time domain based on DRS.
-  /// - first index: eNB id [0..2] (hard coded)
+  /// - first index: sector id [0..2] (hard coded)
   /// - second index: rx antenna id [0..nb_antennas_rx[
   /// - third index: ? [0..4*ofdm_symbol_size[
   int32_t **drs_ch_estimates_time[3];
   /// \brief Hold the channel estimates in frequency domain based on DRS.
-  /// - first index: eNB id [0..2] (hard coded)
+  /// - first index: sector id [0..2] (hard coded)
   /// - second index: rx antenna id [0..nb_antennas_rx[
   /// - third index: ? [0..12*N_RB_UL*frame_parms->symbols_per_tti[
   int32_t **drs_ch_estimates[3];
   /// \brief Hold the channel estimates for UE0 in case of Distributed Alamouti Scheme.
-  /// - first index: eNB id [0..2] (hard coded)
+  /// - first index: sector id [0..2] (hard coded)
   /// - second index: rx antenna id [0..nb_antennas_rx[
   /// - third index: ? [0..12*N_RB_UL*frame_parms->symbols_per_tti[
   int32_t **drs_ch_estimates_0[3];
   /// \brief Hold the channel estimates for UE1 in case of Distributed Almouti Scheme.
-  /// - first index: eNB id [0..2] (hard coded)
+  /// - first index: sector id [0..2] (hard coded)
   /// - second index: rx antenna id [0..nb_antennas_rx[
   /// - third index: ? [0..12*N_RB_UL*frame_parms->symbols_per_tti[
   int32_t **drs_ch_estimates_1[3];
   /// \brief Holds the compensated signal.
-  /// - first index: eNB id [0..2] (hard coded)
+  /// - first index: sector id [0..2] (hard coded)
   /// - second index: rx antenna id [0..nb_antennas_rx[
   /// - third index: ? [0..12*N_RB_UL*frame_parms->symbols_per_tti[
   int32_t **rxdataF_comp[3];
   /// \brief Hold the compensated data (y)*(h0*) in case of Distributed Alamouti Scheme.
-  /// - first index: eNB id [0..2] (hard coded)
+  /// - first index: sector id [0..2] (hard coded)
   /// - second index: rx antenna id [0..nb_antennas_rx[
   /// - third index: ? [0..12*N_RB_UL*frame_parms->symbols_per_tti[
   int32_t **rxdataF_comp_0[3];
   /// \brief Hold the compensated data (y*)*(h1) in case of Distributed Alamouti Scheme.
-  /// - first index: eNB id [0..2] (hard coded)
+  /// - first index: sector id [0..2] (hard coded)
   /// - second index: rx antenna id [0..nb_antennas_rx[
   /// - third index: ? [0..12*N_RB_UL*frame_parms->symbols_per_tti[
   int32_t **rxdataF_comp_1[3];
   /// \brief ?.
-  /// - first index: eNB id [0..2] (hard coded)
+  /// - first index: sector id [0..2] (hard coded)
   /// - second index: rx antenna id [0..nb_antennas_rx[
   /// - third index: ? [0..12*N_RB_UL*frame_parms->symbols_per_tti[
   int32_t **ul_ch_mag[3];
diff --git a/openair1/PHY/vars.h b/openair1/PHY/vars.h
index b8cb33239d33c08b2d6a20bb0e0bf089983c7ff9..e773229a28dc06a5dc289d1e739d5a42df141923 100755
--- a/openair1/PHY/vars.h
+++ b/openair1/PHY/vars.h
@@ -142,6 +142,10 @@ double beta2_dlsch[6][MCS_COUNT] = { {2.52163, 0.83231, 0.77472, 1.36536, 1.1682
 char eNB_functions[6][20]={"eNodeB_3GPP","eNodeB_3GPP_BBU","NGFI_RCC_IF4p5","NGFI_RAI_IF4p5","NGFI_RRU_IF5","NGFI_RRU_IF4p5",};
 char eNB_timing[2][20]={"synch_to_ext_device","synch_to_other"};
 
+/// lookup table for unscrambling in RX
+int16_t unscrambling_lut[65536*16] __attribute__((aligned(32)));
+/// lookup table for scrambling in TX
+uint8_t scrambling_lut[65536*16] __attribute__((aligned(32)));
 
 
 #endif /*__PHY_VARS_H__ */
diff --git a/openair1/SCHED/defs.h b/openair1/SCHED/defs.h
index a9f8633e30407292c6b83d78775d20aa6f7e0783..df8652e3dc8b20ed0d01eafedaa665c19bfc158a 100644
--- a/openair1/SCHED/defs.h
+++ b/openair1/SCHED/defs.h
@@ -163,8 +163,9 @@ void phy_procedures_UE_S_RX(PHY_VARS_UE *phy_vars_ue,uint8_t eNB_id,uint8_t abst
   @param abstraction_flag Indicator of PHY abstraction
   @param r_type indicates the relaying operation: 0: no_relaying, 1: unicast relaying type 1, 2: unicast relaying type 2, 3: multicast relaying
   @param phy_vars_rn pointer to the RN variables
+  @param do_meas Do inline timing measurement
 */
-void phy_procedures_eNB_TX(PHY_VARS_eNB *phy_vars_eNB,eNB_rxtx_proc_t *proc,relaying_type_t r_type,PHY_VARS_RN *phy_vars_rn);
+void phy_procedures_eNB_TX(PHY_VARS_eNB *phy_vars_eNB,eNB_rxtx_proc_t *proc,relaying_type_t r_type,PHY_VARS_RN *phy_vars_rn,int do_meas);
 
 /*! \brief Scheduling for eNB RX UE-specific procedures in normal subframes.
   @param phy_vars_eNB Pointer to eNB variables on which to act
diff --git a/openair1/SCHED/phy_procedures_lte_eNb.c b/openair1/SCHED/phy_procedures_lte_eNb.c
index cbea52db810960076c8e69cae9d44697d7d797f2..70b0ce1924ae519b4b4867f81b05e9fb98fa7ff5 100755
--- a/openair1/SCHED/phy_procedures_lte_eNb.c
+++ b/openair1/SCHED/phy_procedures_lte_eNb.c
@@ -1030,14 +1030,14 @@ void pdsch_procedures(PHY_VARS_eNB *eNB,eNB_rxtx_proc_t *proc,LTE_eNB_DLSCH_t *d
     LOG_D(PHY,"Generating DLSCH/PDSCH %d\n",ra_flag);
     // 36-212
     start_meas(&eNB->dlsch_encoding_stats);
-    dlsch_encoding(DLSCH_pdu,
-		   fp,
-		   num_pdcch_symbols,
-		   dlsch,
-		   frame,subframe,
-		   &eNB->dlsch_rate_matching_stats,
-		   &eNB->dlsch_turbo_encoding_stats,
-		   &eNB->dlsch_interleaving_stats);
+    eNB->te(eNB,
+	    DLSCH_pdu,
+	    num_pdcch_symbols,
+	    dlsch,
+	    frame,subframe,
+	    &eNB->dlsch_rate_matching_stats,
+	    &eNB->dlsch_turbo_encoding_stats,
+	    &eNB->dlsch_interleaving_stats);
     stop_meas(&eNB->dlsch_encoding_stats);
     // 36-211
     start_meas(&eNB->dlsch_scrambling_stats);
@@ -1053,6 +1053,7 @@ void pdsch_procedures(PHY_VARS_eNB *eNB,eNB_rxtx_proc_t *proc,LTE_eNB_DLSCH_t *d
 		     0,
 		     subframe<<1);
     stop_meas(&eNB->dlsch_scrambling_stats);
+
     start_meas(&eNB->dlsch_modulation_stats);
 
 
@@ -1084,7 +1085,8 @@ void pdsch_procedures(PHY_VARS_eNB *eNB,eNB_rxtx_proc_t *proc,LTE_eNB_DLSCH_t *d
 void phy_procedures_eNB_TX(PHY_VARS_eNB *eNB,
 			   eNB_rxtx_proc_t *proc,
                            relaying_type_t r_type,
-			   PHY_VARS_RN *rn)
+			   PHY_VARS_RN *rn,
+			   int do_meas)
 {
   UNUSED(rn);
   int frame=proc->frame_tx;
@@ -1113,7 +1115,7 @@ void phy_procedures_eNB_TX(PHY_VARS_eNB *eNB,
   if ((fp->frame_type == TDD) && (subframe_select(fp,subframe)!=SF_DL)) return;
 
   VCD_SIGNAL_DUMPER_DUMP_FUNCTION_BY_NAME(VCD_SIGNAL_DUMPER_FUNCTIONS_PHY_PROCEDURES_ENB_TX+offset,1);
-  start_meas(&eNB->phy_proc_tx);
+  if (do_meas==1) start_meas(&eNB->phy_proc_tx);
 
   T(T_ENB_PHY_DL_TICK, T_INT(eNB->Mod_id), T_INT(frame), T_INT(subframe));
 
@@ -1399,7 +1401,7 @@ void phy_procedures_eNB_TX(PHY_VARS_eNB *eNB,
 #endif
 
   VCD_SIGNAL_DUMPER_DUMP_FUNCTION_BY_NAME(VCD_SIGNAL_DUMPER_FUNCTIONS_PHY_PROCEDURES_ENB_TX+offset,0);
-  stop_meas(&eNB->phy_proc_tx);
+  if (do_meas==1) stop_meas(&eNB->phy_proc_tx);
   
 }
 
@@ -2522,73 +2524,10 @@ void fep0(PHY_VARS_eNB *eNB,int slot) {
   }
 }
 
-static inline int release_thread(pthread_mutex_t *mutex,int *instance_cnt,char *name) {
-
-  if (pthread_mutex_lock(mutex) != 0) {
-    LOG_E( PHY, "[SCHED][eNB] error locking mutex for %s\n",name);
-    exit_fun("nothing to add");
-    return(-1);
-  }
-  
-  *instance_cnt=*instance_cnt-1;
-  
-  if (pthread_mutex_unlock(mutex) != 0) {
-    LOG_E( PHY, "[SCHED][eNB] error unlocking mutex for %s\n",name);
-    exit_fun("nothing to add");
-    return(-1);
-  }
-  return(0);
-}
-
-static inline int wait_on_condition(pthread_mutex_t *mutex,pthread_cond_t *cond,int *instance_cnt,char *name) {
-
-  if (pthread_mutex_lock(mutex) != 0) {
-    LOG_E( PHY, "[SCHED][eNB] error locking mutex for %s\n",name);
-    exit_fun("nothing to add");
-    return(-1);
-  }
-  
-  while (*instance_cnt < 0) {
-    // most of the time the thread is waiting here
-    // proc->instance_cnt_rxtx is -1
-    pthread_cond_wait(cond,mutex); // this unlocks mutex_rxtx while waiting and then locks it again
-  }
-
-  if (pthread_mutex_unlock(mutex) != 0) {
-    LOG_E(PHY,"[SCHED][eNB] error unlocking mutex for %s\n",name);
-    exit_fun("nothing to add");
-    return(-1);
-  }
-  return(0);
-}
-
-static inline int wait_on_busy_condition(pthread_mutex_t *mutex,pthread_cond_t *cond,int *instance_cnt,char *name) {
-
-  if (pthread_mutex_lock(mutex) != 0) {
-    LOG_E( PHY, "[SCHED][eNB] error locking mutex for %s\n",name);
-    exit_fun("nothing to add");
-    return(-1);
-  }
-  
-  while (*instance_cnt == 0) {
-    // most of the time the thread will skip this
-    // waits only if proc->instance_cnt_rxtx is 0
-    pthread_cond_wait(cond,mutex); // this unlocks mutex_rxtx while waiting and then locks it again
-  }
 
-  if (pthread_mutex_unlock(mutex) != 0) {
-    LOG_E(PHY,"[SCHED][eNB] error unlocking mutex for %s\n",name);
-    exit_fun("nothing to add");
-    return(-1);
-  }
-  return(0);
-}
 
 extern int oai_exit;
 
-#define THREAD_FULL 1
-
-#ifdef THREAD_FULL
 static void *fep_thread(void *param) {
 
   PHY_VARS_eNB *eNB = (PHY_VARS_eNB *)param;
@@ -2611,31 +2550,50 @@ static void *fep_thread(void *param) {
   return(NULL);
 }
 
-#else
+void init_fep_thread(PHY_VARS_eNB *eNB,pthread_attr_t *attr_fep) {
 
-static void *fep_thread(void *param) {
+  eNB_proc_t *proc = &eNB->proc;
 
-  PHY_VARS_eNB *eNB = (PHY_VARS_eNB *)param;
-  eNB_proc_t *proc  = &eNB->proc;
+  proc->instance_cnt_fep         = -1;
+    
+  pthread_mutex_init( &proc->mutex_fep, NULL);
+  pthread_cond_init( &proc->cond_fep, NULL);
+
+  pthread_create(&proc->pthread_fep, attr_fep, fep_thread, (void*)eNB);
 
-  fep0(eNB,0);
 
-  return(NULL);
 }
 
-#endif
-void init_fep_thread(PHY_VARS_eNB *eNB,pthread_attr_t *attr_fep) {
+extern void *td_thread(void*);
+
+void init_td_thread(PHY_VARS_eNB *eNB,pthread_attr_t *attr_td) {
 
   eNB_proc_t *proc = &eNB->proc;
 
-  proc->instance_cnt_fep         = -1;
+  proc->tdp.eNB = eNB;
+  proc->instance_cnt_td         = -1;
     
-  pthread_mutex_init( &proc->mutex_fep, NULL);
-  pthread_cond_init( &proc->cond_fep, NULL);
+  pthread_mutex_init( &proc->mutex_td, NULL);
+  pthread_cond_init( &proc->cond_td, NULL);
 
-#ifdef THREAD_FULL
-  pthread_create(&proc->pthread_fep, attr_fep, fep_thread, (void*)eNB);
-#endif
+  pthread_create(&proc->pthread_td, attr_td, td_thread, (void*)&proc->tdp);
+
+}
+
+extern void *te_thread(void*);
+
+void init_te_thread(PHY_VARS_eNB *eNB,pthread_attr_t *attr_te) {
+
+  eNB_proc_t *proc = &eNB->proc;
+
+  proc->tep.eNB = eNB;
+  proc->instance_cnt_te         = -1;
+    
+  pthread_mutex_init( &proc->mutex_te, NULL);
+  pthread_cond_init( &proc->cond_te, NULL);
+
+  printf("Creating te_thread\n");
+  pthread_create(&proc->pthread_te, attr_te, te_thread, (void*)&proc->tep);
 
 }
 
@@ -2645,15 +2603,13 @@ void eNB_fep_full_2thread(PHY_VARS_eNB *eNB) {
   eNB_proc_t *proc = &eNB->proc;
 
   struct timespec wait;
-  int wait_cnt=0;
+
   wait.tv_sec=0;
   wait.tv_nsec=5000000L;
 
   VCD_SIGNAL_DUMPER_DUMP_FUNCTION_BY_NAME(VCD_SIGNAL_DUMPER_FUNCTIONS_ENB_SLOT_FEP,1);
   start_meas(&eNB->ofdm_demod_stats);
 
-#ifdef THREAD_FULL
-
   if (pthread_mutex_timedlock(&proc->mutex_fep,&wait) != 0) {
     printf("[eNB] ERROR pthread_mutex_lock for fep thread %d (IC %d)\n", proc->instance_cnt_fep);
     exit_fun( "error locking mutex_fep" );
@@ -2683,14 +2639,6 @@ void eNB_fep_full_2thread(PHY_VARS_eNB *eNB) {
 
   wait_on_busy_condition(&proc->mutex_fep,&proc->cond_fep,&proc->instance_cnt_fep,"fep thread");  
 
-#else
-
-  pthread_create(&proc->pthread_fep, NULL, fep_thread, (void*)eNB);
-  // call second slot in this symbol
-  fep0(eNB,1);
-  pthread_join(proc->pthread_fep,(void**)NULL);
-
-#endif
   stop_meas(&eNB->ofdm_demod_stats);
 }
 
diff --git a/openair1/SIMULATION/LTE_PHY/dlsim.c b/openair1/SIMULATION/LTE_PHY/dlsim.c
index cfd3da43ca4057e6d7fbd8e77d064278db4e2304..2adbb777d0b5e93f70c88c682287ca4bbc99e64e 100644
--- a/openair1/SIMULATION/LTE_PHY/dlsim.c
+++ b/openair1/SIMULATION/LTE_PHY/dlsim.c
@@ -1381,6 +1381,7 @@ int main(int argc, char **argv)
   char csv_fname[32];
   int dci_flag=1;
   int llr8_flag=1;
+  int two_thread_flag=0;
   int DLSCH_RB_ALLOC;
 
 #if defined(__arm__)
@@ -1412,7 +1413,7 @@ int main(int argc, char **argv)
   //  num_layers = 1;
   perfect_ce = 0;
 
-  while ((c = getopt (argc, argv, "ahdpZDe:Em:n:o:s:f:t:c:g:r:F:x:y:z:AM:N:I:i:O:R:S:C:T:b:u:v:w:B:PLl:XY")) != -1) {
+  while ((c = getopt (argc, argv, "ahdpZDe:Em:n:o:s:f:t:c:g:r:F:x:y:z:AM:N:I:i:O:R:S:C:T:b:u:v:w:B:PLl:WXY")) != -1) {
     switch (c) {
     case 'a':
       awgn_flag = 1;
@@ -1479,7 +1480,10 @@ int main(int argc, char **argv)
     case 'L':
       llr8_flag=1;
       break;
-
+      
+    case 'W':
+      two_thread_flag = 1;
+      break;
     case 'l':
       offset_mumimo_llr_drange_fix=atoi(optarg);
       break;
@@ -1779,6 +1783,15 @@ int main(int argc, char **argv)
 		 perfect_ce);
 
   eNB->mac_enabled=1;
+  if (two_thread_flag == 0) {
+    eNB->te = dlsch_encoding;
+  }
+  else {
+    eNB->te = dlsch_encoding_2threads;
+    init_td_thread(eNB,NULL);
+    init_te_thread(eNB,NULL);
+  }
+
   // callback functions required for phy_procedures_tx 
   mac_xface->get_dci_sdu = get_dci_sdu;
   mac_xface->get_dlsch_sdu = get_dlsch_sdu;
@@ -2272,7 +2285,6 @@ int main(int argc, char **argv)
 
           if (input_fd==NULL) {
 
-            start_meas(&eNB->phy_proc_tx);
 
             // Simulate HARQ procedures!!!
 	    memset(CCE_table,0,800*sizeof(int));
@@ -2345,7 +2357,7 @@ int main(int argc, char **argv)
 	    proc_eNB->subframe_tx = subframe;
 	    eNB->abstraction_flag=0;
  
-	    phy_procedures_eNB_TX(eNB,proc_eNB,no_relay,NULL);
+	    phy_procedures_eNB_TX(eNB,proc_eNB,no_relay,NULL,1);
 	    
 	    
 	    start_meas(&eNB->ofdm_mod_stats);
@@ -2366,7 +2378,7 @@ int main(int argc, char **argv)
 	    
 	    proc_eNB->subframe_tx = subframe+1;
 	    
-	    phy_procedures_eNB_TX(eNB,proc_eNB,no_relay,NULL);
+	    phy_procedures_eNB_TX(eNB,proc_eNB,no_relay,NULL,0);
 	    
 	    do_OFDM_mod_l(eNB->common_vars.txdataF[eNB_id],
 			  eNB->common_vars.txdata[eNB_id],
diff --git a/openair1/SIMULATION/LTE_PHY/ulsim.c b/openair1/SIMULATION/LTE_PHY/ulsim.c
index 2c2b03ce786cfc9233718894370f9da44dcb1d47..744e7b483dfe074dcdebdc33cb178de2f1baa9f9 100644
--- a/openair1/SIMULATION/LTE_PHY/ulsim.c
+++ b/openair1/SIMULATION/LTE_PHY/ulsim.c
@@ -676,8 +676,10 @@ int main(int argc, char **argv)
   eNB->ulsch[0] = new_eNB_ulsch(max_turbo_iterations,N_RB_DL,0);
   UE->ulsch[0]   = new_ue_ulsch(N_RB_DL,0);
 
-  if (parallel_flag == 1) init_fep_thread(eNB,&eNB->proc.attr_fep);
-
+  if (parallel_flag == 1) {
+    init_fep_thread(eNB,NULL);
+    init_td_thread(eNB,NULL);
+  }
   // Create transport channel structures for 2 transport blocks (MIMO)
   for (i=0; i<2; i++) {
     eNB->dlsch[0][i] = new_eNB_dlsch(1,8,1827072,N_RB_DL,0);
@@ -1171,7 +1173,8 @@ int main(int argc, char **argv)
           }
 
 
-	  eNB->fep = (parallel_flag == 1) ? eNB_fep_full_2thread : eNB_fep_full;
+	  eNB->fep = (parallel_flag == 1) ? eNB_fep_full_2thread        : eNB_fep_full;
+	  eNB->td  = (parallel_flag == 1) ? ulsch_decoding_data_2thread : ulsch_decoding_data;
 	  eNB->do_prach = NULL;
 
 	  phy_procedures_eNB_common_RX(eNB);
diff --git a/targets/RT/USER/lte-enb.c b/targets/RT/USER/lte-enb.c
index 82d1e883945db03c467c269e2f4efad2312d1521..c095d467292435580e458f4352d75e20b32b794f 100644
--- a/targets/RT/USER/lte-enb.c
+++ b/targets/RT/USER/lte-enb.c
@@ -279,51 +279,6 @@ static inline void wait_sync(char *thread_name) {
 
 }
 
-static inline int wait_on_condition(pthread_mutex_t *mutex,pthread_cond_t *cond,int *instance_cnt,char *name) {
-
-  struct timespec wait;
-  
-  wait.tv_sec=0;
-  wait.tv_nsec=5000000L;
-
-  if (pthread_mutex_timedlock(mutex,&wait) != 0) {
-    LOG_E( PHY, "[SCHED][eNB] error locking mutex for %s\n",name);
-    exit_fun("nothing to add");
-    return(-1);
-  }
-  
-  while (*instance_cnt < 0) {
-    // most of the time the thread is waiting here
-    // proc->instance_cnt_rxtx is -1
-    pthread_cond_wait(cond,mutex); // this unlocks mutex_rxtx while waiting and then locks it again
-  }
-
-  if (pthread_mutex_unlock(mutex) != 0) {
-    LOG_E(PHY,"[SCHED][eNB] error unlocking mutex for %s\n",name);
-    exit_fun("nothing to add");
-    return(-1);
-  }
-  return(0);
-}
-
-static inline int release_thread(pthread_mutex_t *mutex,int *instance_cnt,char *name) {
-
-  if (pthread_mutex_lock(mutex) != 0) {
-    LOG_E( PHY, "[SCHED][eNB] error locking mutex for %s\n",name);
-    exit_fun("nothing to add");
-    return(-1);
-  }
-  
-  *instance_cnt=*instance_cnt-1;
-  
-  if (pthread_mutex_unlock(mutex) != 0) {
-    LOG_E( PHY, "[SCHED][eNB] error unlocking mutex for %s\n",name);
-    exit_fun("nothing to add");
-    return(-1);
-  }
-  return(0);
-}
-
 void do_OFDM_mod_rt(int subframe,PHY_VARS_eNB *phy_vars_eNB) {
      
   unsigned int aa,slot_offset, slot_offset_F;
@@ -470,7 +425,7 @@ void proc_tx_high0(PHY_VARS_eNB *eNB,
   VCD_SIGNAL_DUMPER_DUMP_VARIABLE_BY_NAME( VCD_SIGNAL_DUMPER_VARIABLES_FRAME_NUMBER_TX0_ENB+offset, proc->frame_tx );
   VCD_SIGNAL_DUMPER_DUMP_VARIABLE_BY_NAME( VCD_SIGNAL_DUMPER_VARIABLES_SUBFRAME_NUMBER_TX0_ENB+offset, proc->subframe_tx );
 
-  phy_procedures_eNB_TX(eNB,proc,r_type,rn);
+  phy_procedures_eNB_TX(eNB,proc,r_type,rn,1);
 
   /* we're done, let the next one proceed */
   if (pthread_mutex_lock(&sync_phy_proc.mutex_phy_proc_tx) != 0) {
@@ -1344,7 +1299,7 @@ void init_eNB_proc(int inst) {
   PHY_VARS_eNB *eNB;
   eNB_proc_t *proc;
   eNB_rxtx_proc_t *proc_rxtx;
-  pthread_attr_t *attr0=NULL,*attr1=NULL,*attr_FH=NULL,*attr_prach=NULL,*attr_asynch=NULL,*attr_single=NULL,*attr_fep=NULL;
+  pthread_attr_t *attr0=NULL,*attr1=NULL,*attr_FH=NULL,*attr_prach=NULL,*attr_asynch=NULL,*attr_single=NULL,*attr_fep=NULL,*attr_td=NULL;
 
   for (CC_id=0; CC_id<MAX_NUM_CCs; CC_id++) {
     eNB = PHY_vars_eNB_g[inst][CC_id];
@@ -1379,6 +1334,8 @@ void init_eNB_proc(int inst) {
     pthread_attr_init( &proc->attr_asynch_rxtx);
     pthread_attr_init( &proc->attr_single);
     pthread_attr_init( &proc->attr_fep);
+    pthread_attr_init( &proc->attr_td);
+    pthread_attr_init( &proc->attr_te);
     pthread_attr_init( &proc_rxtx[0].attr_rxtx);
     pthread_attr_init( &proc_rxtx[1].attr_rxtx);
 #ifndef DEADLINE_SCHEDULER
@@ -1389,6 +1346,8 @@ void init_eNB_proc(int inst) {
     attr_asynch = &proc->attr_asynch_rxtx;
     attr_single = &proc->attr_single;
     attr_fep    = &proc->attr_fep;
+    attr_td     = &proc->attr_td;
+    attr_te     = &proc->attr_te; 
 #endif
 
     if (eNB->single_thread_flag==0) {
@@ -1399,6 +1358,8 @@ void init_eNB_proc(int inst) {
     else {
       pthread_create(&proc->pthread_single, attr_single, eNB_thread_single, &eNB->proc);
       init_fep_thread(eNB,attr_fep);
+      init_td_thread(eNB,attr_td);
+      init_te_thread(eNB,attr_te);
     }
     pthread_create( &proc->pthread_prach, attr_prach, eNB_thread_prach, &eNB->proc );
     if ((eNB->node_timing == synch_to_other) ||
@@ -1625,6 +1586,8 @@ void init_eNB(eNB_func_t node_function[], eNB_timing_t node_timing[],int nb_inst
       case NGFI_RRU_IF5:
 	eNB->do_prach             = NULL;
 	eNB->fep                  = eNB_fep_rru_if5;
+	eNB->td                   = NULL;
+	eNB->te                   = NULL;
 	eNB->proc_uespec_rx       = NULL;
 	eNB->proc_tx              = NULL;
 	eNB->tx_fh                = NULL;
@@ -1649,6 +1612,8 @@ void init_eNB(eNB_func_t node_function[], eNB_timing_t node_timing[],int nb_inst
       case NGFI_RRU_IF4p5:
 	eNB->do_prach             = do_prach;
 	eNB->fep                  = eNB_fep_full;
+	eNB->td                   = NULL;
+	eNB->te                   = NULL;
 	eNB->proc_uespec_rx       = NULL;
 	eNB->proc_tx              = NULL;//proc_tx_rru_if4p5;
 	eNB->tx_fh                = NULL;
@@ -1676,6 +1641,8 @@ void init_eNB(eNB_func_t node_function[], eNB_timing_t node_timing[],int nb_inst
       case eNodeB_3GPP:
 	eNB->do_prach             = do_prach;
 	eNB->fep                  = eNB_fep_full;
+	eNB->td                   = ulsch_decoding_data_2thread;
+	eNB->te                   = dlsch_encoding_2thread;
 	eNB->proc_uespec_rx       = phy_procedures_eNB_uespec_RX;
 	eNB->proc_tx              = proc_tx_full;
 	eNB->tx_fh                = NULL;
@@ -1694,6 +1661,8 @@ void init_eNB(eNB_func_t node_function[], eNB_timing_t node_timing[],int nb_inst
       case eNodeB_3GPP_BBU:
 	eNB->do_prach       = do_prach;
 	eNB->fep            = eNB_fep_full;
+	eNB->td             = ulsch_decoding_data_2thread;
+	eNB->te             = dlsch_encoding_2thread;
 	eNB->proc_uespec_rx = phy_procedures_eNB_uespec_RX;
 	eNB->proc_tx        = proc_tx_full;
 	eNB->tx_fh          = tx_fh_if5;
@@ -1716,6 +1685,8 @@ void init_eNB(eNB_func_t node_function[], eNB_timing_t node_timing[],int nb_inst
       case NGFI_RCC_IF4p5:
 	eNB->do_prach             = do_prach;
 	eNB->fep                  = NULL;
+	eNB->td                   = ulsch_decoding_data_2thread;
+	eNB->te                   = dlsch_encoding_2thread;
 	eNB->proc_uespec_rx       = phy_procedures_eNB_uespec_RX;
 	eNB->proc_tx              = proc_tx_high;
 	eNB->tx_fh                = tx_fh_if4p5;
@@ -1737,6 +1708,8 @@ void init_eNB(eNB_func_t node_function[], eNB_timing_t node_timing[],int nb_inst
       case NGFI_RAU_IF4p5:
 	eNB->do_prach       = do_prach;
 	eNB->fep            = NULL;
+	eNB->td             = ulsch_decoding_data_2thread;
+	eNB->te             = dlsch_encoding_2thread;
 	eNB->proc_uespec_rx = phy_procedures_eNB_uespec_RX;
 	eNB->proc_tx        = proc_tx_high;
 	eNB->tx_fh          = tx_fh_if4p5;