diff --git a/cmake_targets/CMakeLists.txt b/cmake_targets/CMakeLists.txt
index e1396b639845d60107acfd5415393da4d8353ce6..9f3c0ece0ddcf88b299d4d830486f83be93694e9 100644
--- a/cmake_targets/CMakeLists.txt
+++ b/cmake_targets/CMakeLists.txt
@@ -178,7 +178,7 @@ set(CMAKE_EXE_LINKER_FLAGS  "${CMAKE_EXE_LINKER_FLAGS} -Wl,-rpath -Wl,${CMAKE_CU
 # these changes are related to hardcoded path to include .h files
 add_definitions(-DCMAKER)
 set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS} -g -DMALLOC_CHECK_=3")
-set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS} -g -DMALLOC_CHECK_=3 -O3") 
+set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS} -g -DMALLOC_CHECK_=3 -O2") 
 
 
 set(GIT_BRANCH        "UNKNOWN")
diff --git a/openair1/PHY/LTE_TRANSPORT/ulsch_decoding.c b/openair1/PHY/LTE_TRANSPORT/ulsch_decoding.c
index 6ed0f91101d211a719f01ddb64e903bb5f75681f..1d0c8c25e324c54d50c2ab9ec0a8114d9058a004 100644
--- a/openair1/PHY/LTE_TRANSPORT/ulsch_decoding.c
+++ b/openair1/PHY/LTE_TRANSPORT/ulsch_decoding.c
@@ -127,7 +127,7 @@ LTE_eNB_ULSCH_t *new_eNB_ulsch(uint8_t max_turbo_iterations,uint8_t N_RB_UL, uin
     ulsch->Mlimit = 4;
 
     for (i=0; i<8; i++) {
-      //      msg("new_ue_ulsch: Harq process %d\n",i);
+      //      printf("new_ue_ulsch: Harq process %d\n",i);
       ulsch->harq_processes[i] = (LTE_UL_eNB_HARQ_t *)malloc16(sizeof(LTE_UL_eNB_HARQ_t));
 
       if (ulsch->harq_processes[i]) {
@@ -202,11 +202,11 @@ uint8_t extract_cqi_crc(uint8_t *cqi,uint8_t CQI_LENGTH)
   uint8_t crc;
 
   crc = cqi[CQI_LENGTH>>3];
-  //  msg("crc1: %x, shift %d\n",crc,CQI_LENGTH&0x7);
+  //  printf("crc1: %x, shift %d\n",crc,CQI_LENGTH&0x7);
   crc = (crc<<(CQI_LENGTH&0x7));
   // clear crc bits
   //  ((char *)cqi)[CQI_LENGTH>>3] &= 0xff>>(8-(CQI_LENGTH&0x7));
-  //  msg("crc2: %x, cqi0 %x\n",crc,cqi[1+(CQI_LENGTH>>3)]);
+  //  printf("crc2: %x, cqi0 %x\n",crc,cqi[1+(CQI_LENGTH>>3)]);
   crc |= (cqi[1+(CQI_LENGTH>>3)])>>(8-(CQI_LENGTH&0x7));
   // clear crc bits
   //(((char *)cqi)[1+(CQI_LENGTH>>3)]) = 0;
@@ -217,7 +217,54 @@ uint8_t extract_cqi_crc(uint8_t *cqi,uint8_t CQI_LENGTH)
 }
 
 
+typedef struct {
+  PHY_VARS_eNB *eNB;
+  
+} tc_param;
+
+static void *td_thread(void *param) {
+
+  PHY_VARS_eNB *eNB = (tc_param*)param->eNB;
+  eNB_proc_t *proc  = &eNB->proc;
+
+  while (!oai_exit) {
+
+    if (wait_on_condition(&proc->mutex_td,&proc->cond_td,&proc->instance_cnt_td,"td thread")<0) break;  
+    // TD here
+    ret = tc(&ulsch_harq->d[r][96],
+	     ulsch_harq->c[r],
+	     Kr,
+	     f1f2mat_old[iind*2],
+	     f1f2mat_old[(iind*2)+1],
+	     ulsch->max_turbo_iterations,//MAX_TURBO_ITERATIONS,
+	     crc_type,
+	     (r==0) ? ulsch_harq->F : 0,
+	     &eNB->ulsch_tc_init_stats,
+	     &eNB->ulsch_tc_alpha_stats,
+	     &eNB->ulsch_tc_beta_stats,
+	     &eNB->ulsch_tc_gamma_stats,
+	     &eNB->ulsch_tc_ext_stats,
+	     &eNB->ulsch_tc_intl1_stats,
+	     &eNB->ulsch_tc_intl2_stats);
+    
+    stop_meas(&eNB->ulsch_turbo_decoding_stats);
+    
+    status[r] = ret;
+
+    if (release_thread(&proc->mutex_td,&proc->instance_cnt_td,"td thread")<0) break;
+
+    if (pthread_cond_signal(&proc->cond_td) != 0) {
+      printf("[eNB] ERROR pthread_cond_signal for td thread exit\n");
+      exit_fun( "ERROR pthread_cond_signal" );
+      return;
+    }
+  }
+
 
+
+  return(NULL);
+}
+  
 unsigned int  ulsch_decoding(PHY_VARS_eNB *eNB,eNB_rxtx_proc_t *proc,
                              uint8_t UE_id,
                              uint8_t control_only_flag,
@@ -312,7 +359,7 @@ unsigned int  ulsch_decoding(PHY_VARS_eNB *eNB,eNB_rxtx_proc_t *proc,
 
 
 #ifdef DEBUG_ULSCH_DECODING
-  msg("ulsch_decoding (Nid_cell %d, rnti %x, x2 %x): round %d, RV %d, mcs %d, O_RI %d, O_ACK %d, G %d, subframe %d\n",
+  printf("ulsch_decoding (Nid_cell %d, rnti %x, x2 %x): round %d, RV %d, mcs %d, O_RI %d, O_ACK %d, G %d, subframe %d\n",
       frame_parms->Nid_cell,ulsch->rnti,x2,
       ulsch_harq->round,
       ulsch_harq->rvidx,
@@ -401,7 +448,7 @@ unsigned int  ulsch_decoding(PHY_VARS_eNB *eNB,eNB_rxtx_proc_t *proc,
   //  Q_ACK = Qprime * Q_m;
   Qprime_ACK = Qprime;
 #ifdef DEBUG_ULSCH_DECODING
-  msg("ulsch_decoding.c: Qprime_ACK %d, Msc_initial %d, Nsymb_initial %d, sumKr %d\n",
+  printf("ulsch_decoding.c: Qprime_ACK %d, Msc_initial %d, Nsymb_initial %d, sumKr %d\n",
       Qprime_ACK,ulsch_harq->Msc_initial,ulsch_harq->Nsymb_initial,sumKr);
 #endif
 
@@ -430,7 +477,7 @@ unsigned int  ulsch_decoding(PHY_VARS_eNB *eNB,eNB_rxtx_proc_t *proc,
 
   Q_CQI = Q_m * Qprime;
 #ifdef DEBUG_ULSCH_DECODING
-  msg("ulsch_decoding: G %d, Q_RI %d, Q_CQI %d (L %d, Or1 %d) O_ACK %d\n",G,Q_RI,Q_CQI,L,ulsch_harq->Or1,ulsch_harq->O_ACK);
+  printf("ulsch_decoding: G %d, Q_RI %d, Q_CQI %d (L %d, Or1 %d) O_ACK %d\n",G,Q_RI,Q_CQI,L,ulsch_harq->Or1,ulsch_harq->O_ACK);
 #endif
 
   G = G - Q_RI - Q_CQI;
@@ -531,7 +578,7 @@ unsigned int  ulsch_decoding(PHY_VARS_eNB *eNB,eNB_rxtx_proc_t *proc,
     }
 
 #ifdef DEBUG_ULSCH_DECODING
-    msg("ulsch_decoding.c: ACK i %d, r %d, j %d, ColumnSet[j] %d\n",i,r,j,columnset[j]);
+    printf("ulsch_decoding.c: ACK i %d, r %d, j %d, ColumnSet[j] %d\n",i,r,j,columnset[j]);
 #endif
     j=(j+3)&3;
   }
@@ -964,15 +1011,15 @@ unsigned int  ulsch_decoding(PHY_VARS_eNB *eNB,eNB_rxtx_proc_t *proc,
     }
 
 #ifdef DEBUG_ULSCH_DECODING
-    msg("ulsch_decoding: Or1=%d\n",ulsch_harq->Or1);
+    printf("ulsch_decoding: Or1=%d\n",ulsch_harq->Or1);
 
     for (i=0; i<1+((8+ulsch_harq->Or1)/8); i++)
-      msg("ulsch_decoding: O[%d] %d\n",i,ulsch_harq->o[i]);
+      printf("ulsch_decoding: O[%d] %d\n",i,ulsch_harq->o[i]);
 
     if (ulsch_harq->cqi_crc_status == 1)
-      msg("RX CQI CRC OK (%x)\n",extract_cqi_crc(o_flip,ulsch_harq->Or1));
+      printf("RX CQI CRC OK (%x)\n",extract_cqi_crc(o_flip,ulsch_harq->Or1));
     else
-      msg("RX CQI CRC NOT OK (%x)\n",extract_cqi_crc(o_flip,ulsch_harq->Or1));
+      printf("RX CQI CRC NOT OK (%x)\n",extract_cqi_crc(o_flip,ulsch_harq->Or1));
 
 #endif
   }
@@ -1010,7 +1057,7 @@ unsigned int  ulsch_decoding(PHY_VARS_eNB *eNB,eNB_rxtx_proc_t *proc,
     }
 
 #ifdef DEBUG_ULSCH_DECODING
-    msg("f1 %d, f2 %d, F %d\n",f1f2mat_old[2*iind],f1f2mat_old[1+(2*iind)],(r==0) ? ulsch_harq->F : 0);
+    printf("f1 %d, f2 %d, F %d\n",f1f2mat_old[2*iind],f1f2mat_old[1+(2*iind)],(r==0) ? ulsch_harq->F : 0);
 #endif
 
     memset(&dummy_w[r][0],0,3*(6144+64)*sizeof(short));
@@ -1019,7 +1066,7 @@ unsigned int  ulsch_decoding(PHY_VARS_eNB *eNB,eNB_rxtx_proc_t *proc,
                                           (r==0) ? ulsch_harq->F : 0);
 
 #ifdef DEBUG_ULSCH_DECODING
-    msg("Rate Matching Segment %d (coded bits (G) %d,unpunctured/repeated bits %d, Q_m %d, nb_rb %d, Nl %d)...\n",
+    printf("Rate Matching Segment %d (coded bits (G) %d,unpunctured/repeated bits %d, Q_m %d, nb_rb %d, Nl %d)...\n",
         r, G,
         Kr*3,
         Q_m,
@@ -1058,40 +1105,41 @@ unsigned int  ulsch_decoding(PHY_VARS_eNB *eNB,eNB_rxtx_proc_t *proc,
     stop_meas(&eNB->ulsch_deinterleaving_stats);
   }
 
-    for (r=0; r<ulsch_harq->C; r++) {
-
-      /*      printf("c[%d] : %p\n",r,
-	     ulsch_harq->c[r]);
-      */
-
-      if (ulsch_harq->C == 1)
-        crc_type = CRC24_A;
-      else
-        crc_type = CRC24_B;
-
-      start_meas(&eNB->ulsch_turbo_decoding_stats);
-
-      ret = tc(&ulsch_harq->d[r][96],
-               ulsch_harq->c[r],
-               Kr,
-               f1f2mat_old[iind*2],
-               f1f2mat_old[(iind*2)+1],
-               ulsch->max_turbo_iterations,//MAX_TURBO_ITERATIONS,
-               crc_type,
-               (r==0) ? ulsch_harq->F : 0,
-               &eNB->ulsch_tc_init_stats,
-               &eNB->ulsch_tc_alpha_stats,
-               &eNB->ulsch_tc_beta_stats,
-               &eNB->ulsch_tc_gamma_stats,
-               &eNB->ulsch_tc_ext_stats,
-               &eNB->ulsch_tc_intl1_stats,
-               &eNB->ulsch_tc_intl2_stats);
-
-      stop_meas(&eNB->ulsch_turbo_decoding_stats);
-
-      status[r] = ret;
-
-    }
+  for (r=0; r<ulsch_harq->C; r+=2) {
+    
+    /*      printf("c[%d] : %p\n",r,
+	    ulsch_harq->c[r]);
+    */
+    
+    if (ulsch_harq->C == 1)
+      crc_type = CRC24_A;
+    else
+      crc_type = CRC24_B;
+    
+    start_meas(&eNB->ulsch_turbo_decoding_stats);
+    
+    ret = tc(&ulsch_harq->d[r][96],
+	     ulsch_harq->c[r],
+	     Kr,
+	     f1f2mat_old[iind*2],
+	     f1f2mat_old[(iind*2)+1],
+	     ulsch->max_turbo_iterations,//MAX_TURBO_ITERATIONS,
+	     crc_type,
+	     (r==0) ? ulsch_harq->F : 0,
+	     &eNB->ulsch_tc_init_stats,
+	     &eNB->ulsch_tc_alpha_stats,
+	     &eNB->ulsch_tc_beta_stats,
+	     &eNB->ulsch_tc_gamma_stats,
+	     &eNB->ulsch_tc_ext_stats,
+	     &eNB->ulsch_tc_intl1_stats,
+	     &eNB->ulsch_tc_intl2_stats);
+    
+    stop_meas(&eNB->ulsch_turbo_decoding_stats);
+    
+    status[r] = ret;
+    if (ret==(1+ulsch->max_turbo_iterations))
+	break;
+  }
 
   // Reassembly of Transport block here
   offset = 0;
@@ -1208,7 +1256,7 @@ int ulsch_abstraction_MIESM(double* sinr_dB,uint8_t TM, uint8_t mcs,uint16_t nrb
     sinr_db1 = sinr_dB[offset*2];
     sinr_db2 = sinr_dB[offset*2+1];
 
-    msg("sinr_db1=%f\n,sinr_db2=%f\n",sinr_db1,sinr_db2);
+    printf("sinr_db1=%f\n,sinr_db2=%f\n",sinr_db1,sinr_db2);
 
     //rounding up for the table lookup
     sinr_db1 *= 10;
@@ -1462,7 +1510,7 @@ int ulsch_abstraction_MIESM(double* sinr_dB,uint8_t TM, uint8_t mcs,uint16_t nrb
     sinr_eff = sinr_eff * beta2_dlsch_MI[TM][mcs];
   }
 
-  msg("SINR_Eff = %e\n",sinr_eff);
+  printf("SINR_Eff = %e\n",sinr_eff);
 
   sinr_eff *= 10;
   sinr_eff = floor(sinr_eff);
@@ -1470,7 +1518,7 @@ int ulsch_abstraction_MIESM(double* sinr_dB,uint8_t TM, uint8_t mcs,uint16_t nrb
   //   sinr_eff += 1;
   // }
   sinr_eff /= 10;
-  msg("sinr_eff after rounding = %f\n",sinr_eff);
+  printf("sinr_eff after rounding = %f\n",sinr_eff);
 
   for (index = 0; index < 16; index++) {
     if(index == 0) {
@@ -1488,10 +1536,10 @@ int ulsch_abstraction_MIESM(double* sinr_dB,uint8_t TM, uint8_t mcs,uint16_t nrb
 #ifdef USER_MODE // need to be adapted for the emulation in the kernel space 
 
   if (uniformrandom() < bler) {
-    msg("abstraction_decoding failed (mcs=%d, sinr_eff=%f, bler=%f)\n",mcs,sinr_eff,bler);
+    printf("abstraction_decoding failed (mcs=%d, sinr_eff=%f, bler=%f)\n",mcs,sinr_eff,bler);
     return(0);
   } else {
-    msg("abstraction_decoding successful (mcs=%d, sinr_eff=%f, bler=%f)\n",mcs,sinr_eff,bler);
+    printf("abstraction_decoding successful (mcs=%d, sinr_eff=%f, bler=%f)\n",mcs,sinr_eff,bler);
     return(1);
   }
 
diff --git a/openair1/SCHED/phy_procedures_lte_eNb.c b/openair1/SCHED/phy_procedures_lte_eNb.c
index 980fa3335a232783431a7bb81d1ec43f7615240f..cbea52db810960076c8e69cae9d44697d7d797f2 100755
--- a/openair1/SCHED/phy_procedures_lte_eNb.c
+++ b/openair1/SCHED/phy_procedures_lte_eNb.c
@@ -2508,6 +2508,8 @@ void fep0(PHY_VARS_eNB *eNB,int slot) {
   LTE_DL_FRAME_PARMS *fp = &eNB->frame_parms;
   int l;
 
+  //  printf("fep0: slot %d\n",slot);
+
   remove_7_5_kHz(eNB,(slot&1)+(proc->subframe_rx<<1));
   for (l=0; l<fp->symbols_per_tti/2; l++) {
     slot_fep_ul(fp,
@@ -2560,21 +2562,68 @@ static inline int wait_on_condition(pthread_mutex_t *mutex,pthread_cond_t *cond,
   return(0);
 }
 
+static inline int wait_on_busy_condition(pthread_mutex_t *mutex,pthread_cond_t *cond,int *instance_cnt,char *name) {
+
+  if (pthread_mutex_lock(mutex) != 0) {
+    LOG_E( PHY, "[SCHED][eNB] error locking mutex for %s\n",name);
+    exit_fun("nothing to add");
+    return(-1);
+  }
+  
+  while (*instance_cnt == 0) {
+    // most of the time the thread will skip this
+    // waits only if proc->instance_cnt_rxtx is 0
+    pthread_cond_wait(cond,mutex); // this unlocks mutex_rxtx while waiting and then locks it again
+  }
+
+  if (pthread_mutex_unlock(mutex) != 0) {
+    LOG_E(PHY,"[SCHED][eNB] error unlocking mutex for %s\n",name);
+    exit_fun("nothing to add");
+    return(-1);
+  }
+  return(0);
+}
+
 extern int oai_exit;
 
+#define THREAD_FULL 1
 
+#ifdef THREAD_FULL
 static void *fep_thread(void *param) {
 
   PHY_VARS_eNB *eNB = (PHY_VARS_eNB *)param;
   eNB_proc_t *proc  = &eNB->proc;
   while (!oai_exit) {
+
     if (wait_on_condition(&proc->mutex_fep,&proc->cond_fep,&proc->instance_cnt_fep,"fep thread")<0) break;  
     fep0(eNB,0);
     if (release_thread(&proc->mutex_fep,&proc->instance_cnt_fep,"fep thread")<0) break;
+
+    if (pthread_cond_signal(&proc->cond_fep) != 0) {
+      printf("[eNB] ERROR pthread_cond_signal for fep thread exit\n");
+      exit_fun( "ERROR pthread_cond_signal" );
+      return;
+    }
   }
+
+
+
+  return(NULL);
+}
+
+#else
+
+static void *fep_thread(void *param) {
+
+  PHY_VARS_eNB *eNB = (PHY_VARS_eNB *)param;
+  eNB_proc_t *proc  = &eNB->proc;
+
+  fep0(eNB,0);
+
   return(NULL);
 }
 
+#endif
 void init_fep_thread(PHY_VARS_eNB *eNB,pthread_attr_t *attr_fep) {
 
   eNB_proc_t *proc = &eNB->proc;
@@ -2584,13 +2633,17 @@ void init_fep_thread(PHY_VARS_eNB *eNB,pthread_attr_t *attr_fep) {
   pthread_mutex_init( &proc->mutex_fep, NULL);
   pthread_cond_init( &proc->cond_fep, NULL);
 
+#ifdef THREAD_FULL
   pthread_create(&proc->pthread_fep, attr_fep, fep_thread, (void*)eNB);
+#endif
 
 }
 
+
 void eNB_fep_full_2thread(PHY_VARS_eNB *eNB) {
 
   eNB_proc_t *proc = &eNB->proc;
+
   struct timespec wait;
   int wait_cnt=0;
   wait.tv_sec=0;
@@ -2599,6 +2652,8 @@ void eNB_fep_full_2thread(PHY_VARS_eNB *eNB) {
   VCD_SIGNAL_DUMPER_DUMP_FUNCTION_BY_NAME(VCD_SIGNAL_DUMPER_FUNCTIONS_ENB_SLOT_FEP,1);
   start_meas(&eNB->ofdm_demod_stats);
 
+#ifdef THREAD_FULL
+
   if (pthread_mutex_timedlock(&proc->mutex_fep,&wait) != 0) {
     printf("[eNB] ERROR pthread_mutex_lock for fep thread %d (IC %d)\n", proc->instance_cnt_fep);
     exit_fun( "error locking mutex_fep" );
@@ -2626,26 +2681,21 @@ void eNB_fep_full_2thread(PHY_VARS_eNB *eNB) {
   // call second slot in this symbol
   fep0(eNB,1);
 
-  if (pthread_mutex_timedlock(&proc->mutex_fep,&wait) != 0) {
-    printf("[eNB] ERROR pthread_mutex_lock for fep thread %d (IC %d)\n", proc->instance_cnt_fep);
-    exit_fun( "error locking mutex_fep" );
-    return;
-  }
-  while (proc->instance_cnt_fep==0) {
-    wait_cnt++;
-    if (wait_cnt>10000)
-      break;
-  };
+  wait_on_busy_condition(&proc->mutex_fep,&proc->cond_fep,&proc->instance_cnt_fep,"fep thread");  
 
-  pthread_mutex_unlock( &proc->mutex_fep );
-  if (wait_cnt>1000000) {
-    printf("[eNB] parallel FEP didn't finish\n");
-    exit_fun( "error" );
-  }
+#else
+
+  pthread_create(&proc->pthread_fep, NULL, fep_thread, (void*)eNB);
+  // call second slot in this symbol
+  fep0(eNB,1);
+  pthread_join(proc->pthread_fep,(void**)NULL);
 
+#endif
   stop_meas(&eNB->ofdm_demod_stats);
 }
 
+
+
 void eNB_fep_full(PHY_VARS_eNB *eNB) {
 
   eNB_proc_t *proc = &eNB->proc;
diff --git a/targets/ARCH/USRP/USERSPACE/LIB/usrp_lib.cpp b/targets/ARCH/USRP/USERSPACE/LIB/usrp_lib.cpp
index f9c2309ab84c9b975468580299460809dd73f858..56a0f50ff4e3e86da118394cd4c609b2309d9c22 100644
--- a/targets/ARCH/USRP/USERSPACE/LIB/usrp_lib.cpp
+++ b/targets/ARCH/USRP/USERSPACE/LIB/usrp_lib.cpp
@@ -568,7 +568,7 @@ extern "C" {
     // workaround for an api problem, master clock has to be set with the constructor not via set_master_clock_rate
     args += boost::str(boost::format(",master_clock_rate=%f") % usrp_master_clock);
     
-    args += ",num_send_frames=256,num_recv_frames=256, send_frame_size=4096, recv_frame_size=4096";
+//    args += ",num_send_frames=256,num_recv_frames=256, send_frame_size=4096, recv_frame_size=4096";
     
     uhd::device_addrs_t device_adds = uhd::device::find(args);