Added script to parse chips.

parent 3230f1d1
CHIP files are matrices in plain text format where each row corresponds to genomic
information about a human subject. Each column corresponds to a chip, which in turn
is associated with a gene. The value of the matrix at row i and column j can take one
of three possible values:
0 - chip j of subject i matches that of a given canonical genome
1 - chip j of subject i differs from that of the canonical genome
NA - no information is available about this chip w.r.t. the canonical genome
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
/**
 * The input to this program is a plain text file matrix with three entries: 0, 1 or NA
 * The NA indicates missing data. The first line of the file is a header; the number of
 * whitespace-separated fields in it defines the number of columns of the matrix.
 *
 * The program produces two binary matrices as output: one with a 1 for every NA encountered and 0 otherwise; this
 * is the MASK file. The second file contains the 0's and 1's from the input file, and 0 where an NA was found.
 * This is the DATA file. The 0's in the latter case are of course invalid, and just a placeholder.
 *
 * The output binary matrices are encoded as PBM pseudo-images (raw "P4" variant): every row is
 * packed 8 entries per byte, most significant bit first, and padded with 0 bits to a whole byte.
 *
 * Usage: <program> <chip-file> [output-prefix]
 * Writes <output-prefix>-data.pbm and <output-prefix>-mask.pbm (prefix defaults to "chip").
 *
 * Exit status: 0 on success, 1 if the input cannot be opened (or is not given), 2/3 if the
 * data/mask output cannot be created, 4 on a write error, 5 if the input is empty or invalid.
 */

/* Characters that separate fields on a line of the input matrix. */
static const char FIELD_DELIMS[] = " \n\t\r";

/*
 * Counts the whitespace-separated fields in s without modifying it.
 * Uses the same notion of "field" as strtok(s, FIELD_DELIMS).
 */
static size_t count_fields(const char* s) {
    size_t count = 0;
    for (;;) {
        s += strspn(s, FIELD_DELIMS);   /* skip separators */
        if (*s == '\0') {
            break;
        }
        count++;
        s += strcspn(s, FIELD_DELIMS);  /* skip the field itself */
    }
    return count;
}

/*
 * Packs one text row into exactly ceil(ncols/8) bytes and appends them to fdata and fmask.
 *
 * line    - the row; it is tokenized in place (modified).
 * nfields - number of fields in line, as returned by count_fields().
 * ncols   - number of matrix columns (taken from the header).
 *
 * If the row has more fields than the header, its first field is taken to be a row
 * label and is skipped (this is the layout where the header names only the data
 * columns). NOTE(review): when the header also names the label column the label is
 * still consumed as a data entry, exactly as before -- confirm the input layout.
 *
 * An entry starting with 'N' sets the mask bit, one starting with '1' sets the data
 * bit, anything else leaves both bits 0. Entries missing from a short row are marked
 * in the mask (no information); fields beyond ncols are ignored.
 *
 * Returns 0 on success, -1 if either stream reports a write error.
 */
static int write_row(char* line, size_t nfields, size_t ncols, FILE* fdata, FILE* fmask) {
    unsigned char cd = 0, cm = 0;   /* bytes being assembled for data and mask */
    unsigned int bit = 0x80;        /* current bit position, MSB first */
    char* tok = strtok(line, FIELD_DELIMS);
    size_t j;

    if (nfields > ncols) {
        tok = strtok(NULL, FIELD_DELIMS);   /* skip the row label */
    }
    for (j = 0; j < ncols; j++) {
        if (tok) {
            const char a = tok[0];
            if (a == 'N') {
                cm |= bit;
            } else if (a == '1') {
                cd |= bit;
            }
            tok = strtok(NULL, FIELD_DELIMS);
        } else {
            cm |= bit;  /* short row: entry is missing, so flag it as unavailable */
        }
        bit >>= 1;
        if (!bit) {     /* byte complete: flush and start a new one */
            fputc(cd, fdata);
            fputc(cm, fmask);
            bit = 0x80;
            cd = cm = 0;
        }
    }
    /*
     * Flush a partially filled byte only. (Testing for a non-zero bit here would
     * always succeed and emit a spurious extra byte whenever ncols is a multiple of 8.)
     */
    if (bit != 0x80) {
        fputc(cd, fdata);
        fputc(cm, fmask);
    }
    return (ferror(fdata) || ferror(fmask)) ? -1 : 0;
}

int main(int argc, char* argv[]) {
    FILE* fin = NULL;
    FILE* fdata = NULL;
    FILE* fmask = NULL;
    char* line = NULL;
    size_t len = 0;         /* getline() buffer capacity; must start at 0 with line == NULL */
    size_t m = 0, n = 0;    /* rows, columns */
    size_t nbytes = 0;      /* packed bytes per output row */
    size_t i = 0;           /* rows written so far */
    size_t nfields;
    char ofname[128];
    const char* prefix = "chip";
    int rc = 0;
    int k;

    if (argc < 2) {
        fprintf(stderr, "Usage: %s <chip-file> [output-prefix]\n", argc > 0 ? argv[0] : "chip2pbm");
        return 1;
    }
    if (argc > 2) {
        prefix = argv[2];
    }
    fin = fopen(argv[1], "r");
    if (!fin) {
        fprintf(stderr, "Cannot open input file: %s\n", argv[1]);
        return 1;
    }
    //
    // first pass: scan number of rows and columns in file
    //
    printf("FIRST PASS\n");
    //
    // 1.1 scan number of columns from header
    //
    if (getline(&line, &len, fin) < 0) {
        fprintf(stderr, "Empty or invalid file: %s\n", argv[1]);
        rc = 5;
        goto cleanup;
    }
    n = count_fields(line);
    if (n == 0) {
        fprintf(stderr, "Empty or invalid file: %s\n", argv[1]);
        rc = 5;
        goto cleanup;
    }
    printf("Number of columns: %zu\n", n);
    //
    // 1.2 scan number of rows; blank lines are not rows
    //
    while (getline(&line, &len, fin) > 0) {
        if (count_fields(line) > 0) {
            m++;
        }
    }
    printf("Number of rows: %zu\n", m);
    //
    // second pass: generate data and mask
    //
    printf("\nSECOND PASS.\n");
    nbytes = n / 8 + (n % 8 != 0);
    printf("Bytes per row: %zu\n", nbytes);
    printf("Total data bytes: %zu\n", nbytes * m * 2);

    // a truncated name could make both outputs refer to the same file: refuse it
    k = snprintf(ofname, sizeof ofname, "%s-data.pbm", prefix);
    if (k < 0 || (size_t)k >= sizeof ofname) {
        fprintf(stderr, "Output prefix too long: %s\n", prefix);
        rc = 2;
        goto cleanup;
    }
    fdata = fopen(ofname, "wb");
    if (!fdata) {
        fprintf(stderr, "Cannot create output file: %s\n", ofname);
        rc = 2;
        goto cleanup;
    }
    k = snprintf(ofname, sizeof ofname, "%s-mask.pbm", prefix);
    if (k < 0 || (size_t)k >= sizeof ofname) {
        fprintf(stderr, "Output prefix too long: %s\n", prefix);
        rc = 3;
        goto cleanup;
    }
    fmask = fopen(ofname, "wb");
    if (!fmask) {
        fprintf(stderr, "Cannot create output file: %s\n", ofname);
        rc = 3;
        goto cleanup;
    }
    fprintf(fdata, "P4 %zu %zu\n", n, m);
    fprintf(fmask, "P4 %zu %zu\n", n, m);
    //
    // now parse each line; write to fdata and fmask
    //
    rewind(fin);
    if (getline(&line, &len, fin) < 0) { // skip header
        fprintf(stderr, "Empty or invalid file: %s\n", argv[1]);
        rc = 5;
        goto cleanup;
    }
    while (getline(&line, &len, fin) > 0) {
        nfields = count_fields(line);
        if (nfields == 0) {
            continue;   // blank line: not counted as a row in the first pass either
        }
        if (write_row(line, nfields, n, fdata, fmask) != 0) {
            fprintf(stderr, "Error writing output files\n");
            rc = 4;
            goto cleanup;
        }
        if (!(i % 10)) {
            printf("%10zu/%10zu\r", i, m);
            fflush(stdout);
        }
        i++;
    } // second pass loop
    printf("\nDONE.\n");

cleanup:
    free(line);
    if (fin) {
        fclose(fin);
    }
    // fclose() flushes buffered output, so a failure here is a write error
    if (fmask) {
        if (fclose(fmask) != 0 && rc == 0) {
            rc = 4;
        }
    }
    if (fdata) {
        if (fclose(fdata) != 0 && rc == 0) {
            rc = 4;
        }
    }
    return rc;
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment