/* * CSSE, Monash University, Clayton, Victoria 3800, Australia * CSE2304 Prac 4, Group A * Semester 1, May 2003 * Sample Solution - O.R. * * dna k file1 file2 ... * Outputs an adjacency matrix of the 'distance' from one DNA source to * another, measured as the effectiveness of coding each sequence by a * Markov model, of order k, based on all other input sequences. */ #include #include #include #include #include #include /* Used to indicate a 'missing value' in the adjacency matrix */ #define EMPTY -1 /* delta, used for probability estimation */ #define delta 0.5 /* True if the character x is a valid DNA base */ #define isBase(x) (x == 'a'\ || x == 'A'\ || x == 'c'\ || x == 'C'\ || x == 'g'\ || x == 'G'\ || x == 't'\ || x == 'T') /* Maps the DNA base x to a number: * a == 0 * c == 1 * g == 2 * t == 3 */ #define baseEquiv(x) ((x)<'d'?((x)=='a'?0:1):((x)=='g'?2:3)) // Self-explanatory #define log2(x) (log(x)/log(2)) #define max(x,y) (x>y?x:y) #define min(x,y) (x 0 && rows > 0 && cols > 0); /* Allocate first dimension (rows) */ if(!(t = calloc(sizeof(void *), rows))) /* Handle failure */ return NULL; /* Allocate each row */ for(i=0; i 0 then fill context array */ if(k > 0) { while((c = getc(in)) != EOF) { if(isBase(c)) { /* Have read a base */ /* Store in context and move on */ c = tolower(c); if(out) { /* write to bases output file */ putc(c, out); } context[end] = c; end = (end + 1) % k; if(end == 0) { /* Context is full */ break; } } } } /* Finish processing input file */ while((c = getc(in)) != EOF) { if(isBase(c)) { /* Have read a base. Remember and overwrite earliest * value in context */ c = tolower(c); if(out) { /* write to bases output file */ putc(c, out); } prev = dnaEquiv(context, end, k); if(k > 0) { context[end] = c; end = (end + 1) % k; } /* Add context/base sequence to our Markov model */ model[baseEquiv(c)][prev]++; } } /* Sum columns to determine totals */ for(i=0; i<(1 << (k << 1)); i++) { model[4][i] = model[0][i] + model[1][i] + model[2][i] + model[3][i]; } } /* outputAdjacencyMatrix * Prints to stdout the adjacency matrix adj using reasonably well-formatted * output. size is the length of both dimensions of the matrix and the * number of labels. */ void outputAdjacencyMatrix(double **adj, int size, char **labels) { int i, j, maxlen=7; /* Calculate max label length */ for(i=0; i 0 then fill context array */ if(k > 0) { while((c = getc(in)) != EOF) { if(isBase(c)) { /* Have read a base */ /* Not enough context; use default * probability */ sum += -log2(0.25); /* Store in context and move on */ c = tolower(c); context[end] = c; end = (end + 1) % k; if(end == 0) /* Context is full */ break; } } } /* Finish processing input file */ while((c = getc(in)) != EOF) { if(isBase(c)) { /* Have read a base. Remember and overwrite earliest * value in context */ c = tolower(c); prev = dnaEquiv(context, end, k); if(k > 0) { context[end] = c; end = (end + 1) % k; } /* Calculate current base's probability */ sum += -log2((model[baseEquiv(c)][prev] + delta) / (model[4][prev] + (4 * delta))); } } /* Finished. */ return sum; } /* getAdjacencyMatrix * Determines and outputs the adjacency matrix of DNA source distances by * determining the probability of each base in the filenames supplied, based * on a Markov model of the previous k bases. Compares all files against each * other. Skips unknown characters. Case-insensitive. File errors are * non-fatal. * Returns 0 on success, 1 on allocation failure. */ int getAdjacencyMatrix(double **adj, const int k, int numfiles, char *filenames[]) { int *len = NULL; char *context = NULL; int **model = NULL; FILE *in = NULL; int i, j, c; double id; /* Preconditions */ assert(k >= 0); assert(numfiles >= 0); /* Allocate space to store context and Markov model. */ /* Space to store file lengths */ len = (int *)calloc(sizeof(int), numfiles); if(!len) { /* Couldn't allocate space for lengths */ return 1; /* failure */ } /* One extra space for context list for when k=0 */ context = (char *)calloc(sizeof(char), k+1); if(!context) { /* Couldn't allocate space for context, clean up */ free(len); return 1; /* failure */ } /* The model matrix stores * 5 rows (each of the bases plus a total) * 4^k columns per row * This comes to about 20GB when k = 15! */ model = (int **)matrix(sizeof(int), 5, 1 << (k << 1)); if(!model) { /* Couldn't allocate space for model, clean up */ free(context); free(len); return 1; /* failure */ } /* Memory has been successfully allocated. */ assert(adj && model && context); /* For each file as a model: */ for(i=0; i 15) { /* Too high or too low; quit. */ fprintf(stderr, "error: k must be between zero and 15 inclusive\n"); return 3; } --argc; ++argv; adj = (double **)matrix(sizeof(double), argc, argc); if(!adj) { /* Couldn't allocate space for adjacency matrix */ fprintf(stderr, "error: could not allocate memory\n"); return 4; } /* Call the getAdjacencyMatrix function to do all the dirty work. */ result = getAdjacencyMatrix(adj, k, argc, argv); if(result == 1) { /* Something went wrong allocating memory; quit. */ fprintf(stderr, "error: could not allocate memory\n"); } outputAdjacencyMatrix(adj, argc, argv); freeMatrix((void **)adj, argc); /* Finished. */ return 0; }