/* * CSSE, Monash University, Clayton, Victoria 3800, Australia. * CSE2304 Prac 3, Group A * Semester 1, May 2003 * Sample Solution, thanks to O.R. * * dna k [fileName] * Displays the probability of each base in the input file (filename) or * standard input using the estimate of a Markov model of order k. */ #include #include #include #include #include /* For convenience; specifies how each probability is output. */ /* Shortest float representation, comma-separated */ #define OUTPUT_FORMAT "%g, " /* Remove final comma. (\010 = backspace) */ #define OUTPUT_END "\010\010 \010\n" /* Temporary file for caching text from stdin */ #define TEMP_FILE ".bases.temp" /* delta, used for probability estimation */ #define delta 0.5 /* True if the character x is a valid DNA base */ #define isBase(x) (x == 'a'\ || x == 'A'\ || x == 'c'\ || x == 'C'\ || x == 'g'\ || x == 'G'\ || x == 't'\ || x == 'T') /* Maps the DNA base x to a number: * a == 0 * c == 1 * g == 2 * t == 3 */ #define baseEquiv(x) ((x)<'d'?((x)=='a'?0:1):((x)=='g'?2:3)) /* dnaEquiv * Maps context, an array of DNA bases, to a unique number. Used to determine * the appropriate array index for a context. start is used to specify the * index of the first base, and len provides the array bound. From start * reads all bases to the end of the array, then wraps around and reads those * remaining. * Returns the unique numeric representation. */ unsigned long dnaEquiv(char *context, int start, int len) { unsigned long c = 0; int n = 0; /* Sanity-check parameters */ assert(0 <= start && start <= len && len < 16); while(n++ < len) { /* Shift current value left by two bits * to make room for next */ c <<= 2; c += baseEquiv(context[start]); /* Move through array and wrap at end */ start = (start + 1) % len; } return c; } /* matrix * Allocates space for an arbitrary two-dimensional array. * Each element is initialised to 0. * Returns NULL on failure. * Memory must later be freed using freeMatrix. */ void **matrix(size_t size, int rows, int cols) { void **t; int i; /* Require positive input */ assert(size > 0 && rows > 0 && cols > 0); /* Allocate first dimension (rows) */ if(!(t = calloc(sizeof(void *), rows))) /* Handle failure */ return NULL; /* Allocate each row */ for(i=0; i 0 then fill context array */ if(k > 0) { while((c = getc(in)) != EOF) { if(isBase(c)) { /* Have read a base */ /* Store in context and move on */ c = tolower(c); if(out) { /* write to bases output file */ putc(c, out); } context[end] = c; end = (end + 1) % k; if(end == 0) { /* Context is full */ break; } } } } /* Finish processing input file */ while((c = getc(in)) != EOF) { if(isBase(c)) { /* Have read a base. Remember and overwrite earliest * value in context */ c = tolower(c); if(out) { /* write to bases output file */ putc(c, out); } prev = dnaEquiv(context, end, k); if(k > 0) { context[end] = c; end = (end + 1) % k; } /* Add context/base sequence to our Markov model */ model[baseEquiv(c)][prev]++; } } /* Sum columns to determine totals */ for(i=0; i<(1 << (k << 1)); i++) { model[4][i] = model[0][i] + model[1][i] + model[2][i] + model[3][i]; } } /* outputProbabilities * Reads a file of DNA bases and prints the probability of each based on the * supplied Markov model. * Requires that * in is a file open for reading * context is a character array of capacity k+1 * model is a matrix with 5 rows and 4^k columns */ void outputProbabilities(const int k, FILE *in, char *context, int **model) { int end = 0, prev; int c; /* If |context| > 0 then fill context array */ if(k > 0) { while((c = getc(in)) != EOF) { if(isBase(c)) { /* Have read a base */ /* Not enough context; output default */ /* probability */ printf(OUTPUT_FORMAT, 0.25); /* Store in context and move on */ c = tolower(c); context[end] = c; end = (end + 1) % k; if(end == 0) /* Context is full */ break; } } } /* Finish processing input file */ while((c = getc(in)) != EOF) { if(isBase(c)) { /* Have read a base. Remember and overwrite earliest * value in context */ c = tolower(c); prev = dnaEquiv(context, end, k); if(k > 0) { context[end] = c; end = (end + 1) % k; } /* Calculate and output current base's probability */ printf(OUTPUT_FORMAT, (model[baseEquiv(c)][prev] + delta) / (model[4][prev] + (4 * delta))); } } /* Finished. Print newline. */ printf(OUTPUT_END); } /* dna * Determines and outputs the probability of each base in a DNA input * sequence, based on a Markov model of the previous k bases. Reads * characters from the file in, which must be open for reading. Skips unknown * characters. Case-insensitive. * Returns 0 on success, 1 on allocation failure, 2 on file error. */ int dna(const int k, FILE *in) { char *context = NULL; int **model; FILE *out = NULL; /* Preconditions */ assert(k >= 0); assert(in != NULL); /* Setup */ if(in == stdin) { /* Need to make a copy of the input to use later */ if(!(out = fopen(TEMP_FILE, "w"))) { return 2; /* failure */ } } /* Allocate space to store context and Markov model. */ /* One extra space for context list for when k=0 */ context = (char *)calloc(sizeof(int), k+1); if(!context) { /* Couldn't allocate space for context, clean up */ if(out) fclose(out); return 1; /* failure */ } /* The model matrix stores * 5 rows (each of the bases plus a total) * 4^k columns per row * This comes to about 20GB when k = 15! */ model = (int **)matrix(sizeof(int *), 5, 1 << (k << 1)); if(!model) { /* Couldn't allocate space for model, clean up */ if(out) fclose(out); free(context); return 1; /* failure */ } /* Memory has been successfully allocated. */ assert(model && context); /* Read the Markov model from the input file */ getModel(k, in, (in == stdin?out:NULL), context, model); if(out) { /* Use newly created file */ fclose(out); if(!(in = fopen(".bases.temp", "r"))) { /* Can't open for reading */ freeMatrix((void **)model, 5, 1 << (k << 1)); free(context); return 2; } } else { /* Return to the beginning of the input file */ rewind(in); } /* Read each base from the input file and display its probability */ outputProbabilities(k, in, context, model); /* All done. Clean up. */ freeMatrix((void **)model, 5, 1 << (k << 1)); free(context); if(out) { fclose(in); } return 0; /* success */ } /* main * Error-checks parameters and handles error messages. */ int main(int argc, char *argv[]) { int k, result; FILE *source = stdin; /* Use stdin if no file supplied */ /* Check for required parameters */ if(argc < 2) { /* Not enough; output a usage message and quit. */ fprintf(stderr, "usage: %s k [fileName]\n", argv[0]); return 1; } /* Read k as a number */ if(sscanf(argv[1], "%d", &k) < 1) { /* Could not translate k to an integer; quit. */ fprintf(stderr, "error: k must be an integer\n"); return 2; } /* Check limits of k */ if(k < 0 || k > 15) { /* Too high or too low; quit. */ fprintf(stderr, "error: k must be between zero and 15 inclusive\n"); return 3; } /* Look for filename */ if(argc > 2) { /* Found one. Try opening it. */ if(!(source = fopen(argv[2], "r"))) { /* No luck; quit. */ fprintf(stderr, "error: could not open file '%s'\n", argv[2]); return 4; } } /* Call the dna function to do all the dirty work. */ result = dna(k, source); if(result == 1) { /* Something went wrong allocating memory; quit. */ fprintf(stderr, "error: could not allocate memory\n"); } else if(result == 2) { /* Temporary file could not be opened */ fprintf(stderr, "error: could not create temporary file\n"); } /* Close any opened files */ if(source != stdin) fclose(source); /* Finished. */ return 0; }