/*------------------------------------------------------------- Copyright (C) 2000 Peter Clote. All Rights Reserved. Permission to use, copy, modify, and distribute this software and its documentation for NON-COMMERCIAL purposes and without fee is hereby granted provided that this copyright notice appears in all copies. THE AUTHOR AND PUBLISHER MAKE NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. THE AUTHORS AND PUBLISHER SHALL NOT BE LIABLE FOR ANY DAMAGES SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES. -------------------------------------------------------------*/ /****************************************** entropyFile.c P. Clote, 7 Sept 1998 Program computes entropy (single and dinucleotide) in an entire genome, rather than in windows of a genome. This program handles incomplete data (eg R,Y etc). After compiling to get a.out, to run, type a.out Methanococcus where Methanococcus is the M.jannaschii genome. ******************************************/ #include #include #define lg(x) ( ((x) == 0) ? 0 : (log(x)/log(2)) ) #define NumNucl 10 enum Nucl { A,C,G,T,N,S,K,W,R,Y }; /* type def of 10 nucleotides possible in data */ typedef int Paircount[NumNucl][NumNucl]; typedef double PairACGTcount[4][4]; enum Nucl index (char ch) /*--------------------------------------------------- Return the index in enum Nucl (0 to 9) of the nucleotide read. ----------------------------------------------------*/ { enum Nucl index1; switch ( ch ) { case 'A' : index1 = A; break; case 'C' : index1 = C; break; case 'G' : index1 = G; break; case 'T' : index1 = T; break; case 'N' : index1 = N; break; case 'S' : index1 = S; break; case 'K' : index1 = K; break; case 'W' : index1 = W; break; case 'R' : index1 = R; break; case 'Y' : index1 = Y; break; default : printf("%c\n",ch); error("Bad Data"); break; } return index1; } /* end of index */ enum Nucl indexPlus (char ch, double *a, double *c, double *g, double *t) /*--------------------------------------------------- Return the index in enum Nucl (0 to 9) of the nucleotide read. Use pointers to update (PLUS) the counters for nucleotides. CAREFUL - nucleotide counters are double, rather than integer, since the indeterminate N,S,R,Y, etc. will contribute fractional values. Below there is a switch statements to determine the single nucleotides for the character ch. ----------------------------------------------------*/ { enum Nucl index1; switch ( ch ) { case 'A' : (*a)++; index1 = A; break; case 'C' : (*c)++; index1 = C; break; case 'G' : (*g)++; index1 = G; break; case 'T' : (*t)++; index1 = T; break; case 'N' : /* A or C or G or T */ (*a) = (*a) + 0.25; (*c) = (*c) + 0.25; (*g) = (*g) + 0.25; (*t) = (*t) + 0.25; index1 = N; break; case 'S' : /* C or G */ (*c) = (*c) + 0.5; (*g) = (*g) + 0.5; index1 = S; break; case 'K' : /* G or T */ (*g) = (*g) + 0.5; (*t) = (*t) + 0.5; index1 = K; break; case 'W' : /* A or T */ (*a) = (*a) + 0.5; (*t) = (*t) + 0.5; index1 = W; break; case 'R' : /* A or G */ (*a) = (*a) + 0.5; (*g) = (*g) + 0.5; index1 = R; break; case 'Y' : /* C or T */ (*c) = (*c) + 0.5; (*t) = (*t) + 0.5; index1 = Y; break; default : printf("%c\n",ch); error("Bad Data"); break; } return index1; } /* end of indexPlus */ enum Nucl indexMinus (char ch, double *a, double *c, double *g, double *t) /*--------------------------------------------------- Return the index in enum Nucl (0 to 9) of the nucleotide read. Use pointers to update (MINUS) the counters for nucleotides. CAREFUL - nucleotide counters are double, rather than integer, since the indeterminate N,S,R,Y, etc. will contribute fractional values. Below there is a switch statements to determine the single nucleotides for the character ch. ----------------------------------------------------*/ { /* remove contribution for nucleotide freq for ch */ enum Nucl index1; switch ( ch ) { case 'A' : (*a)--; index1 = A; break; case 'C' : (*c)--; index1 = C; break; case 'G' : (*g)--; index1 = G; break; case 'T' : (*t)--; index1 = T; break; case 'N' : /* A or C or G or T */ (*a) = (*a) - 0.25; (*c) = (*c) - 0.25; (*g) = (*g) - 0.25; (*t) = (*t) - 0.25; index1 = N; break; case 'S' : /* C or G */ (*c) = (*c) - 0.5; (*g) = (*g) - 0.5; index1 = S; break; case 'K' : /* G or T */ (*g) = (*g) - 0.5; (*t) = (*t) - 0.5; index1 = K; break; case 'W' : /* A or T */ (*a) = (*a) - 0.5; (*t) = (*t) - 0.5; index1 = W; break; case 'R' : /* A or G */ (*a) = (*a) - 0.5; (*g) = (*g) - 0.5; index1 = R; break; case 'Y' : /* C or T */ (*c) = (*c) - 0.5; (*t) = (*t) - 0.5; index1 = Y; break; default : printf("%c\n",ch); error("Bad Data"); break; } return index1; } /* end indexMinus */ void computePair ( Paircount pair, PairACGTcount aux ) /*---------------------------------------------- This computes the contribution to dinucleotide frequency, using the uncertain data values of nucleotides N = A or C or G or T S = C or G K = G or T W = A or T R = A or G Y = C or T ----------------------------------------------*/ { /* only A,C,G,T contributions */ int i,j; /* initialize aux */ for (i=0;i<4;i++) for (j=0;j<4;j++) aux[i][j] = 0; /** A is A N W R A is A N W R **/ aux[A][A] = pair[A][A] + 0.25*pair[A][N] +0.5*pair[A][W] + 0.5*pair[A][R] +0.25*pair[N][A] + (0.25*0.25)*pair[N][N] +(0.25*0.5)*pair[N][W] + (0.25*0.25)*pair[N][R] +0.5*pair[W][A] + (0.5*0.25)*pair[W][N] +(0.5*0.5)*pair[W][W] + (0.5*0.5)*pair[W][R] +0.5*pair[R][A] + (0.5*0.25)*pair[R][N] +(0.5*0.5)*pair[R][W] + (0.5*0.5)*pair[R][R]; /** A is A N W R C is C N S Y **/ aux[A][C] = pair[A][C] + 0.25*pair[A][N] +0.5*pair[A][S] + 0.5*pair[A][Y] +(0.25*0.5)*pair[N][C] + (0.25*0.25)*pair[N][N] +(0.25*0.5)*pair[N][S] + (0.25*0.5)*pair[N][Y] +0.5*pair[W][C] + (0.5*0.25)*pair[W][N] +(0.5*0.5)*pair[W][S] + (0.5*0.5)*pair[W][Y] +0.5*pair[R][C] + (0.5*0.25)*pair[R][N] +(0.5*0.5)*pair[R][S] + (0.5*0.5)*pair[R][Y]; /** A is A N W R G is G N K R **/ aux[A][G] = pair[A][G] + 0.25*pair[A][N] +0.5*pair[A][K] + 0.5*pair[A][R] +(0.25*0.5)*pair[N][G] + (0.25*0.25)*pair[N][N] +(0.25*0.5)*pair[N][K] + (0.25*0.5)*pair[N][R] +0.5*pair[W][G] + (0.5*0.25)*pair[W][N] +(0.5*0.5)*pair[W][K] + (0.5*0.5)*pair[W][R] +0.5*pair[R][G] + (0.5*0.25)*pair[R][N] +(0.5*0.5)*pair[R][K] + (0.5*0.5)*pair[R][R]; /** A is A N W R T is T N K W **/ aux[A][T] = pair[A][T] + 0.25*pair[A][N] +0.5*pair[A][K] + 0.5*pair[A][W] +(0.25*0.5)*pair[N][T] + (0.25*0.25)*pair[N][N] +(0.25*0.5)*pair[N][K] + (0.25*0.5)*pair[N][W] +0.5*pair[W][T] + (0.5*0.25)*pair[W][N] +(0.5*0.5)*pair[W][K] + (0.5*0.5)*pair[W][W] +0.5*pair[R][T] + (0.5*0.25)*pair[R][N] +(0.5*0.5)*pair[R][K] + (0.5*0.5)*pair[R][W]; /** C is C N S Y A is A N W R **/ aux[C][A] = pair[C][A] + 0.25*pair[C][N] +0.5*pair[C][W] + 0.5*pair[C][R] +0.25*pair[N][A] + (0.25*0.25)*pair[N][N] +(0.25*0.5)*pair[N][W] + (0.25*0.25)*pair[N][R] +0.5*pair[S][A] + (0.5*0.25)*pair[S][N] +(0.5*0.5)*pair[S][W] + (0.5*0.5)*pair[S][R] +0.5*pair[Y][A] + (0.5*0.25)*pair[Y][N] +(0.5*0.5)*pair[Y][W] + (0.5*0.5)*pair[Y][R]; /** C is C N S Y C is C N S Y **/ aux[C][C] = pair[C][C] + 0.25*pair[C][N] +0.5*pair[C][S] + 0.5*pair[C][Y] +(0.25*0.5)*pair[N][C] + (0.25*0.25)*pair[N][N] +(0.25*0.5)*pair[N][S] + (0.25*0.5)*pair[N][Y] +0.5*pair[S][C] + (0.5*0.25)*pair[S][N] +(0.5*0.5)*pair[S][S] + (0.5*0.5)*pair[S][Y] +0.5*pair[Y][C] + (0.5*0.25)*pair[Y][N] +(0.5*0.5)*pair[Y][S] + (0.5*0.5)*pair[Y][Y]; /** C is C N S Y G is G N K R **/ aux[C][G] = pair[C][G] + 0.25*pair[C][N] +0.5*pair[C][K] + 0.5*pair[C][R] +(0.25*0.5)*pair[N][G] + (0.25*0.25)*pair[N][N] +(0.25*0.5)*pair[N][K] + (0.25*0.5)*pair[N][R] +0.5*pair[S][G] + (0.5*0.25)*pair[S][N] +(0.5*0.5)*pair[S][K] + (0.5*0.5)*pair[S][R] +0.5*pair[Y][G] + (0.5*0.25)*pair[Y][N] +(0.5*0.5)*pair[Y][K] + (0.5*0.5)*pair[Y][R]; /** C is C N S Y T is T N K W **/ aux[C][T] = pair[C][T] + 0.25*pair[C][N] +0.5*pair[C][K] + 0.5*pair[C][W] +(0.25*0.5)*pair[N][T] + (0.25*0.25)*pair[N][N] +(0.25*0.5)*pair[N][K] + (0.25*0.5)*pair[N][W] +0.5*pair[S][T] + (0.5*0.25)*pair[S][N] +(0.5*0.5)*pair[S][K] + (0.5*0.5)*pair[S][W] +0.5*pair[Y][T] + (0.5*0.25)*pair[Y][N] +(0.5*0.5)*pair[Y][K] + (0.5*0.5)*pair[Y][W]; /** G is G N S R A is A N W R **/ aux[G][A] = pair[G][A] + 0.25*pair[G][N] +0.5*pair[G][W] + 0.5*pair[G][R] +0.25*pair[N][A] + (0.25*0.25)*pair[N][N] +(0.25*0.5)*pair[N][W] + (0.25*0.25)*pair[N][R] +0.5*pair[S][A] + (0.5*0.25)*pair[S][N] +(0.5*0.5)*pair[S][W] + (0.5*0.5)*pair[S][R] +0.5*pair[R][A] + (0.5*0.25)*pair[R][N] +(0.5*0.5)*pair[R][W] + (0.5*0.5)*pair[R][R]; /** G is G N S R C is C N S Y **/ aux[G][C] = pair[G][C] + 0.25*pair[G][N] +0.5*pair[G][S] + 0.5*pair[G][Y] +(0.25*0.5)*pair[N][C] + (0.25*0.25)*pair[N][N] +(0.25*0.5)*pair[N][S] + (0.25*0.5)*pair[N][Y] +0.5*pair[S][C] + (0.5*0.25)*pair[S][N] +(0.5*0.5)*pair[S][S] + (0.5*0.5)*pair[S][Y] +0.5*pair[R][C] + (0.5*0.25)*pair[R][N] +(0.5*0.5)*pair[R][S] + (0.5*0.5)*pair[R][Y]; /** G is G N S R G is G N K R **/ aux[G][G] = pair[G][G] + 0.25*pair[G][N] +0.5*pair[G][K] + 0.5*pair[G][R] +(0.25*0.5)*pair[N][G] + (0.25*0.25)*pair[N][N] +(0.25*0.5)*pair[N][K] + (0.25*0.5)*pair[N][R] +0.5*pair[S][G] + (0.5*0.25)*pair[S][N] +(0.5*0.5)*pair[S][K] + (0.5*0.5)*pair[S][R] +0.5*pair[R][G] + (0.5*0.25)*pair[R][N] +(0.5*0.5)*pair[R][K] + (0.5*0.5)*pair[R][R]; /** G is G N S R T is T N K W **/ aux[G][T] = pair[G][T] + 0.25*pair[G][N] +0.5*pair[G][K] + 0.5*pair[G][W] +(0.25*0.5)*pair[N][T] + (0.25*0.25)*pair[N][N] +(0.25*0.5)*pair[N][K] + (0.25*0.5)*pair[N][W] +0.5*pair[S][T] + (0.5*0.25)*pair[S][N] +(0.5*0.5)*pair[S][K] + (0.5*0.5)*pair[S][W] +0.5*pair[R][T] + (0.5*0.25)*pair[R][N] +(0.5*0.5)*pair[R][K] + (0.5*0.5)*pair[R][W]; /** T is T N K W A is A N W R **/ aux[T][A] = pair[T][A] + 0.25*pair[T][N] +0.5*pair[T][W] + 0.5*pair[T][R] +0.25*pair[N][A] + (0.25*0.25)*pair[N][N] +(0.25*0.5)*pair[N][W] + (0.25*0.25)*pair[N][R] +0.5*pair[K][A] + (0.5*0.25)*pair[K][N] +(0.5*0.5)*pair[K][W] + (0.5*0.5)*pair[K][R] +0.5*pair[W][A] + (0.5*0.25)*pair[W][N] +(0.5*0.5)*pair[W][W] + (0.5*0.5)*pair[W][R]; /** T is T N K W C is C N S Y **/ aux[T][C] = pair[T][C] + 0.25*pair[T][N] +0.5*pair[T][S] + 0.5*pair[T][Y] +(0.25*0.5)*pair[N][C] + (0.25*0.25)*pair[N][N] +(0.25*0.5)*pair[N][S] + (0.25*0.5)*pair[N][Y] +0.5*pair[K][C] + (0.5*0.25)*pair[K][N] +(0.5*0.5)*pair[K][S] + (0.5*0.5)*pair[K][Y] +0.5*pair[W][C] + (0.5*0.25)*pair[W][N] +(0.5*0.5)*pair[W][S] + (0.5*0.5)*pair[W][Y]; /** T is T N K W G is G N K R **/ aux[T][G] = pair[T][G] + 0.25*pair[T][N] +0.5*pair[T][K] + 0.5*pair[T][R] +(0.25*0.5)*pair[N][G] + (0.25*0.25)*pair[N][N] +(0.25*0.5)*pair[N][K] + (0.25*0.5)*pair[N][R] +0.5*pair[K][G] + (0.5*0.25)*pair[K][N] +(0.5*0.5)*pair[K][K] + (0.5*0.5)*pair[K][R] +0.5*pair[W][G] + (0.5*0.25)*pair[W][N] +(0.5*0.5)*pair[W][K] + (0.5*0.5)*pair[W][R]; /** T is T N K W T is T N K W **/ aux[T][T] = pair[T][T] + 0.25*pair[T][N] +0.5*pair[T][K] + 0.5*pair[T][W] +(0.25*0.5)*pair[N][T] + (0.25*0.25)*pair[N][N] +(0.25*0.5)*pair[N][K] + (0.25*0.5)*pair[N][W] +0.5*pair[K][T] + (0.5*0.25)*pair[K][N] +(0.5*0.5)*pair[K][K] + (0.5*0.5)*pair[K][W] +0.5*pair[W][T] + (0.5*0.25)*pair[W][N] +(0.5*0.5)*pair[W][K] + (0.5*0.5)*pair[W][W]; } /* end of computePair */ /*-------- main program ---------*/ main(int argc, char *argv[]) { FILE *in; /* function prototypes */ enum Nucl index (char ch); enum Nucl indexPlus (char ch, double *a, double *c, double *g, double *t); enum Nucl indexMinus (char ch, double *a, double *c, double *g, double *t); void computePair ( Paircount pair, PairACGTcount ); void printPair ( FILE *out, Paircount pair ); void printACGTPair ( FILE *out,PairACGTcount pairACGT ); /* These 2 printing routines for debugging */ /* variables */ int n=0,i,j; int num=0; /* num of nucleotides in file */ double a=0,c=0,g=0,t=0; /* a,c,g,t nucleotides. These are double, since there are contributions from uncertain data. */ char ch; char first,second; int index1,index2; double ent1, ent2; /*** entropy1 and entropy2 */ double fa,fc,fg,ft; /*** frequency of a,c,g,t */ Paircount pair; PairACGTcount pairACGT; /*------------------------------------- pair[][] accounts for ALL nucleotides (eg R,Y,etc) while pairACGT = computerPair( pair ) and computes the fractional contribution to A,C,G,T for the incomplete data, such as R,Y, etc. This is used for dinucleotide counts and entropy2. -------------------------------------*/ double freq[4][4]; /*** for dinucleotide frequencies */ int error(char *); /** error handling */ /**** Error handling for input file ****/ if (argc != 2) { fprintf(stderr,"%s in\n",argv[0]); exit(1); } /***** Initialization of I/O files and pair, pairACGT array **/ if ( (in = fopen(argv[1], "r")) == NULL) { fprintf(stderr, "%s: cannot open %s for reading\n", argv[0], argv[1]); exit(1); } for (i=0;i