/*------------------------------------------------------------- Copyright (C) 2000 Peter Clote, Sebastian Will. All Rights Reserved. written by Peter Clote modified by Sebastian Will Permission to use, copy, modify, and distribute this software and its documentation for NON-COMMERCIAL purposes and without fee is hereby granted provided that this copyright notice appears in all copies. THE AUTHOR AND PUBLISHER MAKE NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. THE AUTHORS AND PUBLISHER SHALL NOT BE LIABLE FOR ANY DAMAGES SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES. -------------------------------------------------------------*/ /*--------------------------------------------------- jensenShannon.c P. Clote 19 Dec 1999 This program opens a file, computes length N of file and prints out the Jensen Shannon divergence when the file is segmented between 1,..., i*step and i*step+1,...,N. High divergence indicates that the left and right segments are more homogeneous with respect to themselves than with respect to the whole. This could be combined with an entropy plot, dinucleotide entropy plot, or pair potential plot. ----------------------------------------------------*/ #include #include #include #include #define STEP 100 // print JensenShannon divergence for multiplicities of STEP #define LOG2(x) (log(x)/log(2)) /*------- prototypes --------*/ char nucleotide(char ch); int isPurine(char ch); int isPyrimidine(char ch); double computeEntropy(int pur, int pyr, int size); int main(int argc, char *argv[]){ FILE *in; time_t time_seed = time(0); int purine,pyrimidine; int purineU,pyrimidineU,purineV,pyrimidineV; int i,j,num,n,nU,nV; // entire genome W of length n is segmented into U,V int ch; double HW,HU,HV,divergence; // entropy values /********* ERROR CHECKING in command line arguments *********/ if (argc != 2) { fprintf(stderr,"%s filename\n",argv[0]); exit(1); } if ((in = fopen(argv[1],"r")) == NULL) { fprintf(stderr,"%s not a readable file.\n", argv[1]); exit(1); } srand(time_seed); /*-------------------------------------------- Read file twice through, and each time make same random assignments to A,C,G,T for incomplete data. --------------------------------------------*/ n=nU=nV=purine=pyrimidine=purineU=pyrimidineU=purineV=pyrimidineV=0; // initialize counts to 0 while ((ch = fgetc(in)) != EOF) { char nucl=nucleotide(ch); if ( isPurine(nucl) ) purine++; else if (isPyrimidine(nucl)) pyrimidine++; if (nucl) n++; } rewind(in); srand(time_seed); /*-------------------------------------------- Reset to same random seed to make same random assignments to A,C,G,T for incomplete data. --------------------------------------------*/ HW = computeEntropy(purine,pyrimidine,n); printf("purine: %d pyrimidine: %d genome length: %d\n", purine,pyrimidine,n); printf("Entropy H(W): %lf\n",HW); // Compute JensenShannon divergence at segments // U = 1,...,i*STEP and V = i*STEP+1,....,n purineV=purine; pyrimidineV=pyrimidine; nV=n; num = (n-1)/STEP; for (i=0;i