/*------------------------------------------------------------- Copyright (C) 2000 Peter Clote, Sebastian Will. All Rights Reserved. written by Peter Clote modified by Sebastian Will Permission to use, copy, modify, and distribute this software and its documentation for NON-COMMERCIAL purposes and without fee is hereby granted provided that this copyright notice appears in all copies. THE AUTHOR AND PUBLISHER MAKE NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. THE AUTHORS AND PUBLISHER SHALL NOT BE LIABLE FOR ANY DAMAGES SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES. -------------------------------------------------------------*/ // ------------------------------------------------------------ // // Pointwise Mutation Simulation with Historical Alignments // // ------------------------------------------------------------ // // Apply pointwise mutations (either interactively or at random) to a // string, remember information for historically correct alignments // and print these historical alignments after each mutation // // Works by storing the mutated sequence in n many bins, where n is the length // of the original sequence + 1, such that the relation of original // to mutated sequence is remembered. // // NOTE: there are many different aproaches to solve the problem, of // remembering the historically correct alignment. Other approaches // may e.g. manipulate the alignment strings directly or remember a // mapping of the mutated sequence to the original one. Here, a // rather intuitive approach is chosen that may save some time for the // pointwise mutations over direct manipulation of the alignment. The // algorithm pays for this with a slightly more complex generation of // the usual alignment representation. // // USAGE: ptwiseMut [], // if parameter steps is omitted, we run interactively // else the parameter gives the number of steps, // and in each step a pointwise mutation may be selected at random, // where the probabilities are hard coded. // steps=-1 causes an user terminated, "infinite" run // // COMPILER: g++ -o ptwiseMut ptwiseMut.cc // // // -------------------- includes // #include // import tolower() #include // import MAXINT #include #include // use STL vectors #include // -------------------- constants // const int BUF_SIZ=256; const char GAP = '-'; // gap symbol // probabilities const double pDel = 0.0001; // deletion const double pIns = 0.001; // insertion const double pSub = 0.001; // substitution const double pTransition = (pSub*0.8); // transition const double pTransversion = (pSub*0.2); // transversion // -------------------- Global declarations // typedef vector bin_t; // a bin is a list of bases vector bins; vector cum_len; // invariant: cum_len[i] = \sum_{j=1}^n bins[i].size() char *origSequence; // original sequence int origLength; // length of original sequence int length; // length of mutated sequence int steps; // -------------------- function prototypes // // ---------- pointwise mutations void del(int); void insert(int,char); void subst(int,char); // ---------- output void printAlignment(); void printContents(); // for debugging, resp. demonstration // ---------- selection and application of mutations bool read_and_apply_mutation(int i); // for interactive mode bool select_and_apply_mutation(int i); // for non interactive mode // the last two functions return, whether a next mutation shall be applied main(int argc, char *argv[]) { // -------------------- read arguments // if (argc!=2 && argc!=3) { printf("usage: %s [] \n",argv[0]); exit(-1); } origSequence = argv[1]; bool interactive; // run interactive ? if (argc==3) { steps=atoi(argv[2]); interactive=false; // run infintely many steps, if argument non positive if (steps <= 0) steps = MAXINT; // initialize random number generator with time // to get different results for each run srand(time(0)); } else interactive=true; // -------------------- initialize // length = origLength = strlen(origSequence); // reserve and initialize space for origLength+1 entries in bins and cum_len bins.insert(bins.begin(),origLength+1,bin_t(0)); cum_len.insert(cum_len.begin(),origLength+1,0); int i; for (i=1;i<=length;i++) { bins[i].push_back(argv[1][i-1]); cum_len[i]=i; } // print short instructions if (interactive) { printf("Input commands of the form [ i | s | d | q ]\n"); printf("for (i)nsertion, (s)ubstitution, (d)eletion or (q)uit.\n\n"); } // -------------------- main loop for (int i=0; ; i++) { // print alignment and other info // if (i>0) printf("%d.\n",i); printContents(); printf("alignment:\n"); printAlignment(); printf("\n"); // ---------- choose and apply mutation bool next; if (interactive) next=read_and_apply_mutation(i); else { next=select_and_apply_mutation(i); } // do we have to quit? if (!next) { puts("quit"); break; } } // print the last alignment once again // puts("--------------------"); printAlignment(); // ---------- exit program exit(0); } // return a random real number in the range [0,1] // double RREAL() { return ((double)rand()/(double)RAND_MAX); } // select a mutation at random and apply this mutation // modified from a program of P.Clote // bool select_and_apply_mutation(int j) { if (j>=steps) return false; int k; double z; char ch; // deletion if (RREAL() < pDel){ if (length == 0) return true; k = rand()%length; del(k); } // insertion if (RREAL() < pIns){ k = rand()%(length+1); z = RREAL(); if (z<0.25) insert(k,'A'); else if (z<0.5) insert(k,'C'); else if (z<0.75) insert(k,'G'); else insert(k,'T'); // used RREAL() instead of rand() and switch // since statistical properties better } // substitution if (RREAL() < pSub){ if (length==0) return true; // a substitution may either be a transition or a transversion if (RREAL() < pTransition/pSub){ k = rand()%length; int i=0; while (k>=cum_len[i]) i++; ch = bins[i][k-cum_len[i]+bins[i].size()]; switch (ch) { case 'A' : subst(k,'G'); break; case 'C' : subst(k,'T'); break; case 'G' : subst(k,'A'); break; case 'T' : subst(k,'G'); break; default : printf("%c not nucleotide\n",ch); exit(1); } } // end of treatment of transition else { k = rand()%length; int i=0; while (k>=cum_len[i]) i++; ch = bins[i][k-cum_len[i]+bins[i].size()]; switch (ch) { case 'A' : if (RREAL() < 0.5) subst(k,'C'); else subst(k,'T'); break; case 'C' : if (RREAL() < 0.5) subst(k,'A'); else subst(k,'G'); break; case 'G' : if (RREAL() < 0.5) subst(k,'C'); else subst(k,'T'); break; case 'T' : if (RREAL() < 0.5) subst(k,'A'); else subst(k,'G'); break; default : printf("%c not nucleotide\n",ch); exit(1); } } // end of treatment of transversion } return true; } // interactively read a mutation from user and apply this mutation // bool read_and_apply_mutation(int i) { // Not every wrong input is recognized, thus we are not really fault // tolerant. Wrong input will probably cause the program to abort. int pos; char ch; char op[BUF_SIZ]; printf("> "); if (scanf("%s",op) < 1) // read operator, operator is a string return false; // check if no operator could be read switch(tolower(op[0])) { // switch over first character of op case 'q': return false; case 'i': scanf("%d %c",&pos, &ch); insert(pos,ch); break; case 'd': scanf("%d",&pos); del(pos); break; case 's': scanf("%d %c",&pos,&ch); subst(pos,ch); break; default: printf("unknown operation: %s\n",op); } return true; } void printAlignment(){ /*----------------------------------------- Original input string s0,s1,...,s{n-1} The a[i] are the bins; i.e. a[i][0],...,a[i][b[i]-1]] s0 s1 s2 ... s{n-1} a0 a1 a2 a3 an The bin a[i] consists of s{i-1} together with insertions thereafter. -------------------------------------------*/ // ------------------------------------------------------------ // print part of alignment for original sequence for (unsigned int i=0 ; i < bins[0].size() ; i++) putchar(GAP); // print spaces for bin before s0 printf("%c", origSequence[0]); for (int i=1;i<=origLength;i++){ for (unsigned int j=1 ; j < bins[i].size() ; j++) putchar(GAP); printf("%c", origSequence[i]); } printf("\n"); // ------------------------------------------------------------ // print part of alignment for mutated sequence for ( bin_t::iterator j=bins[0].begin() ; j != bins[0].end() ; ++j ) printf("%c",*j); for ( int i=1; i<=origLength ; i++) { if (bins[i].size() == 0) printf("-"); else for ( bin_t::iterator j=bins[i].begin() ; j != bins[i].end() ; ++j ) printf("%c",*j); } printf("\n"); } void printContents() { printf("cum_len:\t"); for (int i=0;i<=origLength;i++) printf("%3d",cum_len[i]); printf("\n"); // show the bins -- for reasons of illustration of the algorithm printf("\nbins:\n"); unsigned int maximum=0; for (vector::iterator j=bins.begin() ; j!=bins.end(); ++j) { maximum = max(maximum, j->size()); printf("- "); } printf("\n"); for (unsigned int i=0; i::iterator j=bins.begin() ; j!=bins.end(); ++j) if (j->size() > (unsigned) i) printf("%c ",(*j)[i]); else printf(" "); printf("\n"); } printf("\n"); } void del(int k){ assert(k >= 0); assert(k < length); // find bin that contains k, i.e. the leftmost bin i, where // cum_len[i] > k int i; for(i=0; cum_len[i] <= k ; i++); // compute the position in bins[i] of the kth entry in all bins int pos = bins[i].size() - (cum_len[i] - k); // remove the entry at position pos from bins[i] bins[i].erase( bins[i].begin()+pos ); // update length and the array cum_len length--; for (int j=i; j<=origLength; j++) cum_len[j]--; } void insert(int k, char ch){ /*-------------------------- Current string r0 ... r{length-1}, and 0 <= k <= length. Insert before the k-th symbol of r if k < length, and insert after r{length-1} if k=length. ---------------------------*/ assert(k>=0); assert(k<=length); int i=0; for(i=0; cum_len[i] < k ; i++); // find least i with k <= cum_len[i]; such must exist // compute position, where we have to insert int pos = bins[i].size() - (cum_len[i] - k); bins[i].insert( bins[i].begin()+pos, ch ); // update length and the array cum_len length++; for (int j=i;j<=origLength;j++) cum_len[j]++; } void subst(int k, char ch){ assert(k>=0); assert(k= cum_len[i] ; i++); // find least i with k < cum_len[i] // calculate position int pos = bins[i].size() - (cum_len[i] - k); bins[i][pos] = ch; }