/*------------------------------------------------------------- Copyright (C) 2000 Peter Clote. All Rights Reserved. Permission to use, copy, modify, and distribute this software and its documentation for NON-COMMERCIAL purposes and without fee is hereby granted provided that this copyright notice appears in all copies. THE AUTHOR AND PUBLISHER MAKE NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. THE AUTHORS AND PUBLISHER SHALL NOT BE LIABLE FOR ANY DAMAGES SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES. -------------------------------------------------------------*/ /************************************************* Program: gotoh.c Peter Clote, 24 April 1998 Program for sequence alignment, assuming that sequences s,t both begin and end with the same character, which can artificially be ensured by giving dummy start and stop characters (such as #, but not $ or * because the shell interprets these). This is the final, corrected version, which rectifies the initialization errors in the presentation of Nei and Waterman by using P(0,j) = Q(i,0) = INF. The traceback path is determined by choosing that direction, for which D(i-1,j-1), D(i-1,j) and D(i,j-1) is minimum, i.e. for which _ | | D(i-1,j-1) + GAMMA(i,j) (diagonal) D(i,j) = MIN of | P(i,j) (up) | Q(i,j) (left) |_ is a minimum. *************************************************/ #include #include #include #define SPACE ' ' #define MIN2(x,y) ( (x) <= (y) ? (x) : (y) ) #define MAX2(x,y) ( (x) <= (y) ? (y) : (x) ) #define MIN(x,y,z) MIN2(MIN2(x,y),z) #define MISMATCH 3 /* cost of mismatched letters */ #define D(i,j) d[(i)*(m+1) + (j)] #define P(i,j) p[(i)*(m+1) + (j)] #define Q(i,j) q[(i)*(m+1) + (j)] #define INF 100 #define U 2 #define V 2 #define W(i) U*(i)+V /* cost of gap of length i, i>0 */ #define S(i) s[(i)-1] #define T(i) t[(i)-1] /* D is integer matrix with n+1 rows and m+1 cols, where n = length of s, m = length of t */ #define GAMMA(i,j) ( S(i) == T(j) ? 0 : MISMATCH ) /*---------------------------------------------------------- Triplet is a triple (d,p,q) of pointers to the arrays D,P,Q used in Gotoh's algorithm. -----------------------------------------------------------*/ struct pointer_triplet { int *first; int *second; int *third; }; typedef struct pointer_triplet Triple; void error ( char *s ); /* prototype of error function */ void error ( char *s ) { fprintf(stderr,"%s\n",s); exit(1); } Triple dist ( char *s, char *t ) { int i,j,k,n = strlen(s), m = strlen(t); Triple val; int *d, *p, *q; if ( s[0] != t[0] || s[n-1] != t[m-1] ) error("Strings don't have same beginning or ending character."); /******* allocate arrays *******************/ d = (int *) calloc((n+1)*(m+1), sizeof(int)); p = (int *) calloc((n+1)*(m+1), sizeof(int)); q = (int *) calloc((n+1)*(m+1), sizeof(int)); /******** initialize border values of matrices */ D(0,0) = 0; P(0,0) = Q(0,0) = 0; for (i=1;i<=n;i++) D(i,0) = W(i); for (j=1;j<=m;j++) D(0,j) = W(j); for (j=1;j<=m;j++) P(0,j) = INF; for (i=1;i<=n;i++) Q(i,0) = INF; /******** compute interior values using recurrence relation. This loop is really of the form for k = 2 to n+m for i,j satisfying i+j = k ..... *************************************************************/ for (k=2;k<=n+m;k++) for (i=1;i<=MIN2(n,k-1);i++) { j = k-i; if ( j > m ) continue; /* index out of bounds */ P(i,j) = MIN2( D(i-1,j) + W(1), P(i-1,j) + U ); Q(i,j) = MIN2( D(i,j-1) + W(1), Q(i,j-1) + U ); D(i,j) = MIN( D(i-1,j-1) + GAMMA(i,j), P(i,j), Q(i,j) ); } val.first = d; val.second = p; val.third = q; return(val); } char *backtrack ( Triple val, char *s, char *t, int n, int m ) { /* compute the directions, starting from the LAST character of sequence s,t in order to determine the best alignment */ int i=n,j=m; char c, *r, *dir; int *d, *p, *q; d = val.first; p = val.second; q = val.third; r = (char *) calloc(n+m+1,sizeof(char)); dir = r+n+m; /* dir (direction) points to last cell of r */ *dir = '\0'; /* last cell in r is null character */ while ( i != 1 && j != 1 ) { if ( D(i,j) == P(i,j) ) { i--; *(--dir) = 'u'; /* up */ } else if ( D(i,j) == D(i-1,j-1) + GAMMA(i,j) ) { i--; j--; *(--dir) = 'd'; /* diagonal */ } else if ( D(i,j) == Q(i,j) ) { j--; *(--dir) = 'l'; /* left */ } } while ( i != 1 ) { i--; *(--dir) = 'u'; /* up */ } while ( j != 1 ) { j--; *(--dir) = 'l'; /* left */ } return(dir); } void display ( Triple val, char *s, char *t ) { /* display the distance matrix d(i,j) between strings s,t where 0<=i<=n = strlen(s) and 0<=j<=m = strlen(t) */ int i,j, n = strlen(s), m = strlen(t); int *d, *p, *q; d = val.first; p = val.second; q = val.third; printf("Distance Matrix for alignment of %s with %s.\n\n",s,t); /****************** print letters of string t ***********/ printf("%5c",SPACE); printf("%5c",SPACE); for (j=1;j<=m;j++) printf("%5c",T(j)); printf("\n"); /********************* print D(0,0),...,D(0,m) ************/ printf("D matrix\n\n"); printf("%5c",SPACE); for (j=0;j<=m;j++) printf("%5d",D(0,j)); printf("\n"); for (i=1;i<=n;i++) { printf("%5c",S(i)); for (j=0;j<=m;j++) printf("%5d",D(i,j)); printf("\n"); } /********************* print P(0,0),...,P(0,m) ************/ printf("P matrix\n\n"); printf("%5c",SPACE); for (j=0;j<=m;j++) printf("%5d",P(0,j)); printf("\n"); for (i=1;i<=n;i++) { printf("%5c",S(i)); for (j=0;j<=m;j++) printf("%5d",P(i,j)); printf("\n\n\n"); } /********************* print Q(0,0),...,Q(0,m) ************/ printf("Q matrix\n\n"); printf("%5c",SPACE); for (j=0;j<=m;j++) printf("%5d",Q(0,j)); printf("\n"); for (i=1;i<=n;i++) { printf("%5c",S(i)); for (j=0;j<=m;j++) printf("%5d",Q(i,j)); printf("\n"); } } void alignment ( char *direction, char *s, char *t ) { /** Computes strings s0,t0 of length MAX(len(s),len(t)), where s0 [resp t0] is s [resp t] possibly with some interspersed '-' marks **/ int n = strlen(s), m = strlen(t), max = n+m; int i,j,k; char *s0, *t0; s0 = (char *) calloc(max,sizeof(char)); t0 = (char *) calloc(max,sizeof(char)); for (k=0;k