?? pfilt.c
字號:
/* pfilt - remove various non-globular/biased regions from a FASTA file *//* V1.2 *//* Author: David T. Jones, Bioinformatics Unit, University College London, March 2002 *//* This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details *//* Currently removes transmembrane segments, coiled-coil and low-complexity regions. It also performs constrained filtering of biased-composition regions */#include <stdio.h>#include <stdlib.h>#include <ctype.h>#include <math.h>#include <string.h>#define MAXSEQLEN 65536/* CWIN = complexity window */#define CWIN 12/* FWIN = frequency window - N.B. aamean/aasd values assume FWIN=100! */#define FWIN 100#define FALSE 0#define TRUE 1int tmfilt = TRUE, coilfilt = TRUE, biasfilt = FALSE, complexfilt = TRUE;const char *rescodes = "ARNDCQEGHILKMFPSTWYVBZX";/* Scaled log-likelihood ratios for coiled-coil heptat repeat */const float ccoilmat[23][7] ={ {249, 310, 74, 797, -713, 235, -102}, {13, 389, 571, -2171, 511, 696, 611}, {207, 520, 768, -1624, 502, 887, 725}, {-2688, 743, 498, -1703, -409, 458, 337}, {-85, -6214, -954, -820, -1980, -839, -2538}, {-1167, 828, 845, -209, 953, 767, 949}, {-1269, 1209, 1097, -236, 1582, 1006, 1338}, {-2476, -1537, -839, -2198, -1877, -1002, -2079}, {-527, -436, -537, -171, -1180, -492, -926}, {878, -1343, -1064, -71, -911, -820, -1241}, {1097, -1313, -1002, 1348, -673, -665, -576}, {209, 785, 597, -492, 739, 522, 706}, {770, -502, -816, 365, -499, -783, -562}, {-713, -2590, -939, -447, -2079, -2513, -3270}, {-5521, -2225, -4017, -5115, -4605, -5521, -4961}, {-1102, -283, -72, -858, -309, -221, -657}, {-1624, -610, -435, -385, -99, -441, -213}, {-2718, -2748, -2733, -291, -5115, -2162, -4268}, {276, -2748, -2513, 422, -1589, -2137, -2343}, {421, -736, -1049, -119, -1251, -1049, -1016}, {-431, 638, 642, -1663, 147, 695, 549}, {-1217, 1036, 979, -223, 1316, 894, 1162}, {0, 0, 0, 0, 0, 0, 0}};/* Sample Means for proteins < 100 aa long */const float aamean[20] ={ 7.38, 5.76, 4.32, 4.69, 2.84, 3.73, 6.08, 6.19, 2.10, 6.34, 9.33, 7.20, 2.67, 4.11, 4.35, 6.67, 5.13, 1.25, 3.21, 6.63};/* Sample Standard Deviations for proteins < 100 aa long */const float aasd[20] ={ 4.00, 3.89, 2.39, 2.60, 3.59, 2.49, 3.33, 3.35, 1.95, 3.50, 4.11, 4.19, 1.69, 2.60, 2.85, 3.23, 2.49, 1.30, 2.12, 2.91};/* Sample Means for 100-residue windows */const float aamean100[20] ={ 7.44,5.08,4.69,5.36,1.54,3.93,6.24,6.34,2.24,6.09, 9.72, 6.00,2.39,4.30,4.69,7.23,5.61,1.25,3.31,6.53};/* Sample Standard Deviations for 100-residue windows */const float aasd100[20] ={ 4.02,3.05,2.87,2.71,1.88,2.63,3.46,3.45,1.79,3.19, 3.77,3.64,1.71,2.62,3.00,3.63,2.83,1.32,2.18,2.92};/* MEMSAT's TM-Helix-Middle Propensities */const int transmemsc[23] ={ 459, -2588, -1583, -2519, -1339, -1396, -2419, -30, -1859, 1146, 730, -3645, 354, 468, -181, -523, -367, 1069, -83, 783, -2051, -1908, 0};/* Dump a rude message to standard error and exit */void fail(char *errstr){ fprintf(stderr, "\n*** %s\n\n", errstr); exit(-1);}/* Convert AA letter to numeric code (0-22) */int aanum(int ch){ static int aacvs[] = { 999, 0, 20, 4, 3, 6, 13, 7, 8, 9, 22, 11, 10, 12, 2, 22, 14, 5, 1, 15, 16, 22, 19, 17, 22, 18, 21 }; return (isalpha(ch) ? aacvs[ch & 31] : 22);}/* Actually do the filtering on a a sequence */void filtseq(char *desc, char *seq, int seqlen){ int i, j, k, n, l, tot, aafreq[23]; char cmask[MAXSEQLEN]; for (i = 0; i < seqlen; i++) cmask[i] = FALSE; /* Filter transmembrane segments */ if (tmfilt) for (i = 0; i <= seqlen - 20; i++) { for (tot = l = 0; l < 20; l++) tot += transmemsc[seq[i + l]]; if (tot >= 7500) for (l = 0; l < 20; l++) cmask[i + l] = TRUE; } /* Filter coiled-coils */ if (coilfilt) for (i = 0; i <= seqlen - 21; i++) { for (tot = 0, l = 0; l < 21; l++) tot += ccoilmat[seq[i + l]][l % 7]; if (tot > 10000) { for (l = 0; l < 21; l++) cmask[i + l] = TRUE; } } /* Filter low-complexity regions */ if (complexfilt) for (i = 0; i <= seqlen - CWIN; i++) { for (j = 0; j < 22; j++) aafreq[j] = 0; for (j = 0; j < CWIN; j++) aafreq[seq[i + j]]++; for (tot = n = j = 0; j < 22; j++) if (aafreq[j]) { tot += aafreq[j]; n++; } if (n) tot /= n; else tot = 0; if (tot > 3) { for (j = 0; j < CWIN; j++) cmask[i + j] = TRUE; } } /* Filter biased 100-residue regions */ if (biasfilt) for (i = 0; i <= seqlen - FWIN; i++) { for (j = 0; j < 22; j++) aafreq[j] = 0; for (j = 0; j < FWIN; j++) aafreq[seq[i + j]]++; for (j = 0; j < 20; j++) if (100.0 * aafreq[j] / FWIN > aamean100[j] + 5.0 * aasd100[j]) for (k = 0; k < FWIN; k++) if (seq[i + k] == j) cmask[i + k] = TRUE; } /* Filter frequently occurring amino acids in proteins < 100 aa long */ if (biasfilt && seqlen < FWIN) { for (j = 0; j < 22; j++) aafreq[j] = 0; for (i = 0; i < seqlen; i++) aafreq[seq[i]]++; for (j = 0; j < 20; j++) if (100.0 * aafreq[j] / seqlen > aamean[j] + 4.0 * aasd[j]) for (i = 0; i < seqlen; i++) if (seq[i] == j) cmask[i] = TRUE; } /* Now mask out the combined regions in the sequence */ for (i = 0; i < seqlen; i++) if (cmask[i]) seq[i] = 22; /* Output the masked sequence */ desc[strlen(desc) - 1] = '\0'; printf(">%s\n", desc); for (i = 0; i < seqlen; i++) { putchar(rescodes[seq[i]]); if (i && i != seqlen-1 && i%70 == 69) putchar('\n'); } putchar('\n');}int main(int argc, char **argv){ int i, j, seqlen = 0; char desc[65536], seq[MAXSEQLEN], buf[65536], *p; FILE *ifp; if (argc < 2) fail("Usage: pfilt [-t] [-c] [-b] [-x] fasta-file"); for (*argv++, argc--; argc && **argv == '-'; argv++, argc--) switch (*(*argv + 1)) { case 't': tmfilt = FALSE; break; case 'c': coilfilt = FALSE; break; case 'b': biasfilt = TRUE; break; case 'x': complexfilt = FALSE; break; default: fail("Usage: pfilt [-t] [-c] [-b] [-x] fasta-file"); } if (argc < 1) fail("Usage: pfilt [-t] [-c] [-b] [-x] fasta-file"); ifp = fopen(argv[0], "r"); if (!ifp) fail("Unable to open sequence file!"); while (!feof(ifp)) { if (!fgets(buf, 65536, ifp)) break; if (buf[0] == '>') { if (seqlen) filtseq(desc, seq, seqlen); seqlen = 0; strcpy(desc, buf + 1); } else { p = buf - 1; while (*++p) if (isalpha(*p)) seq[seqlen++] = aanum(*p); } } fclose(ifp); if (seqlen) filtseq(desc, seq, seqlen); return 0;}
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -