mirror of
https://github.com/BioArchLinux/Packages.git
synced 2025-03-10 12:02:42 +00:00
203 lines
7.5 KiB
Diff
203 lines
7.5 KiB
Diff
--- a/sequence.c 2023-03-04 23:37:47.000573653 +0530
|
|
+++ b/sequence.c 2023-03-04 23:38:55.239315879 +0530
|
|
@@ -24,12 +24,12 @@
|
|
Read the sequence for training purposes. If we encounter multiple
|
|
sequences, we insert TTAATTAATTAA between each one to force stops in all
|
|
six frames. When we hit MAX_SEQ bp, we stop and return what we've got so
|
|
- far for training. This routine reads in FASTA, and has a very 'loose'
|
|
- Genbank and Embl parser, but, to be safe, FASTA should generally be
|
|
+ far for training. This routine reads in FASTA, and has a very 'loose'
|
|
+ Genbank and Embl parser, but, to be safe, FASTA should generally be
|
|
preferred.
|
|
*******************************************************************************/
|
|
|
|
-int read_seq_training(FILE *fp, unsigned char *seq, unsigned char *useq,
|
|
+int read_seq_training(fptr fp, unsigned char *seq, unsigned char *useq,
|
|
double *gc, int do_mask, mask *mlist, int *nm) {
|
|
char line[MAX_LINE+1];
|
|
int hdr = 0, fhdr = 0, bctr = 0, len = 0, wrn = 0;
|
|
@@ -37,7 +37,7 @@
|
|
unsigned int i, gapsize = 0;
|
|
|
|
line[MAX_LINE] = '\0';
|
|
- while(fgets(line, MAX_LINE, fp) != NULL) {
|
|
+ while(INPUT_GETS(line, MAX_LINE, fp) != NULL) {
|
|
if(hdr == 0 && line[strlen(line)-1] != '\n' && wrn == 0) {
|
|
wrn = 1;
|
|
fprintf(stderr, "\n\nWarning: saw non-sequence line longer than ");
|
|
@@ -58,7 +58,7 @@
|
|
else if(hdr == 1 && (line[0] == '/' && line[1] == '/')) hdr = 0;
|
|
else if(hdr == 1) {
|
|
if(strstr(line, "Expand") != NULL && strstr(line, "gap") != NULL) {
|
|
- sscanf(strstr(line, "gap")+4, "%u", &gapsize);
|
|
+ sscanf(strstr(line, "gap")+4, "%u", &gapsize);
|
|
if(gapsize < 1 || gapsize > MAX_LINE) {
|
|
fprintf(stderr, "Error: gap size in gbk file can't exceed line");
|
|
fprintf(stderr, " size.\n");
|
|
@@ -73,7 +73,7 @@
|
|
if(len - mask_beg >= MASK_SIZE) {
|
|
if(*nm == MAX_MASKS) {
|
|
fprintf(stderr, "Error: saw too many regions of 'N''s in the ");
|
|
- fprintf(stderr, "sequence.\n");
|
|
+ fprintf(stderr, "sequence.\n");
|
|
exit(52);
|
|
}
|
|
mlist[*nm].begin = mask_beg;
|
|
@@ -93,8 +93,8 @@
|
|
set(seq, bctr+1);
|
|
gc_cont++;
|
|
}
|
|
- else if(line[i] != 'a' && line[i] != 'A') {
|
|
- set(seq, bctr+1);
|
|
+ else if(line[i] != 'a' && line[i] != 'A') {
|
|
+ set(seq, bctr+1);
|
|
set(useq, len);
|
|
}
|
|
bctr+=2; len++;
|
|
@@ -119,7 +119,7 @@
|
|
|
|
/* This routine reads in the next sequence in a FASTA/GB/EMBL file */
|
|
|
|
-int next_seq_multi(FILE *fp, unsigned char *seq, unsigned char *useq,
|
|
+int next_seq_multi(fptr fp, unsigned char *seq, unsigned char *useq,
|
|
int *sctr, double *gc, int do_mask, mask *mlist, int *nm,
|
|
char *cur_hdr, char *new_hdr) {
|
|
char line[MAX_LINE+1];
|
|
@@ -131,7 +131,7 @@
|
|
|
|
if(*sctr > 0) reading_seq = 1;
|
|
line[MAX_LINE] = '\0';
|
|
- while(fgets(line, MAX_LINE, fp) != NULL) {
|
|
+ while(INPUT_GETS(line, MAX_LINE, fp) != NULL) {
|
|
if(reading_seq == 0 && line[strlen(line)-1] != '\n' && wrn == 0) {
|
|
wrn = 1;
|
|
fprintf(stderr, "\n\nWarning: saw non-sequence line longer than ");
|
|
@@ -169,7 +169,7 @@
|
|
}
|
|
else if(reading_seq == 1) {
|
|
if(strstr(line, "Expand") != NULL && strstr(line, "gap") != NULL) {
|
|
- sscanf(strstr(line, "gap")+4, "%u", &gapsize);
|
|
+ sscanf(strstr(line, "gap")+4, "%u", &gapsize);
|
|
if(gapsize < 1 || gapsize > MAX_LINE) {
|
|
fprintf(stderr, "Error: gap size in gbk file can't exceed line");
|
|
fprintf(stderr, " size.\n");
|
|
@@ -184,7 +184,7 @@
|
|
if(len - mask_beg >= MASK_SIZE) {
|
|
if(*nm == MAX_MASKS) {
|
|
fprintf(stderr, "Error: saw too many regions of 'N''s in the ");
|
|
- fprintf(stderr, "sequence.\n");
|
|
+ fprintf(stderr, "sequence.\n");
|
|
exit(55);
|
|
}
|
|
mlist[*nm].begin = mask_beg;
|
|
@@ -204,8 +204,8 @@
|
|
set(seq, bctr+1);
|
|
gc_cont++;
|
|
}
|
|
- else if(line[i] != 'a' && line[i] != 'A') {
|
|
- set(seq, bctr+1);
|
|
+ else if(line[i] != 'a' && line[i] != 'A') {
|
|
+ set(seq, bctr+1);
|
|
set(useq, len);
|
|
}
|
|
bctr+=2; len++;
|
|
@@ -240,14 +240,14 @@
|
|
|
|
/* Takes rseq and fills it up with the rev complement of seq */
|
|
|
|
-void rcom_seq(unsigned char *seq, unsigned char *rseq, unsigned char *useq,
|
|
+void rcom_seq(unsigned char *seq, unsigned char *rseq, unsigned char *useq,
|
|
int len) {
|
|
int i, slen=len*2;
|
|
for(i = 0; i < slen; i++)
|
|
if(test(seq, i) == 0) set(rseq, slen-i-1+(i%2==0?-1:1));
|
|
for(i = 0; i < len; i++) {
|
|
if(test(useq, i) == 1) {
|
|
- toggle(rseq, slen-1-i*2);
|
|
+ toggle(rseq, slen-1-i*2);
|
|
toggle(rseq, slen-2-i*2);
|
|
}
|
|
}
|
|
@@ -564,7 +564,7 @@
|
|
}
|
|
|
|
/*******************************************************************************
|
|
- Creates a GC frame plot for a given sequence. This is simply a string with
|
|
+ Creates a GC frame plot for a given sequence. This is simply a string with
|
|
the highest GC content frame for a window centered on position for every
|
|
position in the sequence.
|
|
*******************************************************************************/
|
|
@@ -654,7 +654,7 @@
|
|
}
|
|
|
|
/*******************************************************************************
|
|
- Finds the highest-scoring region similar to AGGAGG in a given stretch of
|
|
+ Finds the highest-scoring region similar to AGGAGG in a given stretch of
|
|
sequence upstream of a start.
|
|
*******************************************************************************/
|
|
|
|
@@ -663,14 +663,14 @@
|
|
double match[6], cur_ctr, dis_flag;
|
|
|
|
limit = imin(6, start-4-pos);
|
|
- for(i = limit; i < 6; i++) match[i] = -10.0;
|
|
+ for(i = 0; i < 6; i++) match[i] = -10.0;
|
|
|
|
/* Compare the 6-base region to AGGAGG */
|
|
for(i = 0; i < limit; i++) {
|
|
- if(pos+i < 0) continue;
|
|
- if(i%3 == 0 && is_a(seq, pos+i) == 1) match[i] = 2.0;
|
|
- else if(i%3 != 0 && is_g(seq, pos+i) == 1) match[i] = 3.0;
|
|
- else match[i] = -10.0;
|
|
+ if(pos + i >= 0) {
|
|
+ if(i%3 == 0 && is_a(seq, pos+i) == 1) match[i] = 2.0;
|
|
+ else if(i%3 != 0 && is_g(seq, pos+i) == 1) match[i] = 3.0;
|
|
+ }
|
|
}
|
|
|
|
/* Find the maximally scoring motif */
|
|
@@ -730,7 +730,7 @@
|
|
}
|
|
|
|
/*******************************************************************************
|
|
- Finds the highest-scoring region similar to AGGAGG in a given stretch of
|
|
+ Finds the highest-scoring region similar to AGGAGG in a given stretch of
|
|
sequence upstream of a start. Only considers 5/6-mers with 1 mismatch.
|
|
*******************************************************************************/
|
|
|
|
@@ -739,18 +739,19 @@
|
|
double match[6], cur_ctr, dis_flag;
|
|
|
|
limit = imin(6, start-4-pos);
|
|
- for(i = limit; i < 6; i++) match[i] = -10.0;
|
|
+ for(i = 0; i < 6; i++) match[i] = -10.0;
|
|
|
|
/* Compare the 6-base region to AGGAGG */
|
|
for(i = 0; i < limit; i++) {
|
|
- if(pos+i < 0) continue;
|
|
- if(i % 3 == 0) {
|
|
- if(is_a(seq, pos+i) == 1) match[i] = 2.0;
|
|
- else match[i] = -3.0;
|
|
- }
|
|
- else {
|
|
- if(is_g(seq, pos+i) == 1) match[i] = 3.0;
|
|
- else match[i] = -2.0;
|
|
+ if(pos+i >= 0) {
|
|
+ if(i % 3 == 0) {
|
|
+ if(is_a(seq, pos+i) == 1) match[i] = 2.0;
|
|
+ else match[i] = -3.0;
|
|
+ }
|
|
+ else {
|
|
+ if(is_g(seq, pos+i) == 1) match[i] = 3.0;
|
|
+ else match[i] = -2.0;
|
|
+ }
|
|
}
|
|
}
|
|
|
|
@@ -801,4 +802,4 @@
|
|
int imin(int x, int y) {
|
|
if(x < y) return x;
|
|
return y;
|
|
-}
|
|
+}
|
|
\ No newline at end of file
|