From usenet.ucs.indiana.edu!vixen.cso.uiuc.edu!howland.reston.ans.net!news.intercon.com!panix!ddsw1!news.kei.com!bloom-beacon.mit.edu!genome.wi.mit.edu!mpreeve Thu Mar 10 20:31:14 EST 1994
Article: 4779 of bionet.software
Path: usenet.ucs.indiana.edu!vixen.cso.uiuc.edu!howland.reston.ans.net!news.intercon.com!panix!ddsw1!news.kei.com!bloom-beacon.mit.edu!genome.wi.mit.edu!mpreeve
From: [email protected] (Mary Pat Reeve)
Newsgroups: bionet.software
Subject: Microsatellite (repeat) searching
Date: 10 Mar 1994 19:06:23 GMT
Organization: Whitehead Institute for Biomedical Research, MIT
Lines: 372
Message-ID: <[email protected]>
NNTP-Posting-Host: gummo.wi.mit.edu
Here is a basic piece of C code that will find all permutations of
microsatellite repeats. The basic idea is that it moves a window over
the sequence and checks if the next window agrees with the previous
one. REPEAT_THRESHOLD is the number of windows that must be the same
before the repeat is marked; i.e., CACACACACACACACACACA would qualify
as 10. Once the repeat is marked, that region is marked so that it is
not searched for larger base unit repeats. MAX_WINDOW is the max number
of units in the base repeat. CA=2, GAC=3, etc. The code starts at 2
and works up to MAX_WINDOW. This algorithm also takes into account the
fact that microsatellites can be imperfect (random extra bases thrown in)
and can be compound. Let me know if you have any questions.
/* Copyright Whitehead Institute for Biomedical Research. All rights
reserved
*/
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <time.h>
#define FALSE 0
#define TRUE 1
#define MAX_IMPERFECT_GAP 4 /* The default maximum number of imperfect
* bases allowed in a microsatellite */
#define NUM_MS 100 /* Maximum number of microsatellites in a seq */
#define MAX_WINDOW 6 /* The maximum base repeat unit length */
#define REPEAT_THRESHOLD 10
#define MAX_LINE 500000
#define MARGIN 250
#define MAX_SUBSEQ 500000
typedef struct {
int length; /* Length of the microsatellite feature */
int start; /* Start of the feature in the whole sequence */
char type[MAX_WINDOW+1]; /* Base repeat unit -- CA, TCA, GA, etc */
} REPEAT_INFO;
extern REPEAT_INFO *repeats;
int target_start, target_end;
int find_microsatellite();
void find_best_target();
REPEAT_INFO *repeats;
main(argc, argv)
int argc;
char **argv;
{
int length, current_num, genbank_start = 0, i, start, end, num_ms;
int start_o_target, length_o_target;
char type_o_target[MAX_WINDOW+1];
char seq[MAX_LINE], name[1000], match_seq[100], new_seq[MAX_LINE];
char description[1000], origin[1000], journal[1000];
FILE *ifp;
repeats = (REPEAT_INFO *) malloc ((NUM_MS+1) * sizeof(REPEAT_INFO));
/* Check that the correct arguments are given */
if (argc < 3) {
printf("Usage: %s seq_file start_num\n", argv[0]);
exit(1);}
/* Open the files and set the target sequence*/
if((ifp = fopen(argv[1], "r")) == NULL){
printf("*** error: can't open %s\n", argv[1]); exit(1);}
current_num = atoi(argv[2]);
printf("1 set_organism %% |mus| %%\n");
while(fgets(name, MAX_LINE, ifp) != NULL){
name[strlen(name)-1] = '\0';
fgets(description, MAX_LINE, ifp);
description[strlen(description)-1] = '\0';
fgets(origin, MAX_LINE, ifp);
origin[strlen(origin)-1] = '\0';
if(strcmp(origin, " ") == 0) strcpy(origin, "");
fgets(journal, MAX_LINE, ifp);
journal[strlen(journal)-1] = '\0';
fgets(seq, MAX_LINE, ifp);
seq[strlen(seq)-1] = '\0';
length = (int)strlen(seq);
num_ms = find_microsatellite(seq, length, MAX_WINDOW);
if(num_ms == 1){
start = repeats[0].start;
end = start + repeats[0].length -1;
start_o_target = start;
length_o_target = end - start + 1;
strcpy(type_o_target,repeats[0].type);
if((start - MARGIN) <= 0)
start = 0;
else(start -= MARGIN);
genbank_start = start;
start_o_target -= start;
if((end + MARGIN) > length)
end = length;
else end += MARGIN;
for(i = 0; i< (end-start+1); i++){
new_seq[i] = seq[start+i];}
new_seq[i] ='\0';
strcpy(seq, new_seq);
}
if(num_ms > 1){
find_best_target(num_ms);
start = target_start;
end = target_end;
start_o_target = start;
length_o_target = end - start + 1;
strcpy(type_o_target,repeats[0].type); /* Not really the correct target
type in this case */
if((start - MARGIN) <= 0)
start = 0;
else(start -= MARGIN);
genbank_start = start;
start_o_target -= start;
if((end + MARGIN) > length)
end = length;
else end += MARGIN;
for(i = 0; i< (end-start+1); i++){
new_seq[i] = seq[start+i];}
new_seq[i] ='\0';
strcpy(seq, new_seq);
}
if(num_ms > 0){
/* Print out the microsatellite with certain margins around it */
printf("1 put_marker %% |D%d| %%\n", current_num);
printf("2 put_marker_name %% |D%d| |%s| %%\n", current_num, name);
printf("3 put_genbank_info %% |D%d| || |%d| |%s| |%s| |%s| %%\n",
current_num, genbank_start, origin, description, name);
printf("4 put_sequence %% |D%d| |%s| || |GenBank| |%s| %%\n",
current_num, seq, journal);
printf("5 put_insert %% |D%d| |0| |%d| %%\n",
current_num, (int)strlen(seq));
printf("6 put_target %% |D%d| |%d| |%d| |%s| %%\n",
current_num, start_o_target, length_o_target, type_o_target);
printf("5 put_FASTN %% |D%d| |0| || || %%\n", current_num);
printf("6 put_BLAST %% |D%d| |0| || || %%\n", current_num);
current_num++;
fflush(stdout);
}
}
fclose(ifp);
}
int find_microsatellite(seq, length, max_window)
char *seq;
int length, max_window;
{
int window_length = 2, unit_matches, gap, rep_count=0 , i=0, start, k, j;
char test_seq[MAX_SUBSEQ], repeat_type[MAX_WINDOW], double_repeat[MAX_WINDOW*2], gap_sequence[2*MAX_IMPERFECT_GAP+1];
char *following_window, *leading_window, *next2, *p_start;
/* Initialize the repeat structure */
for(j = 0; j<NUM_MS; j++){
repeats[j].start = 0;
repeats[j].length = 0;
strcpy(repeats[j].type, "");}
/* Read in the target string to see if we are looking for
* a number of a sequence of particular repeats
* Check to make sure the window_length >= 2 <=MAX_WINDOW
* Get max_window either from the specific repeat or
* the number in parentheses
*/
strncpy(test_seq,seq,length); /* Make a copy to use */
while(window_length <= max_window){ /* Loop over the sequence once for each window size */
if(length > (window_length*2)){
/* Initialize all the pointers for the next pass over the sequence */
following_window = test_seq; /* The window following behind is set to start
at the beginning of the sequence */
leading_window = following_window + window_length;
/* The leading window is just ahead */
strncpy(repeat_type, following_window, window_length);
repeat_type[window_length] = '\0';
unit_matches = 0;
start = 0;
i = 0;
while(*(leading_window + window_length -1) != 0){ /* Loop over the sequence */
/* Check to see if we already found a repeat here */
if(*following_window == 'X'){
following_window += window_length;
leading_window += window_length ;
i += window_length;
strncpy(repeat_type, following_window, window_length);
repeat_type[window_length] = '\0';}
else {
/* Check to see if the two windows match -- */
if(ms_match(repeat_type, leading_window, window_length)){
if(unit_matches == 0){ /* we're possibly looking at a new repeat */
start = i;
strncpy(repeat_type, following_window, window_length);
repeat_type[window_length] = '\0';
}
unit_matches++;
/* we'd rather have a repeat_type without an N */
if(strchr(repeat_type, 'N') != NULL){
strncpy(repeat_type, following_window, window_length);
repeat_type[window_length] = '\0';}
if(strchr(repeat_type, 'N') != NULL){
strncpy(repeat_type, leading_window, window_length);
repeat_type[window_length] = '\0';}
following_window = leading_window = test_seq;
following_window += i + window_length; /* Advance the windows */
leading_window += i + (window_length*2);
i += window_length;
}
else{
/* if we have a decent start look ahead to see if
there's more after a short gap */
strcpy(double_repeat, repeat_type);
strcat(double_repeat, repeat_type);
strncpy(gap_sequence, (test_seq + i + window_length), (2*MAX_IMPERFECT_GAP));
gap_sequence[strlen(gap_sequence)-1] = '\0';
next2 = strstr(gap_sequence, double_repeat);
if((next2 != 0 ) && (unit_matches >=1)) {
next2 = strstr((test_seq + i + window_length), double_repeat);
following_window = next2;
i += (next2 - leading_window);
leading_window = following_window + window_length;
unit_matches++;
}
else{
/* no match, move ahead by one */
if(unit_matches >= (REPEAT_THRESHOLD-1)){
repeats[rep_count].start = start;
p_start = test_seq + start;
repeats[rep_count].length = leading_window - p_start;
strcpy(repeats[rep_count].type,repeat_type);
/* Mark the sequence as having a repeat so we don't go over it again */
for(k = 0; k < (repeats[rep_count].length); k++)
test_seq[(repeats[rep_count].start) + k] = 'X';
rep_count++;}
unit_matches = 0;
leading_window++;
following_window++;
i++;
start = i;
strncpy(repeat_type, following_window, window_length);
/* Null terminate the string */
repeat_type[window_length] = '\0';
}}}}
/* Deal with a repeat that goes right to the end */
if(unit_matches >= (REPEAT_THRESHOLD-1)){
repeats[rep_count].start = start;
p_start = test_seq + start;
repeats[rep_count].length = leading_window - p_start;
strcpy(repeats[rep_count].type,repeat_type);
/* Mark the sequence as having a repeat so we don't go over it again */
for(k = 0; k < (repeats[rep_count].length); k++)
test_seq[(repeats[rep_count].start) + k] = 'X';
rep_count++;}
/* Move up the window size */
window_length++;
}
else { window_length++; }
}
return(rep_count);
}
int ms_match(repeat_type, leading_seq, window_length)
char *leading_seq, *repeat_type;
int window_length;
{
int Ns, i, num_same = 0;
/* Keep a count of the number of Ns ... we will reject the match if
* there are too many */
Ns = 0;
for(i = 0; i < window_length; i++){
if (!((repeat_type[i] == *(leading_seq+i)) || (*(leading_seq+i) == 'N')))
return(FALSE);
/* Check that we don't have something of the form poly-A or poly-T */
if (repeat_type[0] == repeat_type[i]) num_same++;
if(*(leading_seq+i) == 'N') Ns++;
}
/* If we made it here we have a match unless the window has too many Ns */
if(num_same == window_length) return(FALSE);
return(Ns <= (window_length/2));
}
void find_best_target(num_repeats)
int num_repeats;
{
int i, j, k;
REPEAT_INFO temp;
target_start = 0; target_end = 0;
/* Sort the repeats according to start site, see if we can fit
* all the repeats into one product */
for(i = 0; i < num_repeats; ++i)
for(j = num_repeats - 1; j > i; --j)
if(repeats[j-1].start > repeats[j].start){
temp = repeats[j-1];
repeats[j-1] = repeats[j];
repeats[j] = temp;}
if((repeats[num_repeats-1].start + repeats[num_repeats-1].length - repeats[0].start) < 1200){
target_start = repeats[0].start;
target_end = repeats[num_repeats-1].start + repeats[num_repeats-1].length - 1;}
else{
for(i = 0; i < num_repeats; ++i)
for(j = num_repeats - 1; j > i; --j)
if(repeats[j-1].length < repeats[j].length){
temp = repeats[j-1];
repeats[j-1] = repeats[j];
repeats[j] = temp;}
target_start = repeats[0].start;
target_end = repeats[0].start + repeats[0].length -1;
}
}
int same_repeat_type(found_type, specified_type)
char *found_type, *specified_type;
{
int found_length, specified_length;
found_length = strlen(found_type);
specified_length = strlen(specified_type);
if(found_length == specified_length){
if((strcmp(found_type, specified_type) == 0)) return(TRUE);}
return(FALSE);
}