Please consider a donation to the Higher Intellect project. See https://preterhuman.net/donate.php or the Donate to Higher Intellect page for more info. |
Microsatellite (repeat) searching
Jump to navigation
Jump to search
From usenet.ucs.indiana.edu!vixen.cso.uiuc.edu!howland.reston.ans.net!news.intercon.com!panix!ddsw1!news.kei.com!bloom-beacon.mit.edu!genome.wi.mit.edu!mpreeve Thu Mar 10 20:31:14 EST 1994 Article: 4779 of bionet.software Path: usenet.ucs.indiana.edu!vixen.cso.uiuc.edu!howland.reston.ans.net!news.intercon.com!panix!ddsw1!news.kei.com!bloom-beacon.mit.edu!genome.wi.mit.edu!mpreeve From: [email protected] (Mary Pat Reeve) Newsgroups: bionet.software Subject: Microsatellite (repeat) searching Date: 10 Mar 1994 19:06:23 GMT Organization: Whitehead Institute for Biomedical Research, MIT Lines: 372 Message-ID: <[email protected]> NNTP-Posting-Host: gummo.wi.mit.edu Here is a basic piece of C code that will find all permutations of microsatellite repeats. The basic idea is that it moves a window over the sequence and checks if the next window agrees with the previous one. REPEAT_THRESHOLD is the number of windows that must be the same before the repeat is marked; i.e., CACACACACACACACACACA would qualify as 10. Once the repeat is marked, that region is marked so that it is not searched for larger base unit repeats. MAX_WINDOW is the max number of units in the base repeat. CA=2, GAC=3, etc. The code starts at 2 and works up to MAX_WINDOW. This algorithm also takes into account the fact that microsatellites can be imperfect (random extra bases thrown in) and can be compound. Let me know if you have any questions. /* Copyright Whitehead Institute for Biomedical Research. All rights reserved */ #include <stdio.h> #include <string.h> #include <math.h> #include <time.h> #define FALSE 0 #define TRUE 1 #define MAX_IMPERFECT_GAP 4 /* The default maximum number of imperfect * bases allowed in a microsatellite */ #define NUM_MS 100 /* Maximum number of microsatellites in a seq */ #define MAX_WINDOW 6 /* The maximum base repeat unit length */ #define REPEAT_THRESHOLD 10 #define MAX_LINE 500000 #define MARGIN 250 #define MAX_SUBSEQ 500000 typedef struct { int length; /* Length of the microsatellite feature */ int start; /* Start of the feature in the whole sequence */ char type[MAX_WINDOW+1]; /* Base repeat unit -- CA, TCA, GA, etc */ } REPEAT_INFO; extern REPEAT_INFO *repeats; int target_start, target_end; int find_microsatellite(); void find_best_target(); REPEAT_INFO *repeats; main(argc, argv) int argc; char **argv; { int length, current_num, genbank_start = 0, i, start, end, num_ms; int start_o_target, length_o_target; char type_o_target[MAX_WINDOW+1]; char seq[MAX_LINE], name[1000], match_seq[100], new_seq[MAX_LINE]; char description[1000], origin[1000], journal[1000]; FILE *ifp; repeats = (REPEAT_INFO *) malloc ((NUM_MS+1) * sizeof(REPEAT_INFO)); /* Check that the correct arguments are given */ if (argc < 3) { printf("Usage: %s seq_file start_num\n", argv[0]); exit(1);} /* Open the files and set the target sequence*/ if((ifp = fopen(argv[1], "r")) == NULL){ printf("*** error: can't open %s\n", argv[1]); exit(1);} current_num = atoi(argv[2]); printf("1 set_organism %% |mus| %%\n"); while(fgets(name, MAX_LINE, ifp) != NULL){ name[strlen(name)-1] = '\0'; fgets(description, MAX_LINE, ifp); description[strlen(description)-1] = '\0'; fgets(origin, MAX_LINE, ifp); origin[strlen(origin)-1] = '\0'; if(strcmp(origin, " ") == 0) strcpy(origin, ""); fgets(journal, MAX_LINE, ifp); journal[strlen(journal)-1] = '\0'; fgets(seq, MAX_LINE, ifp); seq[strlen(seq)-1] = '\0'; length = (int)strlen(seq); num_ms = find_microsatellite(seq, length, MAX_WINDOW); if(num_ms == 1){ start = repeats[0].start; end = start + repeats[0].length -1; start_o_target = start; length_o_target = end - start + 1; strcpy(type_o_target,repeats[0].type); if((start - MARGIN) <= 0) start = 0; else(start -= MARGIN); genbank_start = start; start_o_target -= start; if((end + MARGIN) > length) end = length; else end += MARGIN; for(i = 0; i< (end-start+1); i++){ new_seq[i] = seq[start+i];} new_seq[i] ='\0'; strcpy(seq, new_seq); } if(num_ms > 1){ find_best_target(num_ms); start = target_start; end = target_end; start_o_target = start; length_o_target = end - start + 1; strcpy(type_o_target,repeats[0].type); /* Not really the correct target type in this case */ if((start - MARGIN) <= 0) start = 0; else(start -= MARGIN); genbank_start = start; start_o_target -= start; if((end + MARGIN) > length) end = length; else end += MARGIN; for(i = 0; i< (end-start+1); i++){ new_seq[i] = seq[start+i];} new_seq[i] ='\0'; strcpy(seq, new_seq); } if(num_ms > 0){ /* Print out the microsatellite with certain margins around it */ printf("1 put_marker %% |D%d| %%\n", current_num); printf("2 put_marker_name %% |D%d| |%s| %%\n", current_num, name); printf("3 put_genbank_info %% |D%d| || |%d| |%s| |%s| |%s| %%\n", current_num, genbank_start, origin, description, name); printf("4 put_sequence %% |D%d| |%s| || |GenBank| |%s| %%\n", current_num, seq, journal); printf("5 put_insert %% |D%d| |0| |%d| %%\n", current_num, (int)strlen(seq)); printf("6 put_target %% |D%d| |%d| |%d| |%s| %%\n", current_num, start_o_target, length_o_target, type_o_target); printf("5 put_FASTN %% |D%d| |0| || || %%\n", current_num); printf("6 put_BLAST %% |D%d| |0| || || %%\n", current_num); current_num++; fflush(stdout); } } fclose(ifp); } int find_microsatellite(seq, length, max_window) char *seq; int length, max_window; { int window_length = 2, unit_matches, gap, rep_count=0 , i=0, start, k, j; char test_seq[MAX_SUBSEQ], repeat_type[MAX_WINDOW], double_repeat[MAX_WINDOW*2], gap_sequence[2*MAX_IMPERFECT_GAP+1]; char *following_window, *leading_window, *next2, *p_start; /* Initialize the repeat structure */ for(j = 0; j<NUM_MS; j++){ repeats[j].start = 0; repeats[j].length = 0; strcpy(repeats[j].type, "");} /* Read in the target string to see if we are looking for * a number of a sequence of particular repeats * Check to make sure the window_length >= 2 <=MAX_WINDOW * Get max_window either from the specific repeat or * the number in parentheses */ strncpy(test_seq,seq,length); /* Make a copy to use */ while(window_length <= max_window){ /* Loop over the sequence once for each window size */ if(length > (window_length*2)){ /* Initialize all the pointers for the next pass over the sequence */ following_window = test_seq; /* The window following behind is set to start at the beginning of the sequence */ leading_window = following_window + window_length; /* The leading window is just ahead */ strncpy(repeat_type, following_window, window_length); repeat_type[window_length] = '\0'; unit_matches = 0; start = 0; i = 0; while(*(leading_window + window_length -1) != 0){ /* Loop over the sequence */ /* Check to see if we already found a repeat here */ if(*following_window == 'X'){ following_window += window_length; leading_window += window_length ; i += window_length; strncpy(repeat_type, following_window, window_length); repeat_type[window_length] = '\0';} else { /* Check to see if the two windows match -- */ if(ms_match(repeat_type, leading_window, window_length)){ if(unit_matches == 0){ /* we're possibly looking at a new repeat */ start = i; strncpy(repeat_type, following_window, window_length); repeat_type[window_length] = '\0'; } unit_matches++; /* we'd rather have a repeat_type without an N */ if(strchr(repeat_type, 'N') != NULL){ strncpy(repeat_type, following_window, window_length); repeat_type[window_length] = '\0';} if(strchr(repeat_type, 'N') != NULL){ strncpy(repeat_type, leading_window, window_length); repeat_type[window_length] = '\0';} following_window = leading_window = test_seq; following_window += i + window_length; /* Advance the windows */ leading_window += i + (window_length*2); i += window_length; } else{ /* if we have a decent start look ahead to see if there's more after a short gap */ strcpy(double_repeat, repeat_type); strcat(double_repeat, repeat_type); strncpy(gap_sequence, (test_seq + i + window_length), (2*MAX_IMPERFECT_GAP)); gap_sequence[strlen(gap_sequence)-1] = '\0'; next2 = strstr(gap_sequence, double_repeat); if((next2 != 0 ) && (unit_matches >=1)) { next2 = strstr((test_seq + i + window_length), double_repeat); following_window = next2; i += (next2 - leading_window); leading_window = following_window + window_length; unit_matches++; } else{ /* no match, move ahead by one */ if(unit_matches >= (REPEAT_THRESHOLD-1)){ repeats[rep_count].start = start; p_start = test_seq + start; repeats[rep_count].length = leading_window - p_start; strcpy(repeats[rep_count].type,repeat_type); /* Mark the sequence as having a repeat so we don't go over it again */ for(k = 0; k < (repeats[rep_count].length); k++) test_seq[(repeats[rep_count].start) + k] = 'X'; rep_count++;} unit_matches = 0; leading_window++; following_window++; i++; start = i; strncpy(repeat_type, following_window, window_length); /* Null terminate the string */ repeat_type[window_length] = '\0'; }}}} /* Deal with a repeat that goes right to the end */ if(unit_matches >= (REPEAT_THRESHOLD-1)){ repeats[rep_count].start = start; p_start = test_seq + start; repeats[rep_count].length = leading_window - p_start; strcpy(repeats[rep_count].type,repeat_type); /* Mark the sequence as having a repeat so we don't go over it again */ for(k = 0; k < (repeats[rep_count].length); k++) test_seq[(repeats[rep_count].start) + k] = 'X'; rep_count++;} /* Move up the window size */ window_length++; } else { window_length++; } } return(rep_count); } int ms_match(repeat_type, leading_seq, window_length) char *leading_seq, *repeat_type; int window_length; { int Ns, i, num_same = 0; /* Keep a count of the number of Ns ... we will reject the match if * there are too many */ Ns = 0; for(i = 0; i < window_length; i++){ if (!((repeat_type[i] == *(leading_seq+i)) || (*(leading_seq+i) == 'N'))) return(FALSE); /* Check that we don't have something of the form poly-A or poly-T */ if (repeat_type[0] == repeat_type[i]) num_same++; if(*(leading_seq+i) == 'N') Ns++; } /* If we made it here we have a match unless the window has too many Ns */ if(num_same == window_length) return(FALSE); return(Ns <= (window_length/2)); } void find_best_target(num_repeats) int num_repeats; { int i, j, k; REPEAT_INFO temp; target_start = 0; target_end = 0; /* Sort the repeats according to start site, see if we can fit * all the repeats into one product */ for(i = 0; i < num_repeats; ++i) for(j = num_repeats - 1; j > i; --j) if(repeats[j-1].start > repeats[j].start){ temp = repeats[j-1]; repeats[j-1] = repeats[j]; repeats[j] = temp;} if((repeats[num_repeats-1].start + repeats[num_repeats-1].length - repeats[0].start) < 1200){ target_start = repeats[0].start; target_end = repeats[num_repeats-1].start + repeats[num_repeats-1].length - 1;} else{ for(i = 0; i < num_repeats; ++i) for(j = num_repeats - 1; j > i; --j) if(repeats[j-1].length < repeats[j].length){ temp = repeats[j-1]; repeats[j-1] = repeats[j]; repeats[j] = temp;} target_start = repeats[0].start; target_end = repeats[0].start + repeats[0].length -1; } } int same_repeat_type(found_type, specified_type) char *found_type, *specified_type; { int found_length, specified_length; found_length = strlen(found_type); specified_length = strlen(specified_type); if(found_length == specified_length){ if((strcmp(found_type, specified_type) == 0)) return(TRUE);} return(FALSE); }