Helix-turn-helix motif prediction ANSI C source

From [email protected] Sat Apr 24 16:13:56 1993
Received: from net.bio.net by sunflower.bio.indiana.edu
	(4.1/9.7jsm) id AA05660; Sat, 24 Apr 93 16:13:50 EST
Received: from ncar.ucar.edu by net.bio.net (5.65/IG-2.0) with SMTP 
	id AA05950; Sat, 24 Apr 93 14:12:05 -0700
Received: from midway.uchicago.edu by ncar.ucar.EDU (5.65/ NCAR Central Post Office 03/11/93)
	id AA02685; Sat, 24 Apr 93 15:12:34 MDT
Received: from wakinyan.uchicago.edu by midway.uchicago.edu Sat, 24 Apr 93 16:12:32 CDT
Return-Path: <[email protected]>
Received:  by wakinyan.uchicago.edu (4.1/UofC3.2)
	id AA09798; Sat, 24 Apr 93 16:13:06 CDT
Newsgroups: bionet.software.sources
Path: kimbark!chh9
From: [email protected] (Conrad Halling)
Subject: Helix-turn-helix motif prediction ANSI C source
Message-Id: <[email protected]>
Sender: [email protected] (News System)
Reply-To: [email protected]
Organization: University of Chicago Computing Organizations
Date: Sat, 24 Apr 1993 21:13:02 GMT
Apparently-To: [email protected]
Status: R

/*
	hth.c
	v 1.0.1
	24 April 1993
	
	This simple program predicts whether a protein contains a helix-turn-
		helix motif, using the method of:
	
	Dodd, I. B., and J. B. Egan. 1990.  Improved detection of helix-turn-
		helix DNA-binding motifs in protein sequences. Nucleic Acids Res. 
		18:5019-5026.

	This code written and donated to the public domain by:
		Conrad Halling
		Department of Molecular Genetics and Cell Biology
		University of Chicago
		920 E 58th St
		Chicago IL 60637
		
		e-mail: [email protected]
	
	How to compile this program:

		This program is written in ANSI C using THINK C 5.0.4.
		On our local UNIX system (running SunOS Release 4.1.1), it will
			not compile using cc but will compile using either gcc or acc.
		For example, to compile this program, you would type
			acc -o hth hth.c <return>
		This means, "start the acc (ANSI C Compiler) program, send
			the output to a file called "hth", and take the input
			from the file "hth.c".
		When the program is compiled, you run it by typing "hth"
			at the prompt.

	tabs = 4

		When using vi under UNIX, you can set the tabs to 4 by opening the
			file, typing escape (to go into command mode), colon (":") to go
			to the command line, and "set tabstop=4" (without the quotes).
	
	Format of input protein sequence:

		One protein sequence per file
		Single-letter code in upper case and/or lower case
		White space characters (space, tab, return, etc.) are ignored
		The program will abort if an invalid character is found
*/

#include <ctype.h>
#include <limits.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#ifndef __HTH__
	#define __HTH__
	
	#ifndef TRUE
		#define TRUE				 1
	#endif
	
	#ifndef FALSE
		#define FALSE				 0
	#endif

	#define AMINO_ACIDS_COUNT		20
	#define MAX_SEQUENCE_LENGTH	 20000
	
	#define WINDOW_SIZE				22		/* These values from Table 3	*/
	#define NON_HTH_MEAN_SCORE	   238.71	/* of Dodd and Egan (1991)		*/
	#define NON_HTH_STD_DEV		   293.61

	/*
		Errors
	*/
	
	#define NO_ERROR				 0
	#define SEQUENCE_TOO_SHORT		 1
	#define OUT_OF_MEMORY			 2
	#define QUIT					 3
	#define INVALID_CHAR			 4
	
	/*
		Function prototypes
	*/
	
	void	DisplayError(
				short			error );
	void	DisplayResults(
				double			convertedScore,
				size_t			maxScorePosition,
				const char		*sequence );
	short	GetAminoAcid(
				char			residue );

#endif


const short weightMatrix[ AMINO_ACIDS_COUNT ][ WINDOW_SIZE ] =
	{
	/* A (alanine) */
		-125, -194,  -84,   70,   36,   54,  238,  -15,   77,   26, -194,
		-194,  -56,  -84,   14,   77,  -56,  -56,  -56,   46, -195,   36,
	/* C (cysteine) */
		 -64,  -64,  -63,  -63,  -64,  -64,  -64,  -64,  -64,   47,   47,
		 -63,  -63,  -64,  -64,  -64,  -64,  -64,  -64,  -63,  -64,   47,
	/* D (aspartate) */
		-156, -154, -156, -154,  109, -156, -156,  109, -154, -156,    6,
		-156, -154,  -85, -156, -156, -156, -156, -154, -154, -156,  -85,
	/* E (glutamate) */
		 -31,   -9, -171,   70,  156, -171, -171,  107,   50,  -60,  -60,
		-171,  -60,   78,   86, -171, -171, -101, -171, -170,   86,    9,
	/* F (phenylalanine) */
		  10, -130,   10, -130, -130,   10, -130, -129, -130,  102, -130,
		-130, -130, -130, -129, -130, -130, -129, -129, -129,  180, -130,
	/* G (glycine) */
		  30,    5, -190,  -51, -191, -191,   18, -191, -191, -191,  202,
		-191,  -10, -191,    5, -190, -191,  -80, -190, -190, -191,  -51,
	/* H (histidine ) */
		  62,   33,  -76,  -76,   -7,  -78,  -78,   33,   -7,  -78,   84,
		 -78,   33,   33,  -78,   -7,  -78,   -7,   62,   84,  -78,   -7,
	/* I (isoleucine ) */
		  75, -156,  101,  -45,  -86,  116, -156,  -16,   65,  -16, -156,
		 128, -156,  -86, -156, -155,  188, -155,  -16,   53,  122, -155,
	/* K (lysine ) */
		 -31,  -31,   10,   70,   79, -170, -170,   94,   70, -171,   -9,
		-100, -100,   25, -100, -170, -171,   -9,   38,  -31,   -9,  101,
	/* L (leucine) */
		  66, -212,   72, -213, -212,  144, -213, -102,   37,  132, -213,
		  97, -213, -142, -212, -212,   97, -212, -212,   37,   88, -213,
	/* M (methionine) */
		 122,  -74,   -3,  -73,  -73,  -73,  -74,  -74,   88,  122,  -73,
		 158,  -74,  -74,   -3,  -74,   -3,  -74,  -74,   -3,  -74,  -73,
	/* N (asparagine) */
		-137,   72, -137, -136, -137, -137, -137,  -67, -136, -136,  128,
		-137,   72, -136,    2,  -67, -137,    2,   84, -137, -137,  104,
	/* P (proline) */
		-156,   23, -157, -156, -157, -157, -157, -157, -157, -157, -157,
		-157,  -46,  101,   39, -157, -157, -157, -157, -157, -157,  -46,
	/* Q (glutamine) */
		 -60, -130,  175,   90,  110, -131, -131,   90,   78, -131, -131,
		 -60, -130,  154,   65,  119, -131,  -20,  119,   31,  -20,   90,
	/* R (arginine) */
		  65,   76,  110,   65,    7, -155, -154,  123,   76, -155, -154,
		-155, -154,  129,   54,   40, -155,  129,  179,  -45, -155,  123,
	/* S (serine) */
		-118,   96, -188,   21,  -48, -187,   -8, -187, -118, -118,  -77,
		-188,  174, -187,  135,  -26, -188,  150,  -77, -188, -187,  -26,
	/* T (threonine) */
		  11,  149,  -59,   80, -169,   -8, -170,  -99,  -99,  -30, -170,
		  -8,  131,  -30,  -59,  198, -170,  -30, -169, -170,  -30,  -59,
	/* V (valine) */
		  17,  -67, -177, -177, -108,  100, -178, -178, -108,   71, -178,
		 160, -178,  -16,  -67,  -67,  169, -178,   17,   31,   17, -178,
	/* W (tryptophan) */
		  44,  -26,  -26,  -26,  -26,  -26,  -26,  -26,  -26,  -26,  -26,
		 -26,  -26,  -25,  -26,  -25,  -26,  -26,   44,  279,  -26,  -26,
	/* Y (tyrosine) */
		 -40, -110,   30,    1, -110, -109, -110, -110,  -40,   30, -110,
		-109, -109,  -40, -110,  -40, -110,  162,   52,   86, -110, -110
	};

const char aminoAcidsString[] = "ACDEFGHIKLMNPQRSTVWYacdefghiklmnpqrstvwy";


int main()
	{
	char		fileName[ FILENAME_MAX ],
				format[ 12 ],
				residue,
				*sequence;
	int			resultsDisplayed,
				theChar;
	short		aminoAcid,
				fileNameEntered,
				maxScore,
				quit,
				status,
				tempScore;
	size_t		i,
				j,
				length,
				maxWindowPosition,
				maxScorePosition,
				sequenceLength;
	double		convertedScore;
	FILE		*sequenceFile;
	
	status = NO_ERROR;
	quit = FALSE;
	
	printf( "Welcome to hth, a program that predicts whether a protein contains\n" );
	printf( "a helix-turn-helix motif.\n\n" );
	printf( "For more information, please read\n\n" );
	printf( "    Dodd, I. B., and J. B. Egan. 1990.  Improved detection of\n" );
	printf( "        helix-turn-helix DNA-binding motifs in protein sequences.\n" );
	printf( "        Nucleic Acids Res. 18:5019-5026.\n\n" );
	printf( "Please be sure that your protein sequence is in \"plain\" format\n" );
	printf( "using the single-letter code.\n\n" );

	while ( !quit )
		{
	
		/*
			Allocate memory for the protein sequence.
		*/
	
		sequence = ( char * ) malloc( MAX_SEQUENCE_LENGTH * sizeof( char ) );
		if ( NULL == sequence )
			status = OUT_OF_MEMORY;

		/*
			Get the name of the sequence file.
		*/

		fileNameEntered = FALSE;
		while ( !fileNameEntered )
			{
			printf( "Name of protein file (q/Q = Quit):  " );
			sprintf( format, "%%%us", FILENAME_MAX );
			scanf( format, fileName );
			length = strlen( fileName );
			if ( 0 != length )
				{
				/*
					Check for quit command.
				*/
				
				if ( ( 1 == length ) &&
					( ( fileName[ 0 ] == 'q' ) || ( fileName[ 0 ] == 'Q' ) ) )
					{
					status = QUIT;
					quit = TRUE;
					break;
					}
				else
					{
					/*
						Open the sequence file.
					*/
			
					sequenceFile = fopen( fileName, "r" );
					if ( NULL == sequenceFile )
						printf( "File not found!\n\n" );
					else
						fileNameEntered = TRUE;
					}
				}
			}

		if ( NO_ERROR == status )
			{
			/*
				Read the sequence from sequenceFile into sequence[].
			*/
			
			i = 0;
			while ( i < MAX_SEQUENCE_LENGTH )
				{
				/*
					Stop reading at end of file.
				*/
				
				theChar = getc( sequenceFile );
				if ( feof( sequenceFile ) )
					break;
	
				/*
					Skip white space characters.
				*/
				
				if ( isspace( theChar ) )
					continue;
	
				/*
					GetAminoAcid will return AMINO_ACIDS_COUNT if the character
						is not found in AminoAcidsString[].
				*/
	
				if ( AMINO_ACIDS_COUNT == GetAminoAcid( ( char ) theChar ) )
					{
					status = INVALID_CHAR;
					break;
					}
	
				/*
					The character is valid; add it to the string.
				*/
	
				sequence[ i++ ] = ( char ) theChar;
				}
	
			if ( NO_ERROR == status )
				sequence[ i++ ] = 0x00;
		
			fclose( sequenceFile );
			}
	
		if ( NO_ERROR == status )
			{
			sequenceLength = strlen( sequence );
			if ( sequenceLength < WINDOW_SIZE )
				status = SEQUENCE_TOO_SHORT;
			}
		
		if ( NO_ERROR == status )
			{
			/*
				Calculate the highest score for the sequence.
			*/
	
			maxScore = SHRT_MIN;	/* defined in <limits.h> */
			maxScorePosition = 0;
			maxWindowPosition = sequenceLength - WINDOW_SIZE;
			for ( i = 0; i < maxWindowPosition; i++ )
				{
				tempScore = 0;
				for ( j = 0; j < WINDOW_SIZE; j++ )
					{
					residue = sequence[ i + j ];
					aminoAcid = GetAminoAcid( residue );
					tempScore += weightMatrix[ aminoAcid ] [ j ];
					}
				if ( tempScore > maxScore )
					{
					maxScore = tempScore;
					maxScorePosition = i;
					convertedScore =
						( ( double ) maxScore - NON_HTH_MEAN_SCORE ) /
							( NON_HTH_STD_DEV );
					if ( convertedScore >= 2.5 )
						{
						DisplayResults(
							convertedScore,
							maxScorePosition,
							sequence );
						resultsDisplayed = TRUE;
						maxScore = SHRT_MIN;
						}
					}
				}
			if ( !resultsDisplayed )
				DisplayResults( convertedScore, maxScorePosition, sequence );
			}
	
		DisplayError( status );
		}
	}


void DisplayResults(
	double			convertedScore,
	size_t			maxScorePosition,
	const char		*sequence )
	{
	char			maxScoreString[ WINDOW_SIZE ];
	short			i,
					percentage;
	
	printf(
		"The score is %0.2f at position %ld.\n", 
		convertedScore,
		maxScorePosition + 1 );

	for ( i = 0; i < WINDOW_SIZE; i++ )
		maxScoreString[ i ] = sequence[ maxScorePosition + i ];
	maxScoreString[ i ] = 0x00;

	printf(
		"The sequence at this position is %s.\n",
		maxScoreString );

	if ( convertedScore >= 4.5 )
		percentage = 100;
	else if ( convertedScore >= 4.0 )
		percentage = 90;
	else if ( convertedScore >= 3.5 )
		percentage = 71;
	else if ( convertedScore >= 3.0 )
		percentage = 50;
	else if ( convertedScore >= 2.5 )
		percentage = 25;

	if ( convertedScore < 2.5 )
		printf( "This score is not significant.\n" );
	else
		{
		printf(
			"This score suggests an approximately %d%% probability that ",
			percentage );
		printf( "this protein\ncontains a helix-turn-helix motif.\n\n" );
		}
	}
				

short GetAminoAcid(
	char		residue )
	{
	short		i,
				limit;
	
	limit = strlen( aminoAcidsString );
	for ( i = 0; i < limit; i++ )
		{
		if ( residue == aminoAcidsString[ i ] )
			break;
		}
	if ( i >= AMINO_ACIDS_COUNT )
		i -= AMINO_ACIDS_COUNT;

	return ( i );
	}


void DisplayError(
	short		error )
	{
	
	char		errorString[5][80] =
		{
		"\n\n",
		"The protein sequence is too short to analyze.\n\n",
		"There is insufficient memory to continue.\n\n",
		"Good-bye!\n\n",
		"There is an invalid character in the protein sequence.\n\n"
		};

	printf( errorString[ error ] );
	}
-- 
Conrad Halling
[email protected]