program AmbiguityRemover; {Written by: John H. McDonald} { Department of Biological Sciences} { University of Delaware} { Newark, Delaware 19716} { http://udel.edu/~mcdonald} { mcdonald@udel.edu} const MaxGeneLength = 10000; var myInput: text; {the input data file} myOutput: text; {the main output data file} mySummaryOutput: text; {the summary output file} deGapYes: char; {y/n response from user} overWriteYes: char; {y/n response from user} actionChar: char; {y/n response from user} inputCheck: boolean; {used in checking the user input for duplicate file names, etc.} gapEdgeNum: integer; {number of identical sites marking edge of ambiguous region; input from user} lastSlashPos: integer; {used in separating file path from file name} stringIndex: integer; {used in separating file path from file name} InputFileString: string[250]; {user-provided name of input file} OutputFileString: string[250]; {user-provided name of output file} outputFileFullString: string[250]; {path+name of output file} inputFileFullString: string[250]; {path+name of input file} inputPath: string[250]; {path of program file, prepended to user-input file names to make full names} {=========This function converts amino acid symbols to numbers, for easier manipulation.================} function letterToNumber(x:char):integer; var letter:char; result:integer; begin letter:=x; case letter of 'A': result:=1; 'C': result:=2; 'D': result:=3; 'E': result:=4; 'F': result:=5; 'G': result:=6; 'H': result:=7; 'I': result:=8; 'K': result:=9; 'L': result:=10; 'M': result:=11; 'N': result:=12; 'P': result:=13; 'Q': result:=14; 'R': result:=15; 'S': result:=16; 'T': result:=17; 'V': result:=18; 'W': result:=19; 'Y': result:=20; '-': result:=21; 'X': result:=22; otherwise result:=23; end; letterToNumber:=result; end; {====This function converts numbers back into amino acid symbols, for output.====} function NumberToLetter(x:integer):string; var number:integer; result:string; begin number:=x; case number of 1: result:='A'; 2: result:='C'; 3: result:='D'; 4: result:='E'; 5: result:='F'; 6: result:='G'; 7: result:='H'; 8: result:='I'; 9: result:='K'; 10: result:='L'; 11: result:='M'; 12: result:='N'; 13: result:='P'; 14: result:='Q'; 15: result:='R'; 16: result:='S'; 17: result:='T'; 18: result:='V'; 19: result:='W'; 20: result:='Y'; 21: result:='-'; 22: result:='X'; 27: result:='a'; 28: result:='c'; 29: result:='d'; 30: result:='e'; 31: result:='f'; 32: result:='g'; 33: result:='h'; 34: result:='i'; 35: result:='k'; 36: result:='l'; 37: result:='m'; 38: result:='n'; 39: result:='p'; 40: result:='q'; 41: result:='r'; 42: result:='s'; 43: result:='t'; 44: result:='v'; 45: result:='w'; 46: result:='y'; 47: result:='-'; 48: result:='x'; otherwise result:='?'; end; numberToLetter:=result; end; {====Reads in the data from the input file and puts ambiguously aligned sites next to gaps in lower case. ===} procedure DoAnalyzeInput; var geneName1, geneName2: string; {the name of each gene} aminoAcid: char; {amino acid symbols, as read from the input file} geneCount: longint; {number of genes} arrayIndex: longint; {index used in loops} AminoAcidCount1: longint; {number of amino acids in gene in species 1} AminoAcidCount2: longint; {number of amino acids in gene in species 2} bigArray:array[1..2,1..maxGeneLength] of integer; {array of amino acid sequences for one gene in two species} numberAminoAcid: integer; {amino acid symbol translated into a number} edgeIndex: integer; {index used in loops} matchCount: longint; {number of matching amino acids in a window gapEdgeNum wide} spaceIndex: integer; {index used in loops} addedSpace: integer; {used in making gene names 10 characters long} totalSites: longint; {total number of amino acids in a gene} totalDiffs: longint; {total number of sites with different amino acids between species in a gene} alignedSites: longint; {total number of amino acids in a gene that are unambiguously aligned} alignedDiffs: longint; {total number of unambiguously aligned sites with different amino acids between species in a gene} begin read(myInput, aminoAcid); {reads first '>'} readln(myInput,geneName1); if length(geneName1)>10 then {make gene name ten characters long} delete(geneName1, 11, (length(geneName1)-10)); if length(geneName1)<10 then begin addedSpace:=10-length(geneName1); for spaceIndex:=1 to addedSpace do geneName1:=geneName1+' '; end; geneCount:=0; writeln(mySummaryOutput, ' gene name total length without gaps unambig. aligned'); writeln(mySummaryOutput, '------------------- ------------ ------------- ----------------'); writeln(mySummaryOutput, ' sp. 1 sp. 2 sp. 1 sp. 2 diff. length diff. length'); while not eof(myInput) do begin aminoAcidCount1:=0; aminoAcid:='X'; geneCount:=geneCount+1; while (aminoAcid<>'>') and (not eof(myInput)) do {'>'=start of gene from second species} begin read(myInput,aminoAcid); numberAminoAcid:=letterToNumber(aminoAcid); if numberAminoAcid<23 then begin aminoAcidCount1:=AminoAcidCount1+1; bigArray[1,aminoAcidCount1]:=lettertonumber(aminoAcid); {convert letters to numbers for easier manipulation} end; end; aminoAcid:='X'; readln(myInput,geneName2); {start of gene from second species} if length(geneName2)>10 then delete(geneName2, 11, (length(geneName2)-10)); if length(geneName2)<10 then begin addedSpace:=10-length(geneName2); for spaceIndex:=1 to addedSpace do geneName2:=geneName2+' '; end; aminoAcidCount2:=0; while (aminoAcid<>'>') and (not eof(myInput)) do {'>'=start of next gene} begin read(myInput,aminoAcid); numberAminoAcid:=letterToNumber(aminoAcid); if numberAminoAcid<23 then begin aminoAcidCount2:=AminoAcidCount2+1; bigArray[2,aminoAcidCount2]:=lettertonumber(aminoAcid); end; end; if aminoAcidCount1=aminoAcidCount2 then {if not equal, there is a problem} begin for arrayIndex:=1 to aminoAcidCount1 do {make sites aligned with gaps lower case} begin if (bigArray[1,arrayIndex]=21) or (bigArray[2,arrayIndex]=21) then begin bigArray[1,arrayIndex]:=bigArray[1,arrayIndex]+26; bigArray[2,arrayIndex]:=bigArray[2,arrayIndex]+26; end; end; {go forward through the aligned sequences from gap to N adjacent matching sites} for arrayIndex:=1 to (aminoAcidCount1-gapEdgeNum) do begin if bigArray[1,arrayIndex]>26 then {there is a "-" in one species} begin matchCount:=0; for edgeIndex:=(arrayIndex+1) to (arrayIndex+gapEdgeNum) do begin if (bigArray[1, edgeIndex]<21) and (bigArray[1, edgeIndex]=bigArray[2, edgeIndex]) then matchCount:=matchCount+1; end; if (matchCount20) and (gapEdgeNum>1) then begin for edgeIndex:=(aminoAcidCount1-gapEdgeNum+2) to aminoAcidCount1 do begin if bigArray[1, edgeIndex]<26 then begin bigArray[1, edgeIndex]:=bigArray[1, edgeIndex]+26; bigArray[2, edgeIndex]:=bigArray[2, edgeIndex]+26; end; end; end; {go backwards through the aligned sequences from gap to adjacent matching sites} for arrayIndex:=aminoAcidCount1 downto (gapEdgeNum+1) do begin if bigArray[1,arrayIndex]>26 then {there is a "-" in one species} begin matchCount:=0; for edgeIndex:=(arrayIndex-1) downto (arrayIndex-gapEdgeNum) do begin if (bigArray[1, edgeIndex]<21) and (bigArray[1, edgeIndex]=bigArray[2, edgeIndex]) then matchCount:=matchCount+1; end; if (matchCount20) and (gapEdgeNum>1) then begin for edgeIndex:=(gapEdgeNum-1) downto 1 do begin if bigArray[1, edgeIndex]<26 then begin bigArray[1, edgeIndex]:=bigArray[1, edgeIndex]+26; bigArray[2, edgeIndex]:=bigArray[2, edgeIndex]+26; end; end; end; {tidy up the ends} for arrayIndex:=(aminoAcidCount1-gapEdgeNum+1) to aminoAcidCount1 do begin if bigArray[1, arrayIndex]>21 then begin for edgeIndex:=arrayIndex to aminoAcidCount1 do begin if bigArray[1, edgeIndex]<26 then begin bigArray[1, edgeIndex]:=bigArray[1, edgeIndex]+26; bigArray[2, edgeIndex]:=bigArray[2, edgeIndex]+26; end; end; end; end; for arrayIndex:=gapEdgeNum downto 1 do begin if bigArray[1, arrayIndex]>21 then begin for edgeIndex:=arrayIndex downto 1 do begin if bigArray[1, edgeIndex]<26 then begin bigArray[1, edgeIndex]:=bigArray[1, edgeIndex]+26; bigArray[2, edgeIndex]:=bigArray[2, edgeIndex]+26; end; end; end; end; {put sites with 'X' in either species in lower case} for arrayIndex:=1 to aminoAcidCount1 do begin if (bigArray[1, arrayIndex]=22) or (bigArray[2, arrayIndex]=22) then begin bigArray[1, arrayIndex]:=bigArray[1, arrayIndex]+26; bigArray[2, arrayIndex]:=bigArray[2, arrayIndex]+26; end; end; end; {if aminocount1=aminocount2} writeln(myOutput,'>',geneName1); for arrayIndex:=1 to aminoAcidCount1 do write(myOutput,numbertoletter(bigArray[1,arrayIndex])); writeln(myOutput); writeln(myOutput,'>',geneName2); for arrayIndex:=1 to aminoAcidCount2 do write(myOutput,numbertoletter(bigArray[2,arrayIndex])); writeln(myOutput); {count differences for the summary output} if (aminoAcidCount1=aminoAcidCount2) then begin totalSites:=0; totalDiffs:=0; alignedSites:=0; alignedDiffs:=0; for arrayIndex:=1 to aminoAcidCount1 do begin if (bigArray[1, arrayIndex]<>47) and (bigArray[2, arrayIndex]<>47) then begin totalSites:=totalSites+1; if bigArray[1, arrayIndex]<>bigArray[2, arrayIndex] then totalDiffs:=totalDiffs+1; if bigArray[1, arrayIndex]<21 then begin alignedSites:=alignedSites+1; if bigArray[1, arrayIndex]<>bigArray[2, arrayIndex] then alignedDiffs:=alignedDiffs+1; end; end; end; writeln(mySummaryOutput, geneName1:10, ' ', geneName2:10, ' ', aminoAcidCount1:9, aminoAcidCount2:7, totalDiffs/totalSites:12:3, totalSites:6, alignedDiffs/alignedSites:13:3, alignedSites:6); end; if (aminoAcidCount1<>aminoAcidCount2) then writeln(mySummaryOutput, geneName1:10, ' ', geneName2:10, ' ', aminoAcidCount1:9, aminoAcidCount2:7, ' 0', ' 0', ' 0', ' 0'); if not eof(myInput) then begin readln(myInput,geneName1); if length(geneName1)>10 then delete(geneName1, 11, (length(geneName1)-10)); if length(geneName1)<10 then begin addedSpace:=10-length(geneName1); for spaceIndex:=1 to addedSpace do geneName1:=geneName1+' '; end; end; end; {while not eof} end; begin writeln('================================================================='); writeln; writeln(' Welcome to AmbiguityRemover.'); writeln; writeln('For help on formatting the input file and other documentation, '); writeln(' or a version of this program for a different operating system, '); writeln(' see http://udel.edu/~mcdonald/asymmetryprograms.html.'); writeln; writeln; writeln('Would you like to ignore sites adjacent to gaps in the alignment? '); write('Enter "y" or "n": '); readln(deGapYes); if deGapYes='Y' then deGapYes:='y'; if deGapYes<>'y' then deGapYes:='n'; writeln; if deGapYes='y' then begin inputCheck:=TRUE; while inputCheck do begin writeln('How many adjacent aligned sites mark the edge of an ambiguously aligned region?'); write('Enter a number from 1 to 10: '); readln(gapEdgeNum); writeln; if (gapEdgeNum>0) and (gapEdgeNum<11) then inputCheck:=FALSE; if inputCheck then writeln('Enter a number from 1 to 10: '); end; end; writeln; inputCheck:=TRUE; {get the name of the input file; if it doesn't exist, keep trying} while inputCheck do begin writeln('The input file must be a plain text file (extension .txt)'); writeln(' in the same folder as this program.'); writeln; write('Name of the input file: '); readln(InputFileString); if InputFileString[length(InputFileString)-3]<>'.' then InputFileString:=InputFileString+'.txt'; {The next few lines get the full path to the program, remove the name of the program, then add the name of the input file. The result is that "inputFileString" is converted from "input.txt" to "/Users/username/sliderfolder/input.txt" (on a Mac) or "C:\sliderfolder\input.txt" (on Windows).} inputPath:=paramStr(0); lastSlashPos:=0; for stringIndex:=1 to length(inputPath) do begin if (inputPath[stringIndex]='/') or (inputPath[stringIndex]='\') then lastSlashPos:=stringIndex; end; delete(inputPath, lastSlashPos+1, length(inputPath)-lastSlashPos); inputFileFullString:=inputPath+inputFileString; assign(myInput, inputFileFullString); {$I-} reset(myInput); {$I+} if IOresult=0 then inputCheck:=FALSE; if inputCheck then writeln('I can''t find a file named "',inputFileString,'", try again.'); end; inputCheck:=TRUE; {get the name of the output file; if it already exists, ask before overwriting} while inputCheck do begin inputCheck:=FALSE; writeln; write('Name of the main output file: '); readln(OutputFileString); if OutputFileString[length(OutputFileString)-3]<>'.' then OutputFileString:=OutputFileString+'.txt'; OutputFileFullString:=inputPath+OutputFileString; assign(myOutput, OutputFileFullString); {$I-} reset(myOutput); {$I+} if IOresult=0 then {file already exists} begin writeln('File "', outputfilestring, '" already exists. Do you want to overwrite it?'); write('Enter "y" or "n": '); readln(overWriteYes); if overWriteYes='Y' then overWriteYes:='y'; if overWriteYes<>'y' then begin inputCheck:=TRUE; close(myOutput); end; end; end; rewrite(myOutput); inputCheck:=TRUE; {get the name of the summary output file; if it already exists, ask before overwriting} while inputCheck do begin inputCheck:=FALSE; writeln; write('Name of the summary output file: '); readln(OutputFileString); if OutputFileString[length(OutputFileString)-3]<>'.' then OutputFileString:=OutputFileString+'.txt'; OutputFileFullString:=inputPath+OutputFileString; assign(mySummaryOutput, OutputFileFullString); {$I-} reset(mySummaryOutput); {$I+} if IOresult=0 then {file already exists} begin writeln('File "', outputFileString, '" already exists. Do you want to overwrite it?'); write('Enter "y" or "n": '); readln(overWriteYes); if overWriteYes='Y' then overwriteYes:='y'; if overWriteYes<>'y' then begin inputCheck:=TRUE; close(mySummaryOutput); end; end; end; rewrite(mySummaryOutput); DoAnalyzeInput; writeln; writeln; write('All done. Enter "q" to quit: '); readln(ActionChar); close(myInput); close(myOutput); close(mySummaryOutput); end.