program AmbiguityRemover;

{Written by:       John H. McDonald}
{                  Department of Biological Sciences}
{                  University of Delaware}
{                  Newark, Delaware  19716}
{                  http://udel.edu/~mcdonald}
{                  mcdonald@udel.edu}


  const
     MaxGeneLength = 10000;

  var
     myInput: text; {the input data file}
     myOutput: text; {the main output data file}
     mySummaryOutput: text; {the summary output file}
     deGapYes: char; {y/n response from user} 
	 overWriteYes: char; {y/n response from user} 
     actionChar: char; {y/n response from user}
     inputCheck: boolean; {used in checking the user input for duplicate file names, etc.}
     gapEdgeNum: integer; {number of identical sites marking edge of ambiguous region; input from user}      
     lastSlashPos: integer; {used in separating file path from file name}
     stringIndex: integer; {used in separating file path from file name}
     InputFileString: string[250]; {user-provided name of input file}
     OutputFileString: string[250]; {user-provided name of output file}
     outputFileFullString: string[250]; {path+name of output file}
     inputFileFullString: string[250]; {path+name of input file}
     inputPath: string[250]; {path of program file, prepended to user-input file names to make full names} 

{=========This function converts amino acid symbols to numbers, for easier manipulation.================}
  function letterToNumber(x:char):integer;
     var
        letter:char;
        result:integer;
     begin
     letter:=x;
     case letter of
        'A': result:=1;
		'C': result:=2;
		'D': result:=3;
		'E': result:=4;
		'F': result:=5;
		'G': result:=6;
		'H': result:=7;
		'I': result:=8;
		'K': result:=9;
		'L': result:=10;
		'M': result:=11;
		'N': result:=12;
		'P': result:=13;
		'Q': result:=14;
		'R': result:=15;
		'S': result:=16;
		'T': result:=17;
		'V': result:=18;
		'W': result:=19;
		'Y': result:=20;
		'-': result:=21;
		'X': result:=22;
	 otherwise
		result:=23;
		end;
	 letterToNumber:=result;
     end;

{====This function converts numbers back into amino acid symbols, for output.====}
  function NumberToLetter(x:integer):string;
     var
        number:integer;
        result:string;
     begin
        number:=x;
        case number of

        1: result:='A';
		2: result:='C';
		3: result:='D';
		4: result:='E';
		5: result:='F';
		6: result:='G';
		7: result:='H';
		8: result:='I';
		9: result:='K';
		10: result:='L';
		11: result:='M';
		12: result:='N';
		13: result:='P';
		14: result:='Q';
		15: result:='R';
		16: result:='S';
		17: result:='T';
		18: result:='V';
		19: result:='W';
		20: result:='Y';

		21: result:='-';
		22: result:='X';

		27: result:='a';
		28: result:='c';
		29: result:='d';
		30: result:='e';
		31: result:='f';
		32: result:='g';
		33: result:='h';
		34: result:='i';
		35: result:='k';
		36: result:='l';
		37: result:='m';
		38: result:='n';
		39: result:='p';
		40: result:='q';
		41: result:='r';
		42: result:='s';
		43: result:='t';
		44: result:='v';
		45: result:='w';
		46: result:='y';

		47: result:='-';
		48: result:='x';
	 otherwise
		result:='?';
		end;
	 numberToLetter:=result;
     end;

{====Reads in the data from the input file and puts ambiguously aligned sites next to gaps in lower case. ===}
  procedure DoAnalyzeInput;
	 var
		geneName1, geneName2: string; {the name of each gene}
		aminoAcid: char; {amino acid symbols, as read from the input file}
		geneCount: longint; {number of genes} 
		arrayIndex: longint; {index used in loops} 
		AminoAcidCount1: longint;  {number of amino acids in gene in species 1}
		AminoAcidCount2: longint;  {number of amino acids in gene in species 2}
		bigArray:array[1..2,1..maxGeneLength] of integer; {array of amino acid sequences for one gene in two species}
		numberAminoAcid: integer; {amino acid symbol translated into a number}
		edgeIndex: integer; {index used in loops}
		matchCount: longint; {number of matching amino acids in a window gapEdgeNum wide}
		spaceIndex: integer; {index used in loops}
		addedSpace: integer; {used in making gene names 10 characters long}
		totalSites: longint; {total number of amino acids in a gene}
		totalDiffs: longint; {total number of sites with different amino acids between species in a gene}
		alignedSites: longint; {total number of amino acids in a gene that are unambiguously aligned}
		alignedDiffs: longint; {total number of unambiguously aligned sites with different amino acids between species in a gene}
     begin
        read(myInput, aminoAcid); {reads first '>'}
        readln(myInput,geneName1);
		
        if length(geneName1)>10 then {make gene name ten characters long}
           delete(geneName1, 11, (length(geneName1)-10));
        if length(geneName1)<10 then
           begin
           addedSpace:=10-length(geneName1);
           for spaceIndex:=1 to addedSpace do
              geneName1:=geneName1+' ';
           end;
		   
        geneCount:=0;
        writeln(mySummaryOutput, '      gene name           total length        without gaps   unambig. aligned');
        writeln(mySummaryOutput, '-------------------       ------------       -------------   ----------------');
        writeln(mySummaryOutput, '   sp. 1      sp. 2       sp. 1  sp. 2       diff.  length      diff.  length');
        while not eof(myInput) do
            begin
            aminoAcidCount1:=0;
            aminoAcid:='X';
            geneCount:=geneCount+1;
            while (aminoAcid<>'>') and (not eof(myInput)) do {'>'=start of gene from second species}
                begin
                read(myInput,aminoAcid);
                numberAminoAcid:=letterToNumber(aminoAcid);
                if numberAminoAcid<23 then
                    begin
                    aminoAcidCount1:=AminoAcidCount1+1;
                    bigArray[1,aminoAcidCount1]:=lettertonumber(aminoAcid); {convert letters to numbers for easier manipulation}
                    end;
                end;

            aminoAcid:='X';
            readln(myInput,geneName2); {start of gene from second species}
            if length(geneName2)>10 then
               delete(geneName2, 11, (length(geneName2)-10));
            if length(geneName2)<10 then
               begin
               addedSpace:=10-length(geneName2);
               for spaceIndex:=1 to addedSpace do
                  geneName2:=geneName2+' ';
               end;

            aminoAcidCount2:=0;
            while (aminoAcid<>'>') and (not eof(myInput)) do {'>'=start of next gene}
			   begin
			   read(myInput,aminoAcid);
			   numberAminoAcid:=letterToNumber(aminoAcid);
			   if numberAminoAcid<23 then
				  begin
                  aminoAcidCount2:=AminoAcidCount2+1;
                  bigArray[2,aminoAcidCount2]:=lettertonumber(aminoAcid);
				  end;
			   end;

            if aminoAcidCount1=aminoAcidCount2 then {if not equal, there is a problem}
			   begin
			   for arrayIndex:=1 to aminoAcidCount1 do {make sites aligned with gaps lower case}
				  begin
				  if (bigArray[1,arrayIndex]=21) or (bigArray[2,arrayIndex]=21) then
					 begin
					 bigArray[1,arrayIndex]:=bigArray[1,arrayIndex]+26;
					 bigArray[2,arrayIndex]:=bigArray[2,arrayIndex]+26;
					 end;
				  end;

 {go forward through the aligned sequences from gap to N adjacent matching sites}
			   for arrayIndex:=1 to (aminoAcidCount1-gapEdgeNum) do
				  begin
				  if bigArray[1,arrayIndex]>26 then {there is a "-" in one species}
					 begin
					 matchCount:=0;
					 for edgeIndex:=(arrayIndex+1) to (arrayIndex+gapEdgeNum) do
						begin
						if (bigArray[1, edgeIndex]<21) and (bigArray[1, edgeIndex]=bigArray[2, edgeIndex]) then
						   matchCount:=matchCount+1;
						end;
					 if (matchCount<gapEdgeNum) and (bigArray[1, arrayIndex+1]<26) then
						begin
						bigArray[1,arrayIndex+1]:=bigArray[1,arrayIndex+1]+26;
						bigArray[2,arrayIndex+1]:=bigArray[2,arrayIndex+1]+26;
						end;
					 end;
				  end;

                {make the last few sites lowercase if adjacent to a gap}
			   if (bigArray[1, aminoAcidCount1-gapEdgeNum+1]>20) and (gapEdgeNum>1) then
				  begin
				  for edgeIndex:=(aminoAcidCount1-gapEdgeNum+2) to aminoAcidCount1 do
					 begin
					 if bigArray[1, edgeIndex]<26 then
						begin
						bigArray[1, edgeIndex]:=bigArray[1, edgeIndex]+26;
						bigArray[2, edgeIndex]:=bigArray[2, edgeIndex]+26;
						end;
					 end;
				  end;

{go backwards through the aligned sequences from gap to adjacent matching sites}
			   for arrayIndex:=aminoAcidCount1 downto (gapEdgeNum+1) do
				  begin
				  if bigArray[1,arrayIndex]>26 then {there is a "-" in one species}
					 begin
					 matchCount:=0;
					 for edgeIndex:=(arrayIndex-1) downto (arrayIndex-gapEdgeNum) do
						begin
						if (bigArray[1, edgeIndex]<21) and (bigArray[1, edgeIndex]=bigArray[2, edgeIndex]) then
						   matchCount:=matchCount+1;
						end;
					 if (matchCount<gapEdgeNum) and (bigArray[1, arrayIndex-1]<26) then
						begin
						bigArray[1,arrayIndex-1]:=bigArray[1,arrayIndex-1]+26;
						bigArray[2,arrayIndex-1]:=bigArray[2,arrayIndex-1]+26;
						end;
					 end;
				  end;

                {make the first few sites lowercase if adjacent to a gap}
			   if (bigArray[1, gapEdgeNum]>20) and (gapEdgeNum>1) then
				  begin
				  for edgeIndex:=(gapEdgeNum-1) downto 1 do
					 begin
					 if bigArray[1, edgeIndex]<26 then
						begin
						bigArray[1, edgeIndex]:=bigArray[1, edgeIndex]+26;
						bigArray[2, edgeIndex]:=bigArray[2, edgeIndex]+26;
						end;
					 end;
				  end;
					
                {tidy up the ends}
			   for arrayIndex:=(aminoAcidCount1-gapEdgeNum+1) to aminoAcidCount1 do
				  begin
				  if bigArray[1, arrayIndex]>21 then
					 begin
					 for edgeIndex:=arrayIndex to aminoAcidCount1 do
						begin
						if bigArray[1, edgeIndex]<26 then
						   begin
						   bigArray[1, edgeIndex]:=bigArray[1, edgeIndex]+26;
						   bigArray[2, edgeIndex]:=bigArray[2, edgeIndex]+26;
						   end;
						end;
					 end;
				  end;
			   for arrayIndex:=gapEdgeNum downto 1 do
				  begin
				  if bigArray[1, arrayIndex]>21 then
					 begin
					 for edgeIndex:=arrayIndex downto 1 do
						begin
						if bigArray[1, edgeIndex]<26 then
						   begin
						   bigArray[1, edgeIndex]:=bigArray[1, edgeIndex]+26;
						   bigArray[2, edgeIndex]:=bigArray[2, edgeIndex]+26;
						   end;
						end;
					 end;
				  end;

{put sites with 'X' in either species in lower case}					
			   for arrayIndex:=1 to aminoAcidCount1 do
				  begin
				  if (bigArray[1, arrayIndex]=22) or (bigArray[2, arrayIndex]=22) then
					 begin
					 bigArray[1, arrayIndex]:=bigArray[1, arrayIndex]+26;
					 bigArray[2, arrayIndex]:=bigArray[2, arrayIndex]+26;
					 end;
				  end;	
			   end; {if aminocount1=aminocount2}

            writeln(myOutput,'>',geneName1);
            for arrayIndex:=1 to aminoAcidCount1 do
			   write(myOutput,numbertoletter(bigArray[1,arrayIndex]));
            writeln(myOutput);
            writeln(myOutput,'>',geneName2);
            for arrayIndex:=1 to aminoAcidCount2 do
			   write(myOutput,numbertoletter(bigArray[2,arrayIndex]));
            writeln(myOutput);

{count differences for the summary output}
            if (aminoAcidCount1=aminoAcidCount2) then
               begin
			   totalSites:=0;
			   totalDiffs:=0;
			   alignedSites:=0;
			   alignedDiffs:=0;
			   for arrayIndex:=1 to aminoAcidCount1 do
				  begin
				  if (bigArray[1, arrayIndex]<>47) and (bigArray[2, arrayIndex]<>47) then
					 begin
					 totalSites:=totalSites+1;
					 if bigArray[1, arrayIndex]<>bigArray[2, arrayIndex] then
						totalDiffs:=totalDiffs+1;
					 if bigArray[1, arrayIndex]<21 then
						begin
						alignedSites:=alignedSites+1;
						if bigArray[1, arrayIndex]<>bigArray[2, arrayIndex] then
						   alignedDiffs:=alignedDiffs+1;
						end;
					 end;
				  end;
			   writeln(mySummaryOutput, geneName1:10, ' ', geneName2:10, ' ', aminoAcidCount1:9, aminoAcidCount2:7, totalDiffs/totalSites:12:3, totalSites:6, alignedDiffs/alignedSites:13:3, alignedSites:6);
			   end;
            if (aminoAcidCount1<>aminoAcidCount2) then
			   writeln(mySummaryOutput, geneName1:10, ' ', geneName2:10, ' ', aminoAcidCount1:9, aminoAcidCount2:7, '           0', '     0', '           0', '     0');

            if not eof(myInput) then
               begin
               readln(myInput,geneName1);
               if length(geneName1)>10 then
                  delete(geneName1, 11, (length(geneName1)-10));
               if length(geneName1)<10 then
                  begin
                  addedSpace:=10-length(geneName1);
                  for spaceIndex:=1 to addedSpace do
                     geneName1:=geneName1+' ';
                  end;
               end;
            end; {while not eof}
         end;


begin

  writeln('=================================================================');
  writeln;
  writeln('                  Welcome to AmbiguityRemover.');
  writeln;
  writeln('For help on formatting the input file and other documentation, ');
  writeln(' or a version of this program for a different operating system, ');
  writeln(' see http://udel.edu/~mcdonald/asymmetryprograms.html.');
  writeln;
  writeln;
  writeln('Would you like to ignore sites adjacent to gaps in the alignment? ');
  write('Enter "y" or "n": ');
  readln(deGapYes);
  if deGapYes='Y' then
	 deGapYes:='y';
  if deGapYes<>'y' then
	 deGapYes:='n';
  writeln;

  if deGapYes='y' then
	 begin
	 inputCheck:=TRUE;
	 while inputCheck do
		begin
		writeln('How many adjacent aligned sites mark the edge of an ambiguously aligned region?');
		write('Enter a number from 1 to 10: ');
		readln(gapEdgeNum);
		writeln;
		if (gapEdgeNum>0) and (gapEdgeNum<11) then
		   inputCheck:=FALSE;
		if inputCheck then
		   writeln('Enter a number from 1 to 10: ');
		end;
	 end;

  writeln;
  inputCheck:=TRUE; {get the name of the input file; if it doesn't exist, keep trying}
  while inputCheck do
     begin
     writeln('The input file must be a plain text file (extension .txt)');
     writeln('   in the same folder as this program.');
     writeln;
     write('Name of the input file: ');
     readln(InputFileString);
     if InputFileString[length(InputFileString)-3]<>'.' then
        InputFileString:=InputFileString+'.txt';

{The next few lines get the full path to the program, remove the name of the program, then
 add the name of the input file. The result is that "inputFileString" is converted from
 "input.txt" to "/Users/username/sliderfolder/input.txt" (on a Mac) or "C:\sliderfolder\input.txt" (on Windows).}
     inputPath:=paramStr(0);
     lastSlashPos:=0;
     for stringIndex:=1 to length(inputPath) do
        begin
	    if (inputPath[stringIndex]='/') or (inputPath[stringIndex]='\') then
	       lastSlashPos:=stringIndex;
	    end;
     delete(inputPath, lastSlashPos+1, length(inputPath)-lastSlashPos);
     inputFileFullString:=inputPath+inputFileString;
     assign(myInput, inputFileFullString);
     {$I-}
        reset(myInput);
     {$I+}
     if IOresult=0 then
        inputCheck:=FALSE;
	 if inputCheck then 
        writeln('I can''t find a file named "',inputFileString,'", try again.'); 
     end;

  inputCheck:=TRUE; {get the name of the output file; if it already exists, ask before overwriting}
  while inputCheck do 
     begin 
     inputCheck:=FALSE; 
     writeln; 
     write('Name of the main output file: '); 
     readln(OutputFileString); 
     if OutputFileString[length(OutputFileString)-3]<>'.' then 
        OutputFileString:=OutputFileString+'.txt'; 
     OutputFileFullString:=inputPath+OutputFileString; 
     assign(myOutput, OutputFileFullString); 
     {$I-} 
        reset(myOutput); 
     {$I+} 
     if IOresult=0 then {file already exists} 
        begin 
        writeln('File "', outputfilestring, '" already exists. Do you want to overwrite it?'); 
        write('Enter "y" or "n": '); 
        readln(overWriteYes); 
        if overWriteYes='Y' then 
           overWriteYes:='y'; 
        if overWriteYes<>'y' then 
           begin 
           inputCheck:=TRUE; 
           close(myOutput); 
           end; 
        end; 
     end; 
  rewrite(myOutput); 


  inputCheck:=TRUE; {get the name of the summary output file; if it already exists, ask before overwriting}
  while inputCheck do 
     begin 
     inputCheck:=FALSE; 
     writeln; 
     write('Name of the summary output file: '); 
	 readln(OutputFileString); 
     if OutputFileString[length(OutputFileString)-3]<>'.' then 
        OutputFileString:=OutputFileString+'.txt'; 
     OutputFileFullString:=inputPath+OutputFileString; 
     assign(mySummaryOutput, OutputFileFullString); 
     {$I-} 
        reset(mySummaryOutput); 
     {$I+} 
     if IOresult=0 then {file already exists} 
        begin 
        writeln('File "', outputFileString, '" already exists. Do you want to overwrite it?'); 
        write('Enter "y" or "n": '); 
        readln(overWriteYes); 
        if overWriteYes='Y' then 
           overwriteYes:='y'; 
        if overWriteYes<>'y' then 
           begin 
           inputCheck:=TRUE; 
           close(mySummaryOutput); 
           end; 
        end; 
     end; 
  rewrite(mySummaryOutput); 


  DoAnalyzeInput;

  writeln;
  writeln;
  write('All done. Enter "q" to quit: ');
  readln(ActionChar);

  close(myInput);
  close(myOutput);
  close(mySummaryOutput);

end.



