program AsymmetryCounter; {Written by: John H. McDonald} { Department of Biological Sciences} { University of Delaware} { Newark, Delaware 19716} { http://udel.edu/~mcdonald} { mcdonald@udel.edu} const maxGeneLength = 10000; var myInput: text; {the input data file} myOutput: text; {the output data file} myMatrixOutput: text; {the substitution matrix output file, for input into AsymmetryScaler} inputCheck: boolean; {used in checking the user input for duplicate file names, etc.} aaCountArray: array[1..2,1..22] of longint; {used in counting frequencies of amino acids in two species; [n, 22] is total} lastSlashPos: integer; {used in separating file path from file name} stringIndex: integer; {used in separating file path from file name} InputFileString: string[250]; {user-provided name of input file} OutputFileString: string[250]; {user-provided name of output file} outputFileFullString: string[250]; {path+name of output file} inputFileFullString: string[250]; {path+name of input file} inputPath: string[250]; {path of program file, prepended to user-input file names to make full names} actionChar: char; {y/n response from user} overWriteYes: char; {y/n response from user} {====================This function returns the greater of two integers.=====================} function GreaterInteger (x, y: integer): integer; var First, Greater: integer; begin First := x; Greater := y; if First > Greater then Greater := First; GreaterInteger := Greater; end; {=====================This function returns the lesser of two integers.=====================} function LesserInteger (x, y: integer): integer; var First, Lesser: integer; begin First := x; Lesser := y; if First < Lesser then Lesser := First; LesserInteger := Lesser; end; {====This function does exponentiation.=============} function RaiseTo (x, y: extended): extended; var Result: extended; begin Result:=exp(y*ln(x)); RaiseTo:=Result; end; {======This function returns a number times its natural log, or 0 if the number is 0.=====} function flnf (x: integer): extended; var F: integer; Result: extended; begin F := x; Result := 0; if F > 0 then Result := F * ln(F); flnf := Result; end; {=====This function does a G test of goodness-of-fit, comparing to an expected 50:50 ratio.====} function Gtest(x,y:longint):extended; var A,B:real; Result,expected:extended; begin A:=x*1.0; B:=y*1.0; expected:=(A+B)/2.0; Result:=0; if (A>0) and (B>0) then Result:=2*(A*ln(A/expected)+B*ln(B/expected)); if (A>0) and (B=0) then Result:=2*(A*ln(A/expected)); if (A=0) and (B>0) then Result:=2*(B*ln(B/expected)); if (A>0) and (B>0) then Result:=result/(1+1/(2*(A+B))); {Williams correction} Gtest:=Result; end; {=====This function does the exact binomial test, comparing to an expected 50:50 ratio.===} function Binomial(x,y:longint):extended; var N, S, T: longint; factIndex, sIndex: integer; Result, Top, Bottom, Binom:extended; begin N:=x+y; S:=LesserInteger(x,y); T:=GreaterInteger(x,y); Result:=RaiseTo(0.5,N); if S=T then Result:=0.5; if (S<>T) and (S>0) and (T>0) then begin for sIndex:=1 to S do begin binom:=RaiseTo(0.5,sIndex)*RaiseTo(0.5,N-sIndex); Top:=1; Bottom:=1; for factIndex:=(N-sIndex+1) to N do Top:=Top*factIndex; for factIndex:=1 to sIndex do Bottom:=Bottom*factIndex; Result:=Result+(top/bottom)*Binom; end; end; Binomial:=2*Result; end; {=======This function calculates the P-value for the chi-square with one degree of freedom,====} {=======by interpolating in a table of values. I'm sure there are better ways to do this, but it works.=====} function ChiProb(x: extended):extended; var P: array[0..300] of extended; result, chi: extended; begin P[0]:=0; P[1]:=-0.285245531; P[2]:=-0.423546323; P[3]:=-0.538055647; P[4]:=-0.640385376; P[5]:=-0.735011126; P[6]:=-0.824217529; P[7]:=-0.909355598; P[8]:=-0.991301572; P[9]:=-1.070661426; P[10]:=-1.147873503; P[11]:=-1.223269975; P[12]:=-1.297105078; P[13]:=-1.369581297; P[14]:=-1.440861504; P[15]:=-1.511080144; P[16]:=-1.580348535; P[17]:=-1.648760639; P[18]:=-1.716396445; P[19]:=-1.78332488; P[20]:=-1.849605143; P[21]:=-1.915289392; P[22]:=-1.980423497; P[23]:=-2.045047505; P[24]:=-2.109197646; P[25]:=-2.172905679; P[26]:=-2.236200706; P[27]:=-2.299108349; P[28]:=-2.361652416; P[29]:=-2.423854038; P[30]:=-2.485732506; P[31]:=-2.547305872; P[32]:=-2.608590238; P[33]:=-2.669600561; P[34]:=-2.730350508; P[35]:=-2.79085306; P[36]:=-2.85111986; P[37]:=-2.911161841; P[38]:=-2.970989053; P[39]:=-3.03061121; P[40]:=-3.090037008; P[41]:=-3.149274702; P[42]:=-3.208332022; P[43]:=-3.267216099; P[44]:=-3.325933986; P[45]:=-3.384491948; P[46]:=-3.442896009; P[47]:=-3.501151854; P[48]:=-3.559264843; P[49]:=-3.617240048; P[50]:=-3.675082263; P[51]:=-3.732795941; P[52]:=-3.790385584; P[53]:=-3.847855176; P[54]:=-3.905208603; P[55]:=-3.962449557; P[56]:=-4.019581556; P[57]:=-4.076607948; P[58]:=-4.133531929; P[59]:=-4.190356544; P[60]:=-4.247084616; P[61]:=-4.303719111; P[62]:=-4.360262591; P[63]:=-4.4167176; P[64]:=-4.473086576; P[65]:=-4.529371854; P[66]:=-4.585575675; P[67]:=-4.641700187; P[68]:=-4.697747454; P[69]:=-4.753719458; P[70]:=-4.809618104; P[71]:=-4.865445223; P[72]:=-4.92120258; P[73]:=-4.976891871; P[74]:=-5.032514731; P[75]:=-5.088072668; P[76]:=-5.143567341; P[77]:=-5.199000142; P[78]:=-5.254372484; P[79]:=-5.309685734; P[80]:=-5.364941208; P[81]:=-5.420140182; P[82]:=-5.475283886; P[83]:=-5.530373513; P[84]:=-5.585410213; P[85]:=-5.640395103; P[86]:=-5.695329261; P[87]:=-5.750213735; P[88]:=-5.805049535; P[89]:=-5.859837646; P[90]:=-5.914579018; P[91]:=-5.969274575; P[92]:=-6.023925212; P[93]:=-6.0785318; P[94]:=-6.133095181; P[95]:=-6.187616176; P[96]:=-6.242095581; P[97]:=-6.296534118; P[98]:=-6.350932644; P[99]:=-6.405291835; P[100]:=-6.459612403; P[101]:=-7.000837153; P[102]:=-7.538856708; P[103]:=-8.07414018; P[104]:=-8.607059703; P[105]:=-9.137915728; P[106]:=-9.666954294; P[107]:=-10.19437961; P[108]:=-10.72036304; P[109]:=-11.24504985; P[110]:=-11.76856426; P[111]:=-12.29101336; P[112]:=-12.8124901; P[113]:=-13.33307577; P[114]:=-13.85284176; P[115]:=-14.37185121; P[116]:=-14.89016018; P[117]:=-15.40781871; P[118]:=-15.92487165; P[119]:=-16.44135932; P[120]:=-16.95731816; P[121]:=-17.47278115; P[122]:=-17.98777831; P[123]:=-18.50233697; P[124]:=-19.01648214; P[125]:=-19.53023672; P[126]:=-20.04362177; P[127]:=-20.55665669; P[128]:=-21.06935938; P[129]:=-21.58174639; P[130]:=-22.09383308; P[131]:=-22.60563371; P[132]:=-23.11716152; P[133]:=-23.62842888; P[134]:=-24.13944733; P[135]:=-24.65022764; P[136]:=-25.16077991; P[137]:=-25.67111361; P[138]:=-26.18123763; P[139]:=-26.69116032; P[140]:=-27.20088954; P[141]:=-27.71043272; P[142]:=-28.21979684; P[143]:=-28.72898851; P[144]:=-29.23801398; P[145]:=-29.74687915; P[146]:=-30.25558964; P[147]:=-30.76415075; P[148]:=-31.27256753; P[149]:=-31.78084477; P[150]:=-32.28898703; P[151]:=-32.79699865; P[152]:=-33.30488375; P[153]:=-33.81264627; P[154]:=-34.32028998; P[155]:=-34.82781845; P[156]:=-35.33523512; P[157]:=-35.84254326; P[158]:=-36.34974601; P[159]:=-36.85684636; P[160]:=-37.36384718; P[161]:=-37.87075124; P[162]:=-38.37756117; P[163]:=-38.88427951; P[164]:=-39.39090869; P[165]:=-39.89745104; P[166]:=-40.40390882; P[167]:=-40.91028419; P[168]:=-41.41657922; P[169]:=-41.92279591; P[170]:=-42.4289362; P[171]:=-42.93500193; P[172]:=-43.44099491; P[173]:=-43.94691685; P[174]:=-44.45276941; P[175]:=-44.95855422; P[176]:=-45.46427282; P[177]:=-45.9699267; P[178]:=-46.47551733; P[179]:=-46.98104609; P[180]:=-47.48651435; P[181]:=-47.99192342; P[182]:=-48.49727456; P[183]:=-49.00256899; P[184]:=-49.50780793; P[185]:=-50.0129925; P[186]:=-50.51812384; P[187]:=-51.02320302; P[188]:=-51.52823109; P[189]:=-52.03320908; P[190]:=-52.53813797; P[191]:=-57.58492454; P[192]:=-62.62770369; P[193]:=-67.66710828; P[194]:=-72.70363215; P[195]:=-77.73766805; P[196]:=-82.76953349; P[197]:=-87.79948883; P[198]:=-92.82775021; P[199]:=-97.85449903; P[200]:=-102.879889; P[201]:=-107.9040516; P[202]:=-112.9271; P[203]:=-117.9491326; P[204]:=-122.9702349; P[205]:=-127.9904826; P[206]:=-133.009942; P[207]:=-138.0286724; P[208]:=-143.0467262; P[209]:=-148.0641509; P[210]:=-153.0809886; P[211]:=-158.0972777; P[212]:=-163.1130528; P[213]:=-168.1283454; P[214]:=-173.143184; P[215]:=-178.1575949; P[216]:=-183.1716021; P[217]:=-188.1852275; P[218]:=-193.1984915; P[219]:=-198.2114126; P[220]:=-203.2240082; P[221]:=-208.2362943; P[222]:=-213.2482857; P[223]:=-218.2599962; P[224]:=-223.2714387; P[225]:=-228.2826252; P[226]:=-233.2935668; P[227]:=-238.3042742; P[228]:=-243.314757; P[229]:=-248.3250246; P[230]:=-253.3350855; result:=1; if x<=10 then begin chi:=x*10; result:=exp(P[trunc(chi)]-(chi-trunc(chi))*(P[trunc(chi)]-P[trunc(chi+1)])); end; if (x>10) and (x<=100) then begin chi:=x+90; result:=exp(P[trunc(chi)]-(chi-trunc(chi))*(P[trunc(chi)]-P[trunc(chi+1)])); end; if (x>100) and (x<=500) then begin chi:=(x-100)/10+190; result:=exp(P[trunc(chi)]-(chi-trunc(chi))*(P[trunc(chi)]-P[trunc(chi+1)])); end; chiProb:=result; end; {=========This function converts amino acid symbols to numbers, for easier manipulation.================} function letterToNumber(x:char):integer; var letter:char; result:integer; begin letter:=x; case letter of 'A': result:=1; 'C': result:=2; 'D': result:=3; 'E': result:=4; 'F': result:=5; 'G': result:=6; 'H': result:=7; 'I': result:=8; 'K': result:=9; 'L': result:=10; 'M': result:=11; 'N': result:=12; 'P': result:=13; 'Q': result:=14; 'R': result:=15; 'S': result:=16; 'T': result:=17; 'V': result:=18; 'W': result:=19; 'Y': result:=20; 'X': result:=21; '-': result:=22; otherwise result:=23; end; letterToNumber:=result; end; {====This function converts numbers back into amino acid symbols, for output.====} function NumberToLetter(x:integer):string; var number:integer; result:string; begin number:=x; case number of 1: result:='A'; 2: result:='C'; 3: result:='D'; 4: result:='E'; 5: result:='F'; 6: result:='G'; 7: result:='H'; 8: result:='I'; 9: result:='K'; 10: result:='L'; 11: result:='M'; 12: result:='N'; 13: result:='P'; 14: result:='Q'; 15: result:='R'; 16: result:='S'; 17: result:='T'; 18: result:='V'; 19: result:='W'; 20: result:='Y'; 21: result:='X'; 22: result:='-'; otherwise result:='?'; end; numberToLetter:=result; end; {====Reads in the data from the input file, counts the number of each pair of amino acids, then outputs the results.======} procedure DoAnalyzeInput; var geneName: string; {the name of each gene} aminoAcid: char; {amino acid symbols, as read from the input file} bigArray:array[1..2,1..maxGeneLength] of integer; {array of amino acid sequences for one gene in two species} FirstBactIndex, SecondBactIndex, ArrayIndex, GeneIndex: longint; {indexes of for loops} AminoAcidCount1: longint; {number of amino acids in gene in species 1} AminoAcidCount2: longint; {number of amino acids in gene in species 2} numberAminoAcid: integer; {amino acid symbol translated into a number} TotalAB: longint; {total number of sites with amino acid A in sp. 1 and B in sp. 2} TotalBA: longint; {total number of sites with amino acid B in sp. 1 and A in sp. 2} statTest: extended; {G-value of the G-test of goodness-of-fit} pValue: extended; {P-value of the G-test or binomial test} totalArray:array[1..22,1..22] of longint; {substitution matrix--total number of substitutions in each direction} begin for arrayIndex:=1 to 22 do {initialize counting arrays} begin aaCountArray[1,arrayIndex]:=0; aaCountArray[2,arrayIndex]:=0; for secondBactIndex:=1 to 22 do totalArray[arrayIndex, secondBactIndex]:=0; end; read(myInput, aminoAcid); {reads the first '>' in the file} readln(myInput, geneName); geneIndex:=0; while not eof(myInput) do begin geneIndex:=geneIndex+1; aminoAcidCount1:=0; aminoAcid:='-'; while (aminoAcid<>'>') and (not eof(myInput)) do {">"=start of gene from other species} begin read(myInput, aminoAcid); numberAminoAcid:=lettertonumber(aminoAcid); if numberAminoAcid<22 then {22 is '-' or lowercase letter} begin aminoAcidCount1:=aminoAcidCount1+1; aaCountArray[1,numberAminoAcid]:=aaCountArray[1,numberAminoAcid]+1; aaCountArray[1,22]:=aaCountArray[1,22]+1; bigArray[1,aminoAcidCount1]:=numberAminoAcid; end; end; aminoAcid:='-'; readln(myInput);{start of gene from second species} aminoAcidCount2:=0; while (aminoAcid<>'>') and (not eof(myInput)) do {">"=start of next gene} begin read(myInput,aminoAcid); numberAminoAcid:=lettertonumber(aminoAcid); if numberAminoAcid<22 then begin aminoAcidCount2:=AminoAcidCount2+1; aaCountArray[2,numberAminoAcid]:=aaCountArray[2,numberAminoAcid]+1; aaCountArray[2,22]:=aaCountArray[2,22]+1; bigArray[2,aminoAcidCount2]:=numberAminoAcid; if (bigArray[1,AminoAcidCount2]<22) and (bigArray[2,AminoAcidCount2]<22) then begin totalArray[bigArray[1, aminoAcidCount2],bigArray[2, aminoAcidCount2]]:= totalArray[bigArray[1, aminoAcidCount2],bigArray[2, aminoAcidCount2]]+1; end; end; end; if not eof(myInput) then readln(myInput,geneName); {start with next gene} end; writeln(myOutput,'Amino Acid Composition at unambiguously aligned sites'); {start writing totals to output file} writeln(myOutput); writeln(myOutput,'AA sp. 1 sp. 2 '); writeln(myOutput,'-- ------- -------'); for arrayIndex:=1 to 21 do writeln(myOutput,numbertoletter(arrayIndex):2, (aaCountArray[1,arrayIndex]/aaCountArray[1,22]):11:5, (aaCountArray[2,arrayIndex]/aaCountArray[2,22]):10:5); writeln(myOutput); writeln(myOutput,'total', aaCountArray[1,22]:8, aaCountArray[2,22]:10); writeln(myOutput); writeln(myOutput); writeln(myOutput,' sp. 1-->sp. 2 G-value P'); writeln(myOutput,'-------------------------------- -------- ---------'); for FirstBactIndex:=1 to 20 do begin for SecondBactIndex:=FirstBactIndex to 20 do begin write(myOutput,NumberToLetter(FirstBactIndex), '-->',NumberToLetter(SecondBactIndex),' '); TotalAB:=totalArray[firstBactIndex, secondBactIndex]; write(myOutput, TotalAB:5, ' '); TotalBA:=totalArray[secondBactIndex, firstBactIndex]; write(myOutput,NumberToLetter(SecondBactIndex), '-->',NumberToLetter(FirstBactIndex),' ', totalBA:5); if (TotalBA+TotalAB)<50 then {do binomial test of goodness-of-fit} begin Pvalue:=binomial(totalAB, totalBA); if Pvalue>=0.00001 then write(myOutput, ' ----', ' ', Pvalue:12:7); if Pvalue<0.00001 then write(myOutput, ' ----', ' ',Pvalue:12); end; if (totalBA+totalAB)>49 then {do G-test of goodness-of-fit} begin statTest:=Gtest(totalAB, totalBA); if statTest<454 then begin Pvalue:=chiProb(statTest); if Pvalue>=0.00001 then write(myOutput, ' ', statTest:10:4, ' ',Pvalue:12:7); if Pvalue<0.00001 then write(myOutput, ' ', statTest:10:4, ' ',Pvalue:12); end; if statTest>=454 then write(myOutput, ' ', statTest:10:4, ' <1.00E-100'); end; if Pvalue<0.05 then {write one, two or three stars for significant P-values} write(myOutput, '*'); if Pvalue<0.01 then write(myOutput, '*'); if Pvalue<0.001 then write(myOutput, '*'); writeln(myOutput); end; end; writeln(myOutput); for FirstBactIndex:=1 to 21 do {write results for amino acid symbol 'X' to output file} begin for SecondBactIndex:=21 to 21 do begin write(myOutput,NumberToLetter(FirstBactIndex), '-->',NumberToLetter(SecondBactIndex),' '); TotalAB:=totalArray[firstBactIndex, secondBactIndex]; write(myOutput, TotalAB:5, ' '); TotalBA:=totalArray[secondBactIndex, firstBactIndex]; write(myOutput,NumberToLetter(SecondBactIndex), '-->',NumberToLetter(FirstBactIndex),' ', totalBA:5); if (TotalBA+TotalAB)<50 then begin Pvalue:=binomial(totalAB, totalBA); if Pvalue>=0.00001 then write(myOutput, ' ----', ' ',Pvalue:12:7); if Pvalue<0.00001 then write(myOutput, ' ----', ' ',Pvalue:12); end; if (totalBA+totalAB)>49 then begin statTest:=Gtest(totalAB, totalBA); if statTest<454 then begin Pvalue:=chiProb(statTest); if Pvalue>=0.00001 then write(myOutput, ' ', statTest:10:4, ' ',Pvalue:12:7); if Pvalue<0.00001 then write(myOutput, ' ', statTest:10:4, ' ',Pvalue:12); end; if statTest>=454 then write(myOutput, ' ', statTest:10:4, ' <1.00E-100'); end; if Pvalue<0.05 then write(myOutput, '*'); if Pvalue<0.01 then write(myOutput, '*'); if Pvalue<0.001 then write(myOutput, '*'); writeln(myOutput); end; end; writeln(myMatrixOutput, ' amino acid in sp. 2'); {start writing results to substitution matrix file} write(myMatrixOutput, 'sp. 1 '); for arrayIndex:=1 to 20 do write(myMatrixOutput, NumberToLetter(arrayIndex):6); writeln(myMatrixOutput); for FirstBactIndex:=1 to 20 do begin write(myMatrixOutput, NumberToLetter(firstBactIndex):6); for SecondBactIndex:=1 to 20 do write(myMatrixOutput,totalArray[firstBactIndex,secondBactIndex]:6); writeln(myMatrixOutput); end; end; begin writeln('================================================================='); writeln; writeln(' Welcome to AsymmetryCounter.'); writeln; writeln('For help on formatting the input file and other documentation, '); writeln(' or a version of this program for a different operating system, '); writeln(' see http://udel.edu/~mcdonald/asymmetryprograms.html.'); writeln; inputCheck:=TRUE; {get the name of the input file; if it doesn't exist, keep trying} while inputCheck do begin writeln('The input file must be a plain text file (extension .txt)'); writeln(' in the same folder as this program.'); writeln; write('Name of the input file: '); readln(InputFileString); if InputFileString[length(InputFileString)-3]<>'.' then InputFileString:=InputFileString+'.txt'; {The next few lines get the full path to the program, remove the name of the program, then add the name of the input file. The result is that "inputFileString" is converted from "input.txt" to "/Users/username/sliderfolder/input.txt" (on a Mac) or "C:\sliderfolder\input.txt" (on Windows).} inputPath:=paramStr(0); lastSlashPos:=0; for stringIndex:=1 to length(inputPath) do begin if (inputPath[stringIndex]='/') or (inputPath[stringIndex]='\') then lastSlashPos:=stringIndex; end; delete(inputPath, lastSlashPos+1, length(inputPath)-lastSlashPos); inputFileFullString:=inputPath+inputFileString; assign(myInput, inputfilefullstring); {$I-} reset(myInput); {$I+} if IOresult=0 then inputCheck:=FALSE; if inputCheck then writeln('I can''t find a file named "',inputfilestring,', try again.'); end; inputCheck:=TRUE; {get the name of the output file; if it already exists, ask before overwriting} while inputCheck do begin inputCheck:=FALSE; writeln; write('Name of the main output file: '); readln(OutputFileString); if OutputFileString[length(OutputFileString)-3]<>'.' then OutputFileString:=OutputFileString+'.txt'; OutputFileFullString:=inputPath+OutputFileString; assign(myOutput, OutputFileFullString); {$I-} reset(myOutput); {$I+} if IOresult=0 then {file already exists} begin writeln('File "', outputfilestring, '" already exists. Do you want to overwrite it?'); write('Enter "y" or "n": '); readln(overWriteYes); if overWriteYes='Y' then overWriteYes:='y'; if overWriteYes<>'y' then begin inputCheck:=TRUE; close(myOutput); end; end; end; rewrite(myOutput); inputCheck:=TRUE; {get the name of the matrix output file; if it already exists, ask before overwriting} while inputCheck do begin inputCheck:=FALSE; writeln; write('Name of the substitution matrix output file: '); readln(OutputFileString); if OutputFileString[length(OutputFileString)-3]<>'.' then OutputFileString:=OutputFileString+'.txt'; OutputFileFullString:=inputPath+OutputFileString; assign(myMatrixOutput, OutputFileFullString); {$I-} reset(myMatrixOutput); {$I+} if IOresult=0 then {file already exists} begin writeln('File "', outputFileString, '" already exists. Do you want to overwrite it?'); write('Enter "y" or "n": '); readln(overWriteYes); if overWriteYes='Y' then overwriteYes:='y'; if overWriteYes<>'y' then begin inputCheck:=TRUE; close(myMatrixOutput); end; end; end; rewrite(myMatrixOutput); DoAnalyzeInput; writeln; writeln; write('All done. Enter "q" to quit: '); readln(ActionChar); close(myMatrixOutput); close(myOutput); close(myInput); end.