#!/usr/bin/perl # TninsertCounting_v4.pl # By Kathryn Ramsey (ramsey.kathryn.m@gmail.com) # Simon L. Dove Lab, Children's Hospital Boston, Harvard Medical School # March 21, 2018 # Modified version 4: 4/24/18 # Program requirements: # ".gff" in the same directory # "_ARTIST_Anno.txt" in the same directory (output from Annot_forARTIST_.pl program) # Program outputs: # _TnCounts.wig represents all the mapped transposon insertions # _HighConfCounts.wig" represents only the high confidence transposon insertions # _TAtally.txt" reports each TA site in the chromosome with # (a) corresponding annotated region and # (b) the number of mapped high confidence transposon insertions # _summary.txt" reports the size of the chromosome, the number of TA sites, and percent of total and high confidence TA sites hit. # USAGE: TninsertCounting_v4.pl .sam use strict; use warnings; my ($samfile, $outname, $outfile1, $outfile2, $outfile3, $sumfile, $TAtally); $samfile = shift @ARGV; unless ($samfile =~ /\.sam$/) {die "TninsertCounting.pl - INPUT ERROR!\n";} if ($samfile =~ /([0-9a-zA-Z_]*).sam/) { $outname = $1; $outfile1 = $outname."_TnCounts.wig"; $outfile2 = $outname."_HighConfCounts.wig"; $outfile3 = $outname."_TAtally.txt"; $sumfile = $outname."_summary.txt";} else { die "Error! I didn't recognize the input well enough to define an output file name: $!\n";} my ($line, $gname, $glength, $loc, $rlen, $fastafile, $genome_seq, %bp, @line, @headers); my $count = 0; open SAM, $samfile or die "Could not open '$samfile': $!\n"; while () { chomp; $line = $_; @line = split; $count++; if ($count == 2) { if ($line[1] =~ /SN:([0-9a-zA-Z_]+)$/) { $gname = $1; $fastafile = $gname.".fasta"; } if ($line[2] =~ /LN:([0-9]+)$/) { $glength = $1; } } elsif ($count > 3) { if ($line[1] == '0') { #If the read maps to the plus strand $rlen = length($line[9]); $loc = $line[3] + $rlen - 1; $bp{"$loc"} ++; #Only count inserts on the plus strand "T" of the AT dinucleotide! } if ($line[1] == '16') { #If the read maps to the minus strand $loc = $line[3]; $bp{"$loc"} ++; } } } close SAM; open FASTA, $fastafile or die "I couldn't open the fasta file that corresponds to this genome: $!\n"; @headers = split /\|/, ; while () { chomp; $genome_seq = $genome_seq . $_; } close FASTA; my (%anno, @geneline, $curT, $curA, $curTA, %allTA, %confTA, %TAtally); my $search = 0; my $TA = 0; my $TAcount = 0; my $Annofile = $gname."_ARTIST_Anno.txt"; open ANNO, $Annofile or die "I couldn't open the annotation file ($Annofile) that corresponds to this genome: $!\n"; while () { chomp; @geneline = split; $anno{$geneline[0]} = $geneline[1]; } close ANNO; open OUT3, ">$outfile3" or die "Could not open $outfile3': $!\n"; print OUT3 "Chromosome\tTAsite\tRegion\tCounts\n"; while ($TA != -1) { $TA = index($genome_seq, "TA", $search); #Note that $search sets where to start looking for the next TA dinucleotide if ($TA == -1) {last;} $TAcount++; $curT = $TA+1; $curA = $TA+2; $TAtally{$TAcount} = $curT; #The location of all the TA sites in the genome $search = $curA; if (exists $bp{$curT} && exists $bp{$curA}) { $allTA{$curT} = $bp{$curT} + $bp{$curA}; #All identified insertions at TA sites $confTA{$curT} = $allTA{$curT}; #Only insertions at TA sites with reads mapping from both ends of the transposon print OUT3 "$gname\t$curT\t".$anno{$curT}."\t".$confTA{$curT}."\n"; } elsif (exists $bp{$curT}) { $allTA{$curT} = $bp{$curT}; print OUT3 "$gname\t$curT\t".$anno{$curT}."\t0\n"; } elsif (exists $bp{$curA}) { $allTA{$curT} = $bp{$curA}; print OUT3 "$gname\t$curT\t".$anno{$curT}."\t0\n"; } else { print OUT3 "$gname\t$curT\t".$anno{$curT}."\t0\n"; } } close OUT3; open OUT1, ">$outfile1" or die "Could not open $outfile1': $!\n"; open OUT2, ">$outfile2" or die "Could not open $outfile2': $!\n"; open SUMMARY, ">$sumfile" or die "Could not open $sumfile': $!\n"; my ($curbp, $TApercent, $ConfTApercent); my $TAcount2 = 0; my $TAcount3 = 0; ##To create a file with only the (high-confidence) TA sites and counts: print OUT1 "track type=wiggle_0 name=$outname description=$outfile1 visibility=full color=158,67,161 priority=82 maxHeightPixels=50:50:11 variableStep chrom=$gname span=1\n"; print OUT2 "track type=wiggle_0 name=$outname\_HighConf description=$outfile2 visibility=full color=158,67,161 priority=82 maxHeightPixels=50:50:11 variableStep chrom=$gname span=1\n"; foreach $curbp (1..$glength) { if (exists $allTA{$curbp}) { $TAcount2++; print OUT1 "$curbp\t".$allTA{$curbp}."\n"; if (exists $confTA{$curbp}){ $TAcount3++; print OUT2 "$curbp\t".$confTA{$curbp}."\n"; } else { print OUT2 "$curbp\t0\n"; } } else { print OUT1 "$curbp\t0\n"; print OUT2 "$curbp\t0\n"; } } $TApercent = ($TAcount2/$TAcount); $ConfTApercent = ($TAcount3/$TAcount); print SUMMARY "Chromosome:\t$gname\nSize:\t$glength\nTA sites:\t$TAcount\nNumber of TA sites hit:\t$TAcount2\nPercent TA sites hit:\t$TApercent Number of TA sites hit with high confidence:\t$TAcount3\nPercent of TA sites hit with high confidence:\t$ConfTApercent\n"; close OUT1; close OUT2; close SUMMARY;