#!/bin/bash
#SBATCH -t 48:00:00
#SBATCH --nodes=4 --ntasks-per-node=4
#SBATCH --mem=64g
#SBATCH --export=NONE
#SBATCH --mail-user=mramsey@uri.edu
#SBATCH --mail-type=BEGIN,END,FAIL

module load Bowtie2/2.3.5.1-GCC-8.3.0
module load HTSeq/0.11.2-foss-2019b-Python-3.7.4
module load R-bundle-Bioconductor/3.10-foss-2019b

#Set this to the output directory you want to work on everything in MAKE THIS DIRECTORY BEFORE STARTING
work_dir=/data/mramseylab/RNAseq/2024_valm/cmat_anaes/
#Set this to the fastq input file directory
fq_dir=/data/mramseylab/raw_reads/2024_valm/cmat_anaes/

#name of the input metadata file, must be created in specific way
filename=$work_dir'metadata.tsv'

genomedir=/data/mramseylab/genomes/
cmat1path=/data/mramseylab/genomes/cmat_atcc14266/
hpara1path=/data/mramseylab/genomes/hpara_392/
EL1path=/data/mramseylab/genomes/hpara_EL1/
#Note add more paths and genomes to the above list



mkdir alignment
mkdir figures
mkdir kegg_analysis
mkdir rnaseq_analysis

#This section scrapes the metadata file row by row, 
#each colum gets cut in order and used to assign variables 
#to the inputs for the rest of script
while read line
do
    #calls filename from column 1 of tsv to assign variable in the next function
    fastq=$(echo "$line" | cut -f1)
    #gets filename prefix so it can assign to R1 and R2 bowtie paired end reads
    PREFIX1="${fastq%R1.fastq.gz}"
    #gets informative name from tsv file column 2 to label htseq count output files for DeSeq2 script to run properly
    category=$(echo "$line" | cut -f2) 
    #below defines genome prefix name from tsv column 3
    index=$(echo "$line" | cut -f3)

    #This pulls column 3 prefix data and assigns to the right genome folder
    #FOR each new genome add path to directory at top of script and add prefix for bowtie genome indexing below
    case $index in
        cmatr)
            indexpath=$cmat1path
            ;;
        hpara)
            indexpath=$hpara1path
            ;;
        EL1)
            indexpath=$EL1path
            ;;
        *)
            indexpath=$nope
            echo "invalid genome prefix"
            ;;
    esac 




bowtie2 -x $indexpath"$index" -1 $fq_dir"$PREFIX1"R1.fastq.gz -2 $fq_dir"$PREFIX1"R2.fastq.gz -S $PREFIX1.sam
# for below line "-s reverse -t CDS" may not be correct
htseq-count -s reverse -t CDS -a 1 -m intersection-nonempty -i ID $PREFIX1.sam $index.gff > $category.txt

done < "$filename"


mv *.sam ./alignment    
Rscript deseq_tablesandkegg.R

