#!/bin/bash
# Updated on Jan-25-2022, by CX Qiu
#
# This scRNA-seq pipeline accepts an input folder and then uses the default
# parameters for the data processing and analysis.
#
# The per-task suffix of the output folder and of the sample-ID list is taken
# from SGE_TASK_ID (presumably set by the SGE scheduler for array jobs -- TODO
# confirm how this script is submitted).

# identifier of the sequencing run; used to build the input/output paths below
run_ID="run_28_birth_20221208"

# project root that holds the per-run folders
my_folder="/net/shendure/vol2/projects/cxqiu/JAX_rna"

# task index; selects sample_ID_<num>.txt and output_<num>
num="${SGE_TASK_ID}"

# define the fastq folder including all fastq files
fastq_folder="${my_folder}/${run_ID}/nobackup/fastq"

# make sure not to put a slash on the end of this path
work_folder="/net/shendure/vol8/projects/cxqiu/${run_ID}/nobackup/output_${num}"

# define the PCR group id after demultiplexing
# (text file listing the sample IDs handled by this task)
sample_ID="${my_folder}/${run_ID}/sample_ID_${num}.txt"

# define the core number for parallel processing
# (core_sam is the value handed to the SAM filtering step)
core=12
core_sam=3

# define the number of unique reads cutoff for splitting single cell
cutoff=200

# mouse genome
index="/net/shendure/vol10/nobackup/genome/STAR/mm10"
gtf_file="/net/shendure/vol10/nobackup/genome/GTF/gencode.vM12.chr_patch_hapl_scaff.annotation.gtf.gz"

# define the mismatch rate for removing duplicates:
mismatch=1

# define the bin of python (python V2.7)
python_path="/net/shendure/vol12/projects/sciRNAseq_script/anaconda2/bin/"

# define the bin of R (R V3.6.3)
R_path="/net/gs/vol1/home/cxqiu/R/bin/"

# define the location of script:
script_path="/net/gs/vol1/home/cxqiu/work/scripts/JAX_rna/Jun_pipeline"
# define the location of the ligation barcodes
ligation_barcode="${script_path}/lig_384_bc.pickle2"
# define the location of the RT barcodes
RT_barcode="${script_path}/RT_384_bc.pickle2"
# define the location of the combined RT and ligation barcodes
barcodes="${script_path}/combined_384_bc.txt"
# define the location of the R script for multi-core processing
Jun_R_script="${script_path}/sci3_bash_input_ID_output_core.R"

# Log the start time of this run.
now="$(date)"
printf 'Current time : %s\n' "$now"

# Load the module system first, then the pinned command-line tools.
module load modules modules-init modules-gs
for tool_module in samtools/1.9 bedtools/2.27.1 STAR/2.6.1d; do
    module load "$tool_module"
done
unset tool_module

# Per-step output folders, all located under the work folder.
UMI_attach_folder="${work_folder}/UMI_attach"
trimmed_fastq_folder="${work_folder}/trimmed_fastq"
STAR_alignment_folder="${work_folder}/STAR_alignment"
filtered_sam_folder="${work_folder}/filtered_sam"
rmdup_sam_folder="${work_folder}/rmdup_sam"
sam_splitted_folder="${work_folder}/sam_splitted"
report_folder="${work_folder}/report"

##########################
### Step 1: UMI attach ###
##########################
### ignoring I1 and I2 (PCR well information)
### adding R1 (barcode information) to R2 (mRNA)

# Read the whitespace-separated sample IDs without pathname expansion.
if [[ ! -r "$sample_ID" ]]; then
    echo "ERROR: cannot read sample list: $sample_ID" >&2
    exit 1
fi
samples=()
while read -r -a fields || (( ${#fields[@]} > 0 )); do
    samples+=("${fields[@]}")
done < "$sample_ID"
if (( ${#samples[@]} == 0 )); then
    echo "ERROR: sample list is empty: $sample_ID" >&2
    exit 1
fi

# Rename the raw fastq files to <sample>.R1.fastq.gz / <sample>.R2.fastq.gz.
# A file that already carries the target name is left alone, so the step can
# be re-run; anything other than exactly one candidate is reported instead of
# being handed to mv as an ambiguous multi-source move.
echo "changing the name of the fastq files..."
for sample in "${samples[@]}"; do
    echo "changing name $sample"
    for read_tag in R1 R2; do
        target="$fastq_folder/$sample.$read_tag.fastq.gz"
        if [[ -e "$target" ]]; then
            continue
        fi
        candidates=("$fastq_folder"/*"$sample"*"$read_tag"*.gz)
        # an unmatched glob stays literal, hence the -e test
        if (( ${#candidates[@]} != 1 )) || [[ ! -e "${candidates[0]}" ]]; then
            echo "ERROR: expected exactly one $read_tag fastq for sample $sample in $fastq_folder, found: ${candidates[*]}" >&2
            exit 1
        fi
        if ! mv -- "${candidates[0]}" "$target"; then
            echo "ERROR: could not rename ${candidates[0]} to $target" >&2
            exit 1
        fi
    done
done

echo "Attaching barcode and UMI...."
mkdir -p "$UMI_attach_folder"
if ! "$python_path/python" \
    "$script_path/UMI_barcode_attach_gzipped_with_dic.py" \
    "$fastq_folder" \
    "$sample_ID" \
    "$UMI_attach_folder" \
    "$ligation_barcode" \
    "$RT_barcode" \
    "$core"; then
    echo "ERROR: UMI/barcode attachment failed" >&2
    exit 1
fi

echo ">>>Step 1 (UMI attach) has been done."


##################################
### Step 2: Trimming the read2 ###
##################################

echo
echo "Start trimming the read2 file..."

# The R driver receives: the per-sample shell script to run, the input folder,
# the sample-ID list, the output folder and the number of cores.
# Stop here on failure so later steps do not run on missing input.
if ! "$R_path/Rscript" \
    "$Jun_R_script" \
    "$script_path/sci3_trim_beth.sh" \
    "$UMI_attach_folder" \
    "$sample_ID" \
    "$trimmed_fastq_folder" \
    "$core"; then
    echo "ERROR: trimming step failed" >&2
    exit 1
fi

echo ">>>Step 2 (trmming read2 file) has been done."


##############################
### Step 3: Aligning reads ###
##############################

echo
echo "Start aligning reads..."

# Read the whitespace-separated sample IDs without pathname expansion.
if [[ ! -r "$sample_ID" ]]; then
    echo "ERROR: cannot read sample list: $sample_ID" >&2
    exit 1
fi
samples=()
while read -r -a fields || (( ${#fields[@]} > 0 )); do
    samples+=("${fields[@]}")
done < "$sample_ID"

mkdir -p "$STAR_alignment_folder"
# Ask STAR to drop any genome left in shared memory before loading it again.
# NOTE(review): the exit status is deliberately not checked, as in the
# original -- this call may fail when nothing is loaded.
STAR --genomeDir "$index" --genomeLoad Remove

align_failed=0
for sample in "${samples[@]}"; do
    echo "Aligning $sample"
    trimmed_reads=("$trimmed_fastq_folder/$sample"*gz)
    # an unmatched glob stays literal; do not hand a non-existent path to STAR
    if [[ ! -e "${trimmed_reads[0]}" ]]; then
        echo "ERROR: no trimmed fastq matches $trimmed_fastq_folder/$sample*gz" >&2
        align_failed=1
        continue
    fi
    if (( ${#trimmed_reads[@]} > 1 )); then
        echo "WARNING: ${#trimmed_reads[@]} files match sample $sample and are all passed to --readFilesIn: ${trimmed_reads[*]}" >&2
    fi
    if ! STAR \
        --runThreadN "$core" \
        --outSAMstrandField intronMotif \
        --genomeDir "$index" \
        --readFilesCommand zcat \
        --readFilesIn "${trimmed_reads[@]}" \
        --outFileNamePrefix "$STAR_alignment_folder/$sample" \
        --genomeLoad LoadAndKeep \
        --outReadsUnmapped Fastx; then
        echo "ERROR: STAR failed for sample $sample" >&2
        align_failed=1
    fi
done
# Always release the shared-memory genome, even when a sample failed.
STAR --genomeDir "$index" --genomeLoad Remove

if (( align_failed )); then
    echo "ERROR: alignment failed for at least one sample" >&2
    exit 1
fi

echo ">>>Step 3 (aligning reads) has been done."


#############################
### Step 4: Filtering SAM ###
#############################

echo
echo "Start filter and sort the sam files..."

# The R driver runs sci3_filter.sh over the samples with core_sam workers.
if ! "$R_path/Rscript" \
    "$Jun_R_script" \
    "$script_path/sci3_filter.sh" \
    "$STAR_alignment_folder" \
    "$sample_ID" \
    "$filtered_sam_folder" \
    "$core_sam"; then
    echo "ERROR: SAM filtering failed; intermediate folders were kept" >&2
    exit 1
fi

# The next step reads *.sam from the filtered folder; never delete the
# upstream data unless that output actually exists.
if ! compgen -G "$filtered_sam_folder/*.sam" > /dev/null; then
    echo "ERROR: no filtered .sam files in $filtered_sam_folder; intermediate folders were kept" >&2
    exit 1
fi

echo ">>>Step 4 (filtering sam) has been done."

### deleted the UMI_attach, trimmed reads, and STAR output
# ${var:?} aborts instead of expanding to a bare "/" if a variable is empty
rm -rf -- "${UMI_attach_folder:?}/"
rm -rf -- "${trimmed_fastq_folder:?}/"
rm -rf -- "${STAR_alignment_folder:?}/"

###################################
### Step 5: removing duplicates ###
###################################

echo
echo "Start removing duplicates..."
mkdir -p "$rmdup_sam_folder"
module unload python

# The R driver runs the duplicate-removal script over the samples; the
# mismatch setting is passed through as the last argument.
if ! "$R_path/Rscript" \
    "$Jun_R_script" \
    "$script_path/sci3_rmdup_nomismatch_beth.sh" \
    "$filtered_sam_folder" \
    "$sample_ID" \
    "$rmdup_sam_folder" \
    "$core" \
    "$mismatch"; then
    echo "ERROR: duplicate removal failed; $filtered_sam_folder was kept" >&2
    exit 1
fi

# Refuse to continue (and later delete the filtered input) without output.
if ! compgen -G "$rmdup_sam_folder/*.sam" > /dev/null; then
    echo "ERROR: no de-duplicated .sam files in $rmdup_sam_folder; $filtered_sam_folder was kept" >&2
    exit 1
fi

# Move the per-sample CSV reports written next to the de-duplicated SAM files.
mkdir -p "$report_folder/duplicate_read"
mv "$rmdup_sam_folder"/*.csv "$report_folder/duplicate_read/"

### calculate read number and estimate duplicate rate
# One count per file, in glob order; the two columns of read_num.txt are
# (before, after) duplicate removal. Redirecting the whole loop truncates any
# stale tmp file left behind by an aborted run.
for sam_file in "$filtered_sam_folder"/*.sam; do
    [[ -e "$sam_file" ]] || continue
    samtools view -c "$sam_file"
done > "$work_folder/tmp1"
for sam_file in "$rmdup_sam_folder"/*.sam; do
    [[ -e "$sam_file" ]] || continue
    samtools view -c "$sam_file"
done > "$work_folder/tmp2"
paste "$work_folder/tmp1" "$work_folder/tmp2" > "$work_folder/read_num.txt"
rm -- "$work_folder/tmp1" "$work_folder/tmp2"

echo ">>>Step 5 (removing duplicates) has been done."

# ${var:?} aborts instead of expanding to a bare "/" if the variable is empty
rm -rf -- "${filtered_sam_folder:?}/"

# Convert every de-duplicated SAM file to BAM and index it, working inside
# the rmdup folder. Abort if the folder is unset or cannot be entered, so the
# conversion never runs in some other directory.
if ! cd -- "${rmdup_sam_folder:?rmdup_sam_folder is not set}"; then
    echo "ERROR: cannot change into $rmdup_sam_folder" >&2
    exit 1
fi

for sam_file in *.sam; do
    [[ -e "$sam_file" ]] || continue
    # swap only the trailing .sam extension; a "sam" inside the sample name
    # (e.g. sample1.sam) must stay untouched
    samtools view -bS "$sam_file" > "${sam_file%.sam}.bam"
done

for bam_file in *.bam; do
    [[ -e "$bam_file" ]] || continue
    samtools index "$bam_file"
done



