#!/bin/bash
# (the interpreter line above must stay the very first line of the file; anywhere else it is
#  just a comment and has no effect when the script is executed directly)
# from Jun
#
# this scRNA-seq pipeline accepts an input folder, and then uses the default parameters for
# the data processing and analysis

# define the fastq folder including all fastq files
fastq_folder="/net/shendure/vol2/projects/cxqiu/nobackup/JAX_rna/run_15_E9_NovaSeq/nobackup/fastq"
# root folder for every output of this run
# make sure not to put a slash on the end of this path:
all_output_folder="/net/shendure/vol2/projects/cxqiu/nobackup/JAX_rna/run_15_E9_NovaSeq/nobackup/output_4"

# define the PCR group id after demultiplexing - these are the sample ids that you gave
# in the sample sheet for demux (this variable holds the path of the file listing them)
sample_ID="/net/shendure/vol2/projects/cxqiu/nobackup/JAX_rna/run_15_E9_NovaSeq/sample_ID_4.txt"

# define the core number for parallel processing
core=10
# core number used for the sam filtering step
core_sam=3
# define the number of unique reads cutoff for splitting single cell
cutoff=200

# define the location for the index files used in STAR - remember what organism you are doing here
# drosophila: /net/shendure/vol12/projects/sciRNAseq_script/index/STAR_drosophila_BDGP6/
# mouse: /net/shendure/vol10/projects/scRNA/nobackup/reference/index/STAR/STAR_mm10_RNAseq/ #deleted
# newer mouse: /net/shendure/vol10/nobackup/genome/STAR/GRCm38-p6-all
index="/net/shendure/vol10/nobackup/genome/STAR/mm10"
# define the gtf file for gene counting
# drosophila: /net/shendure/vol12/projects/sciRNAseq_script/gtf_file/Drosophila_melanogaster.BDGP6.87.gtf.gz
# mouse: /net/shendure/vol1/home/martin91/nobackup/reference/mouse/gencode.vM22.chr_patch_hapl_scaff.annotation.gtf.gz
# newer mouse: /net/shendure/vol10/nobackup/genome/GTF/gencode.vM25.chr_patch_hapl_scaff.annotation.gtf.gz
gtf_file="/net/shendure/vol10/nobackup/genome/GTF/gencode.vM12.chr_patch_hapl_scaff.annotation.gtf.gz"

# define the mismatch rate for removing duplicates:
mismatch=1

# define the bin folder of python (python V2.7)
python_path="/net/shendure/vol12/projects/sciRNAseq_script/anaconda2/bin/"



# define the location of the pipeline scripts (no trailing slash); every path below is
# derived from this single definition so the two folder variables cannot drift apart
script_path="/net/shendure/vol12/projects/sciRNAseq_script/sci3"
# same folder with a trailing slash, kept under its historical name
script_folder="$script_path/"
# define the location of the ligation barcodes
ligation_barcode="$script_path/lig_384_bc.pickle2"
# define the location of the RT barcodes
RT_barcode="$script_path/RT_384_bc.pickle2"
# define the location of the combined RT and ligation barcodes
barcodes="$script_path/combined_384_bc.txt"
# define the location of the R script for multi-core processing
R_script="$script_path/sci3_bash_input_ID_output_core.R"


# Print the start time, then load the environment modules for the command-line tools
# at the versions pinned below.
now=$(date)
printf 'Current time : %s\n' "$now"
module load modules modules-init modules-gs
for tool_module in samtools/1.9 bedtools/2.27.1 STAR/2.6.1d; do
  module load "$tool_module"
done


############ UMI attach
# Renames each sample's raw fastq pair to <sample>.R1.fastq.gz / <sample>.R2.fastq.gz, then
# passes the fastq folder, the sample ID file, the output folder, the ligation barcode file,
# the RT barcode file and the core number to the python script.
input_folder=$fastq_folder
output_folder=$all_output_folder/UMI_attach
#this script is different than the two-lvl one:
script=$script_path/UMI_barcode_attach_gzipped_with_dic.py


echo "changing the name of the fastq files..."
for sample in $(cat "$sample_ID"); do
  echo "changing name $sample"
  for mate in R1 R2; do
    renamed="$input_folder/$sample.$mate.fastq.gz"
    # Expand the glob into an array so the number of matches can be checked before moving
    # anything; with no match the array holds the unexpanded pattern, which does not exist.
    matches=("$input_folder"/*"$sample"*"$mate"*.gz)
    if [[ ${#matches[@]} -eq 1 && ${matches[0]} == "$renamed" ]]; then
      # already renamed by an earlier run of this script - nothing to do
      continue
    fi
    if [[ ${#matches[@]} -ne 1 || ! -e ${matches[0]} ]]; then
      # zero or several candidates: moving would fail or be ambiguous, so leave the files
      # alone and say why instead of letting mv print a cryptic error
      echo "WARNING: expected exactly one $mate fastq for sample $sample in $input_folder, found: ${matches[*]}" >&2
      continue
    fi
    mv -- "${matches[0]}" "$renamed"
  done
done

echo "Attaching barcode and UMI...."
mkdir -p "$output_folder"
"$python_path/python" "$script" "$input_folder" "$sample_ID" "$output_folder" "$ligation_barcode" "$RT_barcode" "$core"
echo "Barcode transformed and UMI attached."

################# Trimming the read2
# Runs the R script in $R_script with the trimming shell script, the UMI_attach folder,
# the sample ID file, the trimmed fastq output folder and the core number.
printf '\n'
printf '%s\n' "Start trimming the read2 file..."
# left unquoted on purpose so the timestamp is printed in the same single-spaced form
echo $(date)

UMI_attached_R2="$all_output_folder/UMI_attach"
trimmed_fastq="$all_output_folder/trimmed_fastq"
#bash_script=$script_path/sci3_trim.sh

# the trimming script is an updated copy in a personal scripts folder, not the one under $script_path
/net/gs/vol1/home/cxqiu/R/bin/Rscript "$R_script" \
  /net/shendure/vol1/home/martin91/scripts/sci3_trim.sh \
  "$UMI_attached_R2" "$sample_ID" "$trimmed_fastq" "$core"


############align the reads with STAR, filter the reads based on q > 30, and remove duplicates based on exactly UMI sequence and tagmentation site
# folders used by the alignment, filtering and duplicate-removal steps
input_folder="$trimmed_fastq"
STAR_output_folder="$all_output_folder/STAR_alignment"
filtered_sam_folder="$all_output_folder/filtered_sam"
rmdup_sam_folder="$all_output_folder/rmdup_sam"

# align read2 to the index file using STAR with default setting
# you will need to make sure your session has enough memory (40G): qlogin -l mfree=4G -pe serial 10
echo "Start alignment using STAR..."
echo "input folder: $input_folder"
echo "sample ID file: $sample_ID"
echo "index file: $index"
echo "output_folder: $STAR_output_folder"
# make the output folder
mkdir -p "$STAR_output_folder"

# remove the index from shared memory before starting
# NOTE(review): presumably a guard against a genome left loaded by an earlier run that used
# --genomeLoad LoadAndKeep, rather than a typo - confirm
STAR --genomeDir "$index" --genomeLoad Remove

# start the alignment: one STAR run per sample, keeping the genome loaded between runs
for sample in $(cat "$sample_ID"); do
  echo "Aligning $sample"
  STAR \
    --runThreadN "$core" \
    --outSAMstrandField intronMotif \
    --genomeDir "$index" \
    --readFilesCommand zcat \
    --readFilesIn "$input_folder/$sample"*gz \
    --outFileNamePrefix "$STAR_output_folder/$sample" \
    --genomeLoad LoadAndKeep \
    --outReadsUnmapped Fastx
done

# remove the index from the memory now that every sample is aligned
STAR --genomeDir "$index" --genomeLoad Remove
echo "All alignment done."

# filter and sort the sam files
# NOTE(review): no folder is created here; presumably sci3_filter.sh (or the R script that
# runs it) makes $filtered_sam_folder - confirm
printf '\n'
printf '%s\n' "Start filter and sort the sam files..."
echo "input folder: $STAR_output_folder"
echo "output folder: $filtered_sam_folder"
module load samtools/1.9 #updated for Centos7

bash_script="$script_path/sci3_filter.sh" #this script should be ok
/net/gs/vol1/home/cxqiu/R/bin/Rscript "$R_script" "$bash_script" \
  "$STAR_output_folder" "$sample_ID" "$filtered_sam_folder" "$core_sam"

# make a folder for rmdup_sam_folder,
# then for each filtered sam file, remove the duplicates based on UMI and barcode, chromosome and position
echo
echo "Start removing duplicates..."
echo "input folder: $filtered_sam_folder"
echo "output folder: $rmdup_sam_folder"
mkdir -p "$rmdup_sam_folder"
module unload python

#I had to change this to not look at Juns directory - might have to change for centos7?
bash_script=/net/shendure/vol1/home/martin91/scripts/sci3_rmdup_nomismatch.sh # for removing duplicates only considering exact match
##bash_script=$script_path/sci3_rmdup.sh
/net/gs/vol1/home/cxqiu/R/bin/Rscript "$R_script" "$bash_script" "$filtered_sam_folder" "$sample_ID" "$rmdup_sam_folder" "$core" "$mismatch"

# mv the reported files to the report/duplicate_read/ folder
# The report folder is addressed through $all_output_folder (as the gene count step does)
# instead of "$input_folder/..", so it no longer depends on what input_folder currently
# points to, nor on that folder existing.
duplicate_report_folder="$all_output_folder/report/duplicate_read"
mkdir -p "$duplicate_report_folder"
mv "$rmdup_sam_folder"/*.csv "$duplicate_report_folder/"
echo "removing duplicates completed.."
echo
echo "Alignment and sam file preprocessing are done."



################# split the sam file based on the barcode, and mv the result to the report folder
sam_folder="$all_output_folder/rmdup_sam"
output_folder="$all_output_folder/sam_splitted"

printf '\n'
printf '%s\n' "Start splitting the sam file..."
echo "samfile folder: $sam_folder"
echo "sample list: $sample_ID"
echo "output folder: $output_folder"
echo "barcode file: $barcodes"
echo "cutoff value: $cutoff"
module unload python

#I had to change this to edit it so it wouldnt point to juns directory
# if youre using the barcode file with all 384 in it, but didnt use all 384 in the experiment,
# youre gonna see a lot of failed things fly by when this is running. I think its okay, but maybe streamline this
bash_script=/net/shendure/vol1/home/martin91/scripts/sci3_split.sh
/net/gs/vol1/home/cxqiu/R/bin/Rscript "$R_script" "$bash_script" \
  "$sam_folder" "$sample_ID" "$output_folder" "$core" "$barcodes" "$cutoff"

# concatenate every *sample_list.txt into All_samples.txt and keep a copy one level up as
# barcode_samples.txt; the copy is needed because All_samples.txt is moved away with the
# other .txt files below
cat "$output_folder"/*sample_list.txt > "$output_folder/All_samples.txt"
cp "$output_folder/All_samples.txt" "$output_folder/../barcode_samples.txt"

# move the .txt and .png outputs to the report/barcode_read_distribution folder
distribution_report="$output_folder/../report/barcode_read_distribution"
mkdir -p "$distribution_report"
mv "$output_folder"/*.txt "$distribution_report/"
mv "$output_folder"/*.png "$distribution_report/"
printf '\n'
printf '%s\n' "All sam file splitted."


################# gene count
# count reads mapping to genes, then collect the per-file results in the report folder
output_folder=$all_output_folder/report/human_mouse_gene_count/
input_folder=$all_output_folder/sam_splitted
# the following script will be different for different organisms, because of the structure of the gtf file
# for mouse: script=$script_path/sciRNAseq_count.py
# for fly: script=$script_folder/sciRNAseq_count_dro.py
script=$script_path/sciRNAseq_count.py
# from here on sample_ID points at barcode_samples.txt instead of the original sample ID file
sample_ID=$all_output_folder/barcode_samples.txt
echo "Start the gene count...."
"$python_path/python" "$script" "$gtf_file" "$input_folder" "$sample_ID" "$core"

echo "Make the output folder and transfer the files..."
mkdir -p "$output_folder"
# The -name patterns must be quoted: unquoted, the shell expands *.count / *.report against
# the current directory first, and find then gets the wrong pattern or fails outright.
# ${output_folder%/} drops the trailing slash so the target paths contain no "//".
find "$input_folder" -name '*.count' -exec cat {} + > "${output_folder%/}/count.MM"
find "$input_folder" -name '*.count' -exec rm -f {} +
find "$input_folder" -name '*.report' -exec cat {} + > "${output_folder%/}/report.MM"
find "$input_folder" -name '*.report' -exec rm -f {} +
mv "$input_folder"/*_annotate.txt "${output_folder%/}/"
echo "All output files are transferred~"

################### calculate the reads number

# fastq_folder=$fastq_folder
# trimmed_folder=$all_output_folder/trimmed_fastq
# UMI_attach=$all_output_folder/UMI_attach
# alignment=$all_output_folder/STAR_alignment
# filtered_sam=$all_output_folder/filtered_sam
# rm_dup_sam=$all_output_folder/rmdup_sam_2
# #split_sam=$parental_folder/splited_sam
# report_folder=$all_output_folder/report/read_num
# echo
# echo "Start calculating the reads number..."
# #make the report folder
# mkdir -p $report_folder
# #calculate the read number and output the read number into the report folder
# echo sample,total reads,after filtering barcode,after trimming,uniquely aligned reads,After remove duplicates>$report_folder/read_number.csv
# for sample in $(cat $sample_ID); do echo calculating $sample; echo $sample,$(expr $(zcat $fastq_folder/$sample*R2*.gz|wc -l) / 4),$(expr $(zcat $UMI_attach/$sample*R2*.gz|wc -l) / 4),$(expr $(zcat $trimmed_folder/$sample*R2*.gz|wc -l) / 4),$(samtools view $filtered_sam/$sample.sam|wc -l),$(samtools view $rm_dup_sam/$sample.sam|wc -l)>>$report_folder/read_number.csv; done
# echo "Read number calculation is done."

# ################## calculate the mouse and human fraction
# input_folder=$all_output_folder/sam_splitted
# sample_ID=$all_output_folder/barcode_samples.txt
# output_folder=$all_output_folder/report/read_human_mouse
# echo 
# echo "Start calculating the mouse and human fraction..."
# mkdir -p $output_folder
# echo sample,human_reads,mouse_reads, cele_reads>$output_folder/human_mouse_fraction.txt
# for sample in $(cat $sample_ID); do echo Processing $sample; echo $sample,$(samtools view $input_folder/$sample.sam|grep 'chr' -v|wc -l),$(samtools view $input_folder/$sample.sam|grep 'chr'|grep 'cele' -v|wc -l),$(samtools view $input_folder/$sample.sam|grep 'cele'|wc -l)>>$output_folder/human_mouse_fraction.txt; done
# echo "Calculation done."

# Print the current time once the steps above have run.
now=$(date)
printf 'Current time : %s\n' "$now"
