#!/bin/bash
# from Jun
#
# This scRNA-seq pipeline accepts an input folder and then uses the default
# parameters for the data processing and analysis.
# NOTE: the interpreter line must be the very first line of the file to take
# effect when the script is executed directly.

# ---------------------------------------------------------------------------
# Run-specific settings: edit these for every new data set.
# ---------------------------------------------------------------------------

# Folder that holds all fastq files of the run.
fastq_folder="/net/shendure/vol10/projects/cxqiu/nobackup/JAX_rna/E85/nobackup/fastq"
# Root folder for all pipeline output. Do NOT end this path with a slash.
all_output_folder="/net/shendure/vol10/projects/cxqiu/nobackup/JAX_rna/E85/nobackup/output_10"

# File listing the PCR group ids obtained after demultiplexing, i.e. the
# sample ids that were given in the sample sheet used for demux.
sample_ID="/net/shendure/vol10/projects/cxqiu/nobackup/JAX_rna/E85/sample_ID_10.txt"

# Number of cores used for parallel processing.
core="10"
# NOTE(review): presumably a separate core count for the samtools steps;
# confirm against where core_sam is used.
core_sam="3"
# Cutoff on the number of unique reads used when splitting single cells.
cutoff="200"

# STAR index location. Remember to pick the organism you are working on:
#   drosophila:   /net/shendure/vol12/projects/sciRNAseq_script/index/STAR_drosophila_BDGP6/
#   mouse:        /net/shendure/vol10/projects/scRNA/nobackup/reference/index/STAR/STAR_mm10_RNAseq/ (deleted)
#   newer mouse:  /net/shendure/vol10/nobackup/genome/STAR/GRCm38-p6-all
index="/net/shendure/vol10/nobackup/genome/STAR/mm10"
# GTF annotation used for gene counting:
#   drosophila:   /net/shendure/vol12/projects/sciRNAseq_script/gtf_file/Drosophila_melanogaster.BDGP6.87.gtf.gz
#   mouse:        /net/shendure/vol1/home/martin91/nobackup/reference/mouse/gencode.vM22.chr_patch_hapl_scaff.annotation.gtf.gz
#   newer mouse:  /net/shendure/vol10/nobackup/genome/GTF/gencode.vM25.chr_patch_hapl_scaff.annotation.gtf.gz
gtf_file="/net/shendure/vol10/nobackup/genome/GTF/gencode.vM12.chr_patch_hapl_scaff.annotation.gtf.gz"

# Mismatch rate allowed when removing duplicates.
mismatch="1"

# bin directory of the Python installation to use (Python 2.7).
python_path="/net/shendure/vol12/projects/sciRNAseq_script/anaconda2/bin/"



# Location of the pipeline scripts (without a trailing slash).
script_path="/net/shendure/vol12/projects/sciRNAseq_script/sci3"
# Same script folder, written with a trailing slash.
script_folder="/net/shendure/vol12/projects/sciRNAseq_script/sci3/"

# Barcode resources inside the script folder. The repeated slashes in the
# resulting paths are kept exactly as they were.
# Ligation barcodes:
ligation_barcode="${script_folder}/lig_384_bc.pickle2"
# RT barcodes:
RT_barcode="${script_folder}//RT_384_bc.pickle2"
# Combined RT and ligation barcodes:
barcodes="${script_folder}//combined_384_bc.txt"

# R script used for multi-core processing.
R_script="${script_path}/sci3_bash_input_ID_output_core.R"


# Record and print the time at which the run starts.
now="$(date)"
printf 'Current time : %s\n' "$now"

# Load the module sets first, then the specific tool versions, in order.
module load modules modules-init modules-gs
for pipeline_module in samtools/1.9 bedtools/2.27.1 STAR/2.6.1d; do
  module load "$pipeline_module"
done
unset -v pipeline_module




############ UMI attach
# Step 1 of the pipeline (requires bash):
#   1. rename the fastq files of every sample to <sample>.R1.fastq.gz and
#      <sample>.R2.fastq.gz inside the fastq folder;
#   2. call the Python script with the fastq folder, the sample ID file, the
#      output folder, the ligation barcode file, the RT barcode file and the
#      number of cores.
input_folder=$fastq_folder
output_folder=$all_output_folder/UMI_attach
# This script is different from the two-level one:
script=$script_path/UMI_barcode_attach_gzipped_with_dic.py

#######################################
# Rename the fastq file of one sample/read to "<sample>.<read>.fastq.gz".
# Arguments:
#   $1 - folder containing the fastq files
#   $2 - sample id
#   $3 - read tag (R1 or R2)
# Outputs:
#   A warning on stderr whenever no rename is performed because the source
#   file is missing or ambiguous, or the target name is already taken.
# Returns:
#   0 if the file was renamed or already carries the target name,
#   non-zero otherwise.
#######################################
rename_fastq_read() {
  local folder=$1 sample_id=$2 read_tag=$3
  local target="$folder/$sample_id.$read_tag.fastq.gz"
  local candidate src=""
  local n_found=0

  for candidate in "$folder"/*"$sample_id"*"$read_tag"*.gz; do
    # An unmatched glob is left as the literal pattern; skip it.
    [[ -e "$candidate" ]] || continue
    # After a previous run the target name itself matches the pattern;
    # it must not be counted as a file that still needs renaming.
    [[ "$candidate" == "$target" ]] && continue
    src=$candidate
    n_found=$((n_found + 1))
  done

  if (( n_found == 0 )); then
    # Nothing left to rename: fine if an earlier run already did the job.
    [[ -e "$target" ]] && return 0
    echo "WARNING: no $read_tag fastq file found for sample $sample_id in $folder" >&2
    return 1
  fi
  if (( n_found > 1 )); then
    echo "WARNING: $n_found $read_tag fastq files match sample $sample_id in $folder; not renaming" >&2
    return 1
  fi
  if [[ -e "$target" ]]; then
    # Never overwrite an existing renamed file with a different source.
    echo "WARNING: $target already exists; leaving $src untouched" >&2
    return 1
  fi
  mv -- "$src" "$target"
}

echo "changing the name of the fastq files..."
if [[ ! -r "$sample_ID" ]]; then
  echo "ERROR: cannot read sample ID file: $sample_ID" >&2
  exit 1
fi
# Sample ids are whitespace separated (one or several per line). Reading them
# with `read -a` keeps that splitting but never glob-expands an id. The extra
# length test handles a last line without a trailing newline.
while read -r -a sample_fields || (( ${#sample_fields[@]} > 0 )); do
  for sample in "${sample_fields[@]}"; do
    echo "changing name $sample"
    # Rename problems are reported but do not stop the loop, so every
    # problematic sample is listed in one pass.
    rename_fastq_read "$input_folder" "$sample" R1
    rename_fastq_read "$input_folder" "$sample" R2
  done
done < "$sample_ID"

echo "Attaching barcode and UMI...."
if ! mkdir -p "$output_folder"; then
  echo "ERROR: cannot create output folder: $output_folder" >&2
  exit 1
fi
# Only report success when the Python step really succeeded.
if ! "$python_path/python" "$script" "$input_folder" "$sample_ID" "$output_folder" "$ligation_barcode" "$RT_barcode" "$core"; then
  echo "ERROR: barcode/UMI attachment failed ($script)" >&2
  exit 1
fi
echo "Barcode transformed and UMI attached."

