#!/bin/bash
#SBATCH -J weur
#SBATCH --account=rogersa-np
#SBATCH --partition=rogersa-np
#SBATCH --time=36:00:00
#SBATCH --nodes 1
#SBATCH --ntasks 1
#SBATCH -o weur.out # stdout
#SBATCH -e weur.err # stderr
# Build raf file for western europeans
module --latest load bcftools
# Determine lexical sort order.
export LC_ALL=C
# Output name
target=weur
# Reference genome
ref=$HOME/grp1/rogers/data/hg19/hg19-sort.fa.gz
# Comma-separated list of autosomes
autosomes=$(seq 1 22 | xargs echo | tr " " ",")
# Base dir name
base=$HOME/grp1/rogers/data/simons/WestEurasia
# An array of names of input files
infiles=($(ls ${base}/english/*.vcf.gz ${base}/french/*.vcf.gz))
# Write input files to stderr. The sed command puts them on
# separate lines, indented.
echo "input:" ${infiles[@]} | sed 's/ /\n /g' >&2
# Merge inputs into a single vcf.
bcftools merge ${infiles[@]} --regions $autosomes -Ou |
# Combine records of different types into a single line. Use reference
# genome to normalize indels.
bcftools norm --multiallelics +any --fasta-ref $ref -Ou |
# Limit the data to SNPs and fixed sites. This will remove sites that
# contain indels. The --SnpGap bit also removes sites within 7 bases
# of an indel.
bcftools filter --include 'TYPE="snp" || TYPE="ref"' --SnpGap 7 -Ou |
# Set low-quality genotypes to "."
bcftools filter --set-GTs . --exclude 'FMT/FL = "0" | FMT/FL = "N"' -Ou |
# No more than 2 alleles (but retain sites with 1 allele)
bcftools view --max-alleles 2 -Ou |
# Output the columns that are required by raf
bcftools query -f '%CHROM\t%POS\t%REF\t%ALT[\t%GT]\n' |
# Sort chromosomes lexically. -s retains sort of POS w/i chromosomes.
# Sort order is governed by LC_ALL, which is set at the top of this
# file.
sort -s -k1,1 |
# Make raf.gz file
raf | gzip > ${target}.raf.gz