#!/usr/bin/env python3
import logging
import settings
from peewee import (BigIntegerField,
BlobField,
BooleanField,
CharField,
DateTimeField,
IntegerField,
Field,
FloatField,
ForeignKeyField,
Model,
PostgresqlDatabase,
PrimaryKeyField,
SQL,
TextField,
fn,
)
from playhouse.postgres_ext import ArrayField, BinaryJSONField, PostgresqlExtDatabase
database = PostgresqlExtDatabase(settings.psql_name,
user = settings.psql_user,
password = settings.psql_pass,
host = settings.psql_host,
port = settings.psql_port,
register_hstore = False)
[docs]class BaseModel(Model):
class Meta:
database = database
[docs]class EnumField(Field):
db_field = 'string' # The same as for CharField
def __init__(self, choices=None, *args, **kwargs):
self.values = choices or []
super().__init__(*args, **kwargs)
[docs] def db_value(self, value):
if value not in self.values:
raise ValueError("Illegal value for '{}'".format(self.db_column))
return value
[docs] def python_value(self, value):
if value not in self.values:
raise ValueError("Illegal value for '{}'".format(self.db_column))
return value
###
# Reference Tables
##
[docs]class ReferenceSet(BaseModel):
"""
The gencode, ensembl, dbNSFP and omim data are combined to fill out the
Gene, Transcript and Feature tables. DbSNP data is separate, and may be
shared between reference sets, so it uses a foreign key instead.
"""
class Meta:
table_name = 'reference_sets'
schema = 'data'
name = CharField(column_name="reference_name", null=True)
ensembl_version = CharField()
gencode_version = CharField()
dbnsfp_version = CharField()
reference_build = CharField(unique=True)
[docs]class Gene(BaseModel):
class Meta:
table_name = 'genes'
schema = 'data'
reference_set = ForeignKeyField(ReferenceSet, column_name="reference_set", backref="genes")
gene_id = CharField(unique=True, max_length=15)
name = CharField(column_name="gene_name", null=True)
full_name = CharField(null=True)
canonical_transcript = CharField(null=True, max_length=15)
chrom = CharField(max_length=10)
start = IntegerField(column_name="start_pos")
stop = IntegerField(column_name="end_pos")
strand = EnumField(choices=['+','-'])
[docs]class GeneOtherNames(BaseModel):
class Meta:
table_name = 'gene_other_names'
schema = 'data'
gene = ForeignKeyField(Gene, column_name="gene", backref="other_names")
name = CharField(null=True)
[docs]class Transcript(BaseModel):
class Meta:
table_name = 'transcripts'
schema = 'data'
transcript_id = CharField(max_length=15)
gene = ForeignKeyField(Gene, column_name="gene", backref="transcripts")
mim_gene_accession = IntegerField()
mim_annotation = CharField()
chrom = CharField(max_length=10)
start = IntegerField(column_name="start_pos")
stop = IntegerField(column_name="stop_pos")
strand = EnumField(choices = ['+', '-'])
[docs]class Feature(BaseModel):
class Meta:
table_name = 'features'
schema = 'data'
gene = ForeignKeyField(Gene, column_name="gene", backref='exons')
transcript = ForeignKeyField(Transcript, column_name="transcript", backref='transcripts')
chrom = CharField(max_length=10)
start = IntegerField(column_name="start_pos")
stop = IntegerField(column_name="stop_pos")
strand = EnumField(choices = ['+', '-'])
feature_type = CharField()
###
# Study and Dataset fields
##
[docs]class Collection(BaseModel):
"""
A collection is a source of data which can be sampled into a SampleSet.
"""
class Meta:
table_name = 'collections'
schema = 'data'
name = CharField(column_name="study_name", null = True)
ethnicity = CharField(null = True)
[docs]class Study(BaseModel):
"""
A study is a scientific study with a PI and a description, and may include
one or more datasets.
"""
class Meta:
table_name = 'studies'
schema = 'data'
pi_name = CharField()
pi_email = CharField()
contact_name = CharField()
contact_email = CharField()
title = CharField()
description = TextField(column_name="study_description", null=True)
publication_date = DateTimeField()
ref_doi = CharField(null=True)
[docs]class Dataset(BaseModel):
"""
A dataset is part of a study, and usually include a certain population.
Most studies only have a single dataset, but multiple are allowed.
"""
class Meta:
table_name = 'datasets'
schema = 'data'
study = ForeignKeyField(Study, column_name="study", backref='datasets')
short_name = CharField()
full_name = CharField()
browser_uri = CharField(null=True)
beacon_uri = CharField(null=True)
description = TextField(column_name="beacon_description", null=True)
avg_seq_depth = FloatField(null=True)
seq_type = CharField(null=True)
seq_tech = CharField(null=True)
seq_center = CharField(null=True)
dataset_size = IntegerField()
[docs] def has_image(self):
try:
DatasetLogo.get(DatasetLogo.dataset == self)
return True
except DatasetLogo.DoesNotExist:
return False
[docs]class SampleSet(BaseModel):
class Meta:
table_name = 'sample_sets'
schema = 'data'
dataset = ForeignKeyField(Dataset, column_name="dataset", backref='sample_sets')
collection = ForeignKeyField(Collection, column_name="collection", backref='sample_sets')
sample_size = IntegerField()
phenotype = CharField(null=True)
[docs]class DatasetVersion(BaseModel):
class Meta:
table_name = 'dataset_versions'
schema = 'data'
dataset = ForeignKeyField(Dataset, column_name="dataset", backref='versions')
reference_set = ForeignKeyField(ReferenceSet, column_name="reference_set", backref='dataset_versions')
version = CharField(column_name="dataset_version")
description = TextField(column_name="dataset_description")
terms = TextField()
available_from = DateTimeField()
ref_doi = CharField(null=True)
data_contact_name = CharField(null=True)
data_contact_link = CharField(null=True)
num_variants = IntegerField(null=True)
coverage_levels = ArrayField(IntegerField, null=True)
portal_avail = BooleanField(null=True)
file_access = EnumField(null=False, choices=['PRIVATE', 'CONTROLLED', 'REGISTERED', 'PUBLIC'])
beacon_access = EnumField(null=False, choices=['PRIVATE', 'CONTROLLED', 'REGISTERED', 'PUBLIC'])
[docs]class DatasetFile(BaseModel):
class Meta:
table_name = 'dataset_files'
schema = 'data'
dataset_version = ForeignKeyField(DatasetVersion, column_name="dataset_version", backref='files')
name = CharField(column_name="basename")
uri = CharField()
file_size = IntegerField()
[docs]class DatasetLogo(BaseModel):
class Meta:
table_name = 'dataset_logos'
schema = 'data'
dataset = ForeignKeyField(Dataset, column_name="dataset", backref='logo')
mimetype = CharField()
data = BlobField(column_name="bytes")
###
# Variant and coverage data fields
##
[docs]class Variant(BaseModel):
class Meta:
table_name = "variants"
schema = 'data'
dataset_version = ForeignKeyField(DatasetVersion, column_name="dataset_version", backref="variants")
rsid = IntegerField()
chrom = CharField(max_length=10)
pos = IntegerField()
ref = CharField()
alt = CharField()
site_quality = FloatField()
orig_alt_alleles = ArrayField(CharField)
hom_count = IntegerField()
allele_freq = FloatField()
filter_string = CharField()
variant_id = CharField()
allele_count = IntegerField()
allele_num = IntegerField()
quality_metrics = BinaryJSONField()
vep_annotations = BinaryJSONField()
[docs]class VariantMate(BaseModel):
class Meta:
table_name = "mate"
schema = 'data'
dataset_version = ForeignKeyField(DatasetVersion, column_name="dataset_version", backref="mate")
chrom = CharField(max_length=10)
pos = IntegerField()
ref = CharField()
alt = CharField()
chrom_id = CharField()
mate_chrom = CharField()
mate_start = IntegerField()
mate_id = CharField()
allele_freq = FloatField()
variant_id = CharField()
allele_count = IntegerField()
allele_num = IntegerField()
[docs]class VariantGenes(BaseModel):
class Meta:
table_name = 'variant_genes'
schema = 'data'
variant = ForeignKeyField(Variant, column_name="variant", backref="genes")
gene = ForeignKeyField(Gene, column_name="gene", backref="variants")
[docs]class VariantTranscripts(BaseModel):
class Meta:
table_name = 'variant_transcripts'
schema = 'data'
variant = ForeignKeyField(Variant, column_name="variant", backref="transcripts")
transcript = ForeignKeyField(Transcript, column_name="transcript", backref="variants")
[docs]class Coverage(BaseModel):
"""
Coverage statistics are pre-calculated for each variant for a given
dataset.
The fields show the fraction of a population that reaches the
mapping coverages given by the variable names.
ex. cov20 = 0.994 means that 99.4% of the population had at a mapping
coverage of at least 20 in this position.
"""
class Meta:
table_name = "coverage"
schema = 'data'
dataset_version = ForeignKeyField(DatasetVersion, column_name="dataset_version")
chrom = CharField(max_length=10)
pos = IntegerField()
mean = FloatField()
median = FloatField()
coverage = ArrayField(FloatField, null=True)
[docs]class Metrics(BaseModel):
class Meta:
table_name = "metrics"
schema = 'data'
dataset_version = ForeignKeyField(DatasetVersion, column_name="dataset_version")
metric = CharField()
mids = ArrayField(IntegerField)
hist = ArrayField(IntegerField)
[docs]class User(BaseModel):
class Meta:
table_name = "users"
schema = 'users'
name = CharField(column_name="username", null=True)
email = CharField(unique=True)
identity = CharField(unique=True)
identity_type = EnumField(null=False, choices=['google', 'elixir'], default='elixir')
affiliation = CharField(null=True)
country = CharField(null=True)
[docs] def is_admin(self, dataset):
return DatasetAccess.select().where(
DatasetAccess.dataset == dataset,
DatasetAccess.user == self,
DatasetAccess.is_admin
).count()
[docs] def has_access(self, dataset, ds_version=None):
"""
Check whether user has permission to access a dataset
Args:
dataset (Database): peewee Database object
ds_version (str): the dataset version
Returns:
bool: allowed to access
"""
dsv = get_dataset_version(dataset.short_name, ds_version)
if not dsv:
return False
if dsv.file_access in ('REGISTERED', 'PUBLIC'):
return True
elif dsv.file_access == 'PRIVATE':
return False
return (DatasetAccessCurrent.select()
.where(DatasetAccessCurrent.dataset == dataset,
DatasetAccessCurrent.user == self)
.count()) > 0
[docs] def has_requested_access(self, dataset):
return DatasetAccessPending.select().where(
DatasetAccessPending.dataset == dataset,
DatasetAccessPending.user == self
).count()
[docs]class SFTPUser(BaseModel):
class Meta:
table_name = "sftp_users"
schema = 'users'
user = ForeignKeyField(User, backref='sftp_user')
user_uid = IntegerField(unique=True)
user_name = CharField(null=False)
password_hash = CharField(null=False)
account_expires = DateTimeField(null=False)
[docs]class UserAccessLog(BaseModel):
class Meta:
table_name = "user_access_log"
schema = 'users'
user = ForeignKeyField(User, backref='access_logs')
dataset = ForeignKeyField(Dataset, column_name='dataset', backref='access_logs')
action = EnumField(null=True, choices=['access_granted','access_revoked','access_requested','private_link'])
ts = DateTimeField()
[docs]class UserConsentLog(BaseModel):
class Meta:
table_name = "user_consent_log"
schema = 'users'
user = ForeignKeyField(User, backref='consent_logs')
dataset_version = ForeignKeyField(DatasetVersion, column_name='dataset_version', backref='consent_logs')
ts = DateTimeField()
[docs]class UserDownloadLog(BaseModel):
class Meta:
table_name = "user_download_log"
schema = 'users'
user = ForeignKeyField(User, backref='download_logs')
dataset_file = ForeignKeyField(DatasetFile, column_name='dataset_file', backref='download_logs')
ts = DateTimeField()
[docs]class DatasetAccess(BaseModel):
class Meta:
table_name = "dataset_access"
schema = 'users'
dataset = ForeignKeyField(Dataset, column_name='dataset', backref='access')
user = ForeignKeyField(User, backref='dataset_access')
wants_newsletter = BooleanField(null=True)
is_admin = BooleanField(null=True)
[docs]class Linkhash(BaseModel):
class Meta:
table_name = "linkhash"
schema = 'users'
dataset_version = ForeignKeyField(DatasetVersion, column_name='dataset_version', backref='link_hashes')
user = ForeignKeyField(User, backref='link_hashes')
hash = CharField()
expires_on = DateTimeField()
[docs]class BeaconCounts(BaseModel):
class Meta:
table_name = "beacon_dataset_counts_table"
schema = 'beacon'
datasetid = CharField(primary_key=True)
callcount = IntegerField()
variantcount = IntegerField()
#####
# Views
##
[docs]class DatasetVersionCurrent(DatasetVersion):
class Meta:
table_name = 'dataset_version_current'
schema = 'data'
dataset = ForeignKeyField(Dataset, column_name="dataset", backref='current_version')
reference_set = ForeignKeyField(ReferenceSet, column_name="reference_set", backref='current_version')
[docs]class DatasetAccessCurrent(DatasetAccess):
class Meta:
table_name = 'dataset_access_current'
schema = 'users'
dataset = ForeignKeyField(Dataset, column_name='dataset', backref='access_current')
user = ForeignKeyField(User, backref='access_current')
has_access = IntegerField()
access_requested = DateTimeField()
[docs]class DatasetAccessPending(DatasetAccess):
class Meta:
table_name = 'dataset_access_pending'
schema = 'users'
dataset = ForeignKeyField(Dataset, column_name='dataset', backref='access_pending')
user = ForeignKeyField(User, backref='access_pending')
has_access = IntegerField()
access_requested = DateTimeField()
#####
# Help functions
##
[docs]def get_next_free_uid():
"""
Get the next free uid >= 10000 and > than the current uids
from the sftp_user table in the db.
Returns:
int: the next free uid
"""
default = 10000
next_uid = default
try:
current_max_uid = SFTPUser.select(fn.MAX(SFTPUser.user_uid)).get().user_uid
if current_max_uid:
next_uid = current_max_uid+1
except SFTPUser.DoesNotExist:
pass
return next_uid
[docs]def get_admin_datasets(user):
"""
Get a list of datasets where user is admin
Args:
user (User): Peewee User object for the user of interest
Returns:
DataSetAccess:
"""
return DatasetAccess.select().where( DatasetAccess.user == user, DatasetAccess.is_admin)
[docs]def get_dataset(dataset:str):
"""
Given dataset name get Dataset
Args:
dataset (str): short name of the dataset
Returns:
Dataset: the corresponding DatasetVersion entry
"""
dataset = Dataset.select().where( Dataset.short_name == dataset).get()
return dataset
[docs]def get_dataset_version(dataset:str, version:str=None):
"""
Given dataset get DatasetVersion
Args:
dataset (str): short name of the dataset
Returns:
DatasetVersion: the corresponding DatasetVersion entry
"""
if version:
try:
dataset_version = (DatasetVersion
.select(DatasetVersion, Dataset)
.join(Dataset)
.where(DatasetVersion.version == version,
Dataset.short_name == dataset)).get()
except DatasetVersion.DoesNotExist:
logging.error("get_dataset_version({}, {}): ".format(dataset, version) +
"cannot retrieve dataset version")
return
else:
try:
dataset_version = (DatasetVersionCurrent
.select(DatasetVersionCurrent, Dataset)
.join(Dataset)
.where(Dataset.short_name == dataset)).get()
except DatasetVersionCurrent.DoesNotExist:
logging.error("get_dataset_version({}, version=None): ".format(dataset) +
"cannot retrieve dataset version")
return
return dataset_version
[docs]def build_dict_from_row(row):
d = {}
for field, value in row.__dict__['__data__'].items():
if field == "id":
continue
d[field] = value
return d