# Source code for aoutools.prs._config
"""
This module defines a configuration class for Polygenic Risk Score (PRS)
calculation.
"""
from dataclasses import dataclass
from typing import Optional, Union, Sequence
import hail as hl
# [docs]
@dataclass
class PRSConfig:
    # pylint: disable=too-many-instance-attributes
    """
    Bundle of options controlling a Polygenic Risk Score (PRS) calculation.

    Every field has a default, so ``PRSConfig()`` yields a usable
    configuration and individual options can be overridden by keyword.

    Attributes
    ----------
    chunk_size : int, default 20000
        How many variants are placed in each processing chunk.
    samples_to_keep : Union[hl.Table, Sequence[str], Sequence[int], str, int], optional
        Sample IDs to retain. May be given as a Hail Table, as a Python
        list, set, or tuple of strings or integers, or as one single string
        or integer. When left as None, no samples are dropped.
    weight_col_name : str, default 'weight'
        Name of the weights-table column holding the effect sizes or
        weights.
    log_transform_weight : bool, default False
        When True, the weight column is transformed with the natural log.
        Intended for weights expressed as odds ratios (OR), because PRS
        assumes effects are additive on the log-odds scale.
    include_n_matched : bool, default False
        When True, an 'n_matched' column is added reporting how many
        variants matched between the weights table and the VDS. Enabling
        this carries a performance cost, so turn it on only when needed.
    sample_id_col : str, default 'person_id'
        Name given to the sample ID column in the final output table.
    split_multi : bool, default True
        When True, multi-allelic variants in the VDS are split into
        bi-allelic variants before the calculation.
    ref_is_effect_allele : bool, default False
        When True, the effect allele in the weights file is taken to be the
        reference allele in the VDS. Only consulted when `split_multi` is
        True.
    strict_allele_match : bool, default True
        Only consulted when `split_multi` is False. When True, one allele in
        the weights table must match the VDS reference allele and the other
        must be a valid alternate. When False, only the effect allele is
        checked (against either the reference or an alternate allele) and
        the other allele is left unverified.
    detailed_timings : bool, default False
        When True, timing information is logged for each major step, which
        helps when diagnosing performance problems.
    """

    # --- Chunking and sample selection ---
    chunk_size: int = 20_000
    # Union[..., None] is the flattened spelling of Optional[Union[...]].
    # NOTE(review): the docstring also lists `set` as accepted, but a set is
    # not a Sequence -- confirm whether the annotation should be wider.
    samples_to_keep: Union[
        hl.Table, Sequence[str], Sequence[int], str, int, None
    ] = None

    # --- Weights table handling ---
    weight_col_name: str = 'weight'
    log_transform_weight: bool = False

    # --- Output shape ---
    include_n_matched: bool = False
    sample_id_col: str = 'person_id'

    # --- Allele handling ---
    split_multi: bool = True
    ref_is_effect_allele: bool = False
    strict_allele_match: bool = True

    # --- Diagnostics ---
    detailed_timings: bool = False