Inputs ¶

This module provides Metamist wrappers for retrieving the input sequencing groups of a workflow.

You can import these from the cpg_flow package:

from cpg_flow.inputs import add_sg_to_dataset, get_multicohort, create_multicohort

cpg_flow.inputs.add_sg_to_dataset ¶

add_sg_to_dataset(dataset, sg_data)

Adds a sequencing group to a dataset.

PARAMETER	DESCRIPTION
`dataset`	Dataset to insert the SequencingGroup into TYPE: `Dataset`
`sg_data`	data from the metamist API TYPE: `dict`

RETURNS	DESCRIPTION
`SequencingGroup`	The SequencingGroup object

Source code in src/cpg_flow/inputs.py

def add_sg_to_dataset(dataset: Dataset, sg_data: dict) -> SequencingGroup:
    """
    Create a SequencingGroup on a dataset from one Metamist record.

    Metadata from the sequencing group and its participant is merged, the
    SequencingGroup is registered on the dataset, the participant's reported
    sex (if any) is recorded on the pedigree, and alignment inputs are
    populated from the same record.

    Args:
        dataset (Dataset): Dataset to insert the SequencingGroup into
        sg_data (dict): data from the metamist API

    Returns:
        The SequencingGroup object
    """
    # TODO: The update_dict calls are a bit of a hack, we should be able to do this in a cleaner way
    # Start from the SG-level metadata, then layer the participant-level metadata on top.
    # NOTE(review): update_dict's return value is unused, so it presumably merges in place;
    # if so, sg_data['meta'] (when present) is modified as well - confirm.
    metadata = sg_data.get('meta', {})
    sample = sg_data['sample']
    participant = sample['participant']
    update_dict(metadata, participant.get('meta', {}))

    # phenotypes are managed badly here, need a cleaner way to get them into the SG
    phenotype_entry = {'phenotypes': participant.get('phenotypes', {})}
    update_dict(metadata, phenotype_entry)

    # assemble the SequencingGroup from the individual fields of the record
    sequencing_group = dataset.add_sequencing_group(
        id=str(sg_data['id']),
        external_id=str(sample['externalId']),
        participant_id=participant.get('externalId'),
        meta=metadata,
        sequencing_type=sg_data['type'],
        sequencing_technology=sg_data['technology'],
        sequencing_platform=sg_data['platform'],
    )

    # only record a sex on the pedigree when the participant reports one
    reported_sex = participant.get('reportedSex')
    if reported_sex:
        sequencing_group.pedigree.sex = Sex.parse(reported_sex)

    # parse the assays and related dict content
    _populate_alignment_inputs(sequencing_group, sg_data)

    return sequencing_group

cpg_flow.inputs.get_multicohort ¶

get_multicohort()

Return the cohort or multicohort object based on the workflow configuration.

Source code in src/cpg_flow/inputs.py

def get_multicohort() -> MultiCohort:
    """
    Build the MultiCohort described by the workflow configuration.

    Reads ``workflow.input_datasets`` and ``workflow.input_cohorts`` from the
    config, validates them, and delegates to ``create_multicohort``.

    Returns:
        The MultiCohort returned by ``create_multicohort`` for the configured cohort IDs.

    Raises:
        ValueError: if ``input_datasets`` is set (deprecated), if
            ``input_cohorts`` is an empty list, or if ``input_cohorts`` is a
            non-empty value that is not a list.
    """
    deprecated_datasets = config_retrieve(['workflow', 'input_datasets'], None)

    # the cohort IDs to load, as configured by the user
    cohort_ids = config_retrieve(['workflow', 'input_cohorts'], None)

    if deprecated_datasets:
        raise ValueError('Argument input_datasets is deprecated, use input_cohorts instead')

    if isinstance(cohort_ids, list):
        # an explicitly configured but empty list is an error
        if len(cohort_ids) <= 0:
            raise ValueError('No custom_cohort_ids found in the config')
    elif cohort_ids:
        # NOTE: When configuring sgs in the config is deprecated, this will be removed.
        raise ValueError('Argument input_cohorts must be a list')

    # create_multicohort is cached, so it needs a hashable argument: pass a tuple
    if cohort_ids:
        hashable_ids = tuple(cohort_ids)
    else:
        hashable_ids = tuple()

    return create_multicohort(hashable_ids)

cpg_flow.inputs.create_multicohort `cached` ¶

create_multicohort(custom_cohort_ids)

Add cohorts to the multicohort.

Source code in src/cpg_flow/inputs.py

@cache
def create_multicohort(custom_cohort_ids: tuple[str, ...]) -> MultiCohort:
    """
    Build a MultiCohort containing one Cohort per unique cohort ID.

    For every unique cohort ID the cohort's sequencing groups are fetched,
    each sequencing group is added to its Dataset (created on demand) and to
    the Cohort. Afterwards analysis entries, and optionally pedigree data,
    are populated once per dataset.

    Args:
        custom_cohort_ids (tuple[str, ...]): cohort IDs to load. A tuple is
            used so the argument is hashable for the cache decorator.
            Duplicate IDs are dropped and reported with a warning.

    Returns:
        The populated MultiCohort.

    Raises:
        MetamistError: if a cohort has no sequencing groups.
    """
    # get a unique, sorted list of cohort IDs
    custom_cohort_ids_unique = sorted(set(custom_cohort_ids))

    # if any cohort id duplicates were removed we log them
    num_removed = len(custom_cohort_ids) - len(custom_cohort_ids_unique)
    if num_removed:
        # Collect the IDs that occur more than once. The set difference between
        # the input and its own de-duplicated form is always empty, so the
        # duplicates have to be found by scanning the original tuple.
        seen_ids: set[str] = set()
        repeated_ids: set[str] = set()
        for cohort_id in custom_cohort_ids:
            if cohort_id in seen_ids:
                repeated_ids.add(cohort_id)
            else:
                seen_ids.add(cohort_id)

        logger.warning(
            f'Removed {num_removed} non-unique cohort IDs',
        )
        duplicated_cohort_ids = ', '.join(sorted(repeated_ids))
        logger.warning(f'Non-unique cohort IDs: {duplicated_cohort_ids}')

    multicohort = MultiCohort()

    # for each Cohort ID
    for cohort_id in custom_cohort_ids_unique:
        # get the dictionary representation of all SGs in this cohort
        # dataset_id is sequencing_group_dict['sample']['project']['name']
        cohort_sg_dict = get_cohort_sgs(cohort_id)
        cohort_name = cohort_sg_dict.get('name', cohort_id)
        cohort_dataset = cohort_sg_dict.get('dataset', None)
        cohort_sgs = cohort_sg_dict.get('sequencing_groups', [])

        if len(cohort_sgs) == 0:
            raise MetamistError(f'Cohort {cohort_id} has no sequencing groups')

        # create a new Cohort object
        cohort = multicohort.create_cohort(
            id=cohort_id,
            name=cohort_name,
            dataset=cohort_dataset,
        )

        # first populate these SGs into their Datasets
        # required so that the SG objects can be referenced in the collective Datasets
        # SG.dataset.prefix is meaningful, to correctly store outputs in the project location
        for entry in cohort_sgs:
            sg_dataset = entry['sample']['project']['name']
            # the '-test' suffix is stripped from the project name before it is used as the dataset name
            dataset = multicohort.create_dataset(sg_dataset.removesuffix('-test'))

            sequencing_group = add_sg_to_dataset(dataset, entry)

            # also add the same sequencing group to the cohort
            cohort.add_sequencing_group_object(sequencing_group)

    # we've populated all the sequencing groups in the cohorts and datasets
    # all SequencingGroup objects should be populated uniquely (pointers to instances, so updating Analysis entries
    # for each SG should update both the Dataset's version and the Cohort's version)

    # only go to metamist once per dataset to get analysis entries
    for dataset in multicohort.get_datasets():
        _populate_analysis(dataset)
        # pedigree loading is on by default and can be disabled via workflow.read_pedigree
        if config_retrieve(['workflow', 'read_pedigree'], True):
            _populate_pedigree(dataset)

    return multicohort

Inputs¶

cpg_flow.inputs.add_sg_to_dataset ¶

cpg_flow.inputs.get_multicohort ¶

cpg_flow.inputs.create_multicohort cached ¶

Inputs ¶

cpg_flow.inputs.create_multicohort `cached` ¶