continuous tensor maps made from big query (#2)
meganshand committed Apr 29, 2019
1 parent 9522ea7 commit f715140
Showing 3 changed files with 782 additions and 468 deletions.
5 changes: 5 additions & 0 deletions ml4cvd/defines.py
@@ -38,3 +38,8 @@
     56: 'x', 57: 'y', 58: 'z'}
 
 TENSOR_MAPS_FILE_NAME = 'tensor_maps_by_script'
+
+#BigQuery tables
+DICTIONARY_TABLE = "ukbb7089_201904.dictionary"
+CODING_TABLE = "ukbb7089_201904.coding"
+PHENOTYPE_TABLE = "ukbb7089_201904.phenotype"
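The constants added above are dataset-qualified BigQuery table names ("dataset.table") that downstream code interpolates into SQL strings. A minimal sketch of that pattern, assuming the google-cloud-bigquery client library and ambient credentials (the commit itself routes queries through the repo's BigQueryDatabaseClient wrapper instead):

    # Sketch only: interpolate a dataset-qualified table constant into a query.
    from google.cloud import bigquery

    DICTIONARY_TABLE = "ukbb7089_201904.dictionary"

    client = bigquery.Client()  # picks up credentials from the environment
    query = f"SELECT FieldID, Field, ValueType FROM {DICTIONARY_TABLE} LIMIT 5"
    for row in client.query(query).result():
        print(row.FieldID, row.Field, row.ValueType)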
118 changes: 59 additions & 59 deletions ml4cvd/tensor_map_maker.py
@@ -1,23 +1,22 @@
 import os
 import logging
-import numpy as np
-import pandas as pd
 from typing import List
 from typing.io import TextIO
+from DatabaseClient import BigQueryDatabaseClient, DatabaseClient
 
-from defines import MRI_ZOOM_INPUT, MRI_ZOOM_MASK, TENSOR_MAPS_FILE_NAME, MRI_SEGMENTED_CHANNEL_MAP
+from defines import MRI_ZOOM_INPUT, MRI_ZOOM_MASK, TENSOR_MAPS_FILE_NAME, MRI_SEGMENTED_CHANNEL_MAP, DICTIONARY_TABLE, CODING_TABLE, PHENOTYPE_TABLE
 from tensor_writer_ukbb import disease_prevalence_status, get_disease2tsv, disease_incidence_status, disease_censor_status
 
 
 def write_tensor_maps(args) -> None:
     logging.info("Making tensor maps...")
 
     tensor_maps_file = f"{args.output_folder}/{TENSOR_MAPS_FILE_NAME}.py"
+    db_client = BigQueryDatabaseClient(credentials_file=args.bigquery_credentials_file)
     with open(tensor_maps_file, 'w') as f:
         f.write(_get_tensor_map_file_imports())
         _write_dynamic_mri_tensor_maps(args.x, args.y, args.z, args.zoom_width, args.zoom_height, args.label_weights, args.t, f)
-        # Commented out until BigQuery version is implemented
-        #_write_megans_tensor_maps(f)
+        _write_continuous_tensor_maps(f, db_client)
         _write_disease_tensor_maps(args.phenos_folder, f)
         _write_disease_tensor_maps_time(args.phenos_folder, f)
         _write_disease_tensor_maps_incident_prevalent(args.phenos_folder, f)
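write_tensor_maps now constructs a BigQueryDatabaseClient once and threads it into _write_continuous_tensor_maps. The only part of the client's interface this diff relies on is execute(query) returning rows addressable by column name. A hypothetical sketch of such a wrapper, assuming the credentials file is a service-account JSON key (the repo's actual DatabaseClient module may differ):

    from google.cloud import bigquery


    class DatabaseClient:
        # Interface assumed by write_tensor_maps; only execute() is visible in the diff.
        def execute(self, query: str):
            raise NotImplementedError


    class BigQueryDatabaseClient(DatabaseClient):
        def __init__(self, credentials_file: str):
            # Assumption: credentials_file is a service-account JSON key.
            self._client = bigquery.Client.from_service_account_json(credentials_file)

        def execute(self, query: str):
            # QueryJob.result() yields Row objects with attribute access
            # (row.FieldID, row.mean, ...), matching the consuming loop below.
            return self._client.query(query).result()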
@@ -146,63 +145,64 @@ def _write_disease_tensor_maps_time(phenos_folder: str, f: TextIO) -> None:
         f.write(f"TMAPS['{d}_time']=TensorMap('{d}',group='diagnosis_time',channel_map={{'{d}_time':0}},loss='mse')\n")
 
 
-def _write_megans_tensor_maps(f: TextIO):
+def _write_continuous_tensor_maps(f: TextIO, db_client: DatabaseClient):
     annotation_units = 2
     count = 0
 
-    pyukbb_data_path = '/mnt/disks/data/raw/pyukbb_data/'
-    to_be_exported_csv_path = '/mnt/disks/data/raw/pyukbb_data/mb-ukbb-selected-fields.csv'
-    available_fields_pd = pd.read_csv(to_be_exported_csv_path)
-    fields = _get_all_available_fields(available_fields_pd)
-
-    continuous_field_ids = fields.loc[fields['ValueType']=='Continuous']['FieldID']
-    integer_field_ids = fields.loc[fields['ValueType']=='Integer']['FieldID']
-    f.write(f"\n\n# Continuous tensor maps from pyukbb\n")
-    for field_id in continuous_field_ids.append(integer_field_ids):
-        print(field_id)
-        group = 'continuous'
-        pf = pyukbb.UKBioBankParsedField.from_file(_get_pkl_path_for_field(field_id, pyukbb_data_path))
-        name = str(field_id) + "_" + pf.field.replace("-", "").replace(" ", "-").replace("(", "").replace(")", "")
+    group = 'continuous'
+
+    # Handle special coding values in continuous variables in order to generate summary statistics (mean and std dev) for
+    # each field across all samples. This will remove missing samples from the calculation and change the value of 'Less than one' to 0.5.
+    query = f"""
+        WITH coding_tmp AS (
+            SELECT
+                *,
+                CASE
+                    WHEN meaning IN ('Do not know', 'Prefer not to answer', 'Ongoing when data entered') OR meaning LIKE "Still taking%" THEN TRUE
+                END AS missing,
+                CASE
+                    WHEN meaning = 'Less than one' THEN '.5'
+                END AS value
+            FROM
+                {CODING_TABLE}
+        ), pheno_tmp AS (
+            SELECT
+                sample_id,
+                FieldID,
+                COALESCE(c.value, p.value) new_value,
+                COALESCE(c.missing, FALSE) missing
+            FROM {PHENOTYPE_TABLE} AS p
+            LEFT JOIN coding_tmp AS c
+            ON TRUE
+                AND p.value = c.coding
+                AND p.coding_file_id = c.coding_file_id
+            WHERE TRUE
+                AND instance = 0
+                AND array_idx = 0
+        )
+        SELECT
+            t.FieldID,
+            Field,
+            AVG(CAST(new_value AS FLOAT64)) mean,
+            STDDEV(CAST(new_value AS FLOAT64)) std
+        FROM pheno_tmp AS t
+        LEFT JOIN {DICTIONARY_TABLE} AS d ON d.FieldID = t.FieldID
+        WHERE TRUE
+            AND ValueType IN ('Integer', 'Continuous')
+            AND NOT missing
+        GROUP BY t.FieldID, Field
+    """
+
+    field_data_for_tensor_maps = db_client.execute(query)
+
+    f.write(f"\n\n# Continuous tensor maps\n")
+    for row in field_data_for_tensor_maps:
+        name = str(row.FieldID) + "_" + row.Field.replace("-", "").replace(" ", "-").replace("(", "").replace(")", "")
         name = name.replace("'", "").replace(",", "").replace("/", "").replace("+", "") + "_0_0"
-        try:
-            tensor = pyukbb.utils.get_dense_tensor_for_sample_ids(pf, list(pf.included_ukbb_sample_ids))
-        except IndexError:
-            print(name + " could not be tensorized.")
-            continue
-
-        if pf.has_coding:
-            # group = 'continuous_with_categorical'
-            print("pf has coding")
-
-            df = pd.DataFrame({'sample': list(pf.included_ukbb_sample_ids)})
-            for k, v in pf.category_coding_map.items():
-                df[v] = tensor[:, 0, 0, k]
-            if 'Do not know' in df.columns and 'Prefer not to answer' in df.columns:
-                df['all_missing'] = df['Do not know'] + df['Prefer not to answer'] + df['Not available in UKBB database']
-            # -313 is "ongoing"
-            elif 'Ongoing when data entered' in df.columns:
-                df['all_missing'] = df['Ongoing when data entered'] + df['Not available in UKBB database']
-            else:
-                df['all_missing'] = df['Not available in UKBB database']
-
-            if 'Less than one' in df.columns:
-                df['true_value'] = df['Less than one'].apply(lambda x: .5 if x == 1 else 0)
-                df['true_value'] = df['true_value'] + df['Value']
-            else:
-                df['true_value'] = df['Value']
-
-        mean = np.mean(df.loc[df['all_missing'] == 0]['true_value'])
-        std = np.std(df.loc[df['all_missing'] == 0]['true_value'])
-
-        if mean is np.nan:
-            logging.warning(name + " had nans")
-            continue
-
-        f.write(f"TMAPS['{field_id}_0'] = TensorMap('{name}', group='{group}', channel_map={{'{name}': 0, "
-                f"'not-missing': 1}}, normalization={{'mean': {mean}, 'std': {std}}}, "
+
+        f.write(f"TMAPS['{row.FieldID}_0'] = TensorMap('{name}', group='{group}', channel_map={{'{name}': 0, "
+                f"'not-missing': 1}}, normalization={{'mean': {row.mean}, 'std': {row.std}}}, "
                f"annotation_units={annotation_units})\n")
         count += 1
 
 
 def _segmented_map(name):
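In plain terms, the new query joins each phenotype value to its coding-table meaning, flags as missing any value whose meaning marks it as unanswered (or as a still-ongoing medication), recodes 'Less than one' to 0.5 via the COALESCE, and aggregates a per-field mean and standard deviation over the remaining samples. The same recoding in pure Python on toy rows (values invented for illustration, not UKBB data):

    import statistics

    # (sample_id, p.value, coding-table meaning or None) -- toy rows.
    rows = [
        (1, '2', None),               # ordinary numeric answer
        (2, '-10', 'Less than one'),  # special coding, recoded to 0.5
        (3, '-1', 'Do not know'),     # special coding, excluded as missing
        (4, '3', None),
    ]

    MISSING = {'Do not know', 'Prefer not to answer', 'Ongoing when data entered'}

    values = []
    for _, value, meaning in rows:
        if meaning in MISSING or (meaning or '').startswith('Still taking'):
            continue                  # the query's "AND NOT missing" filter
        if meaning == 'Less than one':
            value = '.5'              # COALESCE(c.value, p.value) picks the recode
        values.append(float(value))   # CAST(new_value AS FLOAT64)

    print(statistics.mean(values))    # AVG(...)
    print(statistics.stdev(values))   # BigQuery's STDDEV is the sample stddev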
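Each row the query returns becomes one line of the generated tensor_maps_by_script.py. For a hypothetical row with FieldID 21001, Field 'Body mass index (BMI)', mean 27.4, and std 4.8 (illustrative numbers, not real query output), the sanitization produces the name '21001_Body-mass-index-BMI_0_0' and the loop emits (wrapped here for readability):

    TMAPS['21001_0'] = TensorMap('21001_Body-mass-index-BMI_0_0', group='continuous',
                                 channel_map={'21001_Body-mass-index-BMI_0_0': 0, 'not-missing': 1},
                                 normalization={'mean': 27.4, 'std': 4.8}, annotation_units=2)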