-- function
SET search_path TO mimiciv_derived;
CREATE OR REPLACE FUNCTION REGEXP_EXTRACT(str TEXT, pattern TEXT) RETURNS TEXT AS $$
BEGIN
RETURN substring(str from pattern);
END; $$
LANGUAGE PLPGSQL;
CREATE OR REPLACE FUNCTION REGEXP_CONTAINS(str TEXT, pattern TEXT) RETURNS BOOL AS $$
BEGIN
RETURN str ~ pattern;
END; $$
LANGUAGE PLPGSQL;
-- alias generate_series with generate_array
CREATE OR REPLACE FUNCTION GENERATE_ARRAY(i INTEGER, j INTEGER)
RETURNS setof INTEGER language sql as $$
SELECT GENERATE_SERIES(i, j)
$$;
-- datetime functions
CREATE OR REPLACE FUNCTION DATETIME(dt DATE) RETURNS TIMESTAMP(3) AS $$
BEGIN
RETURN CAST(dt AS TIMESTAMP(3));
END; $$
LANGUAGE PLPGSQL;
CREATE OR REPLACE FUNCTION DATETIME(year INTEGER, month INTEGER, day INTEGER, hour INTEGER, minute INTEGER, second INTEGER) RETURNS TIMESTAMP(3) AS $$
BEGIN
RETURN TO_TIMESTAMP(
TO_CHAR(year, '0000') || TO_CHAR(month, '00') || TO_CHAR(day, '00') || TO_CHAR(hour, '00') || TO_CHAR(minute, '00') || TO_CHAR(second, '00'),
'yyyymmddHH24MISS'
);
END; $$
LANGUAGE PLPGSQL;
-- overload allowing string input
-- DATETIME_ADD(datetime, INTERVAL 'n' DATEPART) -> datetime + INTERVAL 'n' DATEPART
-- note: in bigquery, `INTERVAL 1 YEAR` is a valid interval
-- but in postgres, it must be `INTERVAL '1' YEAR`
CREATE OR REPLACE FUNCTION DATETIME_ADD(datetime_val TIMESTAMP(3), intvl INTERVAL) RETURNS TIMESTAMP(3) AS $$
BEGIN
RETURN datetime_val + intvl;
END; $$
LANGUAGE PLPGSQL;
-- DATETIME_SUB(datetime, INTERVAL 'n' DATEPART) -> datetime - INTERVAL 'n' DATEPART
CREATE OR REPLACE FUNCTION DATETIME_SUB(datetime_val TIMESTAMP(3), intvl INTERVAL) RETURNS TIMESTAMP(3) AS $$
BEGIN
RETURN datetime_val - intvl;
END; $$
LANGUAGE PLPGSQL;
-- TODO:
-- DATETIME_TRUNC(datetime, PART) -> DATE_TRUNC('datepart', datetime)
-- below requires a regex to convert datepart from primitive to a string
-- i.e. encapsulate it in single quotes
CREATE OR REPLACE FUNCTION DATETIME_DIFF(endtime TIMESTAMP(3), starttime TIMESTAMP(3), datepart TEXT) RETURNS NUMERIC AS $$
BEGIN
RETURN
EXTRACT(EPOCH FROM endtime - starttime) /
CASE
WHEN datepart = 'SECOND' THEN 1.0
WHEN datepart = 'MINUTE' THEN 60.0
WHEN datepart = 'HOUR' THEN 3600.0
WHEN datepart = 'DAY' THEN 24*3600.0
WHEN datepart = 'YEAR' THEN 365.242*24*3600.0
ELSE NULL END;
END; $$
LANGUAGE PLPGSQL;
-- BigQuery has a custom data type, PART
-- It's difficult to replicate this in postgresql, which recognizes the PART as a column name,
-- unless it is within an EXTRACT() function.
CREATE OR REPLACE FUNCTION BIGQUERY_FORMAT_TO_PSQL(format_str VARCHAR(255)) RETURNS TEXT AS $$
BEGIN
RETURN
-- use replace to convert BigQuery string format to postgres string format
-- only handles a few cases since we don't extensively use this function
REPLACE(
REPLACE(
REPLACE(
REPLACE(
REPLACE(
REPLACE(
format_str
, '%S', 'SS'
)
, '%M', 'MI'
)
, '%H', 'HH24'
)
, '%d', 'dd'
)
, '%m', 'mm'
)
, '%Y', 'yyyy'
)
;
END; $$
LANGUAGE PLPGSQL;
CREATE OR REPLACE FUNCTION FORMAT_DATE(format_str VARCHAR(255), datetime_val TIMESTAMP(3)) RETURNS TEXT AS $$
BEGIN
RETURN TO_CHAR(
datetime_val,
-- use replace to convert BigQuery string format to postgres string format
-- only handles a few cases since we don't extensively use this function
BIGQUERY_FORMAT_TO_PSQL(format_str)
);
END; $$
LANGUAGE PLPGSQL;
CREATE OR REPLACE FUNCTION PARSE_DATE(format_str VARCHAR(255), string_val VARCHAR(255)) RETURNS DATE AS $$
BEGIN
RETURN TO_DATE(
string_val,
-- use replace to convert BigQuery string format to postgres string format
-- only handles a few cases since we don't extensively use this function
BIGQUERY_FORMAT_TO_PSQL(format_str)
);
END; $$
LANGUAGE PLPGSQL;
CREATE OR REPLACE FUNCTION FORMAT_DATETIME(format_str VARCHAR(255), datetime_val TIMESTAMP(3)) RETURNS TEXT AS $$
BEGIN
RETURN TO_CHAR(
datetime_val,
-- use replace to convert BigQuery string format to postgres string format
-- only handles a few cases since we don't extensively use this function
BIGQUERY_FORMAT_TO_PSQL(format_str)
);
END; $$
LANGUAGE PLPGSQL;
CREATE OR REPLACE FUNCTION PARSE_DATETIME(format_str VARCHAR(255), string_val VARCHAR(255)) RETURNS TIMESTAMP(3) AS $$
BEGIN
RETURN TO_TIMESTAMP(
string_val,
-- use replace to convert BigQuery string format to postgres string format
-- only handles a few cases since we don't extensively use this function
BIGQUERY_FORMAT_TO_PSQL(format_str)
);
END; $$
LANGUAGE PLPGSQL;
age
-- age
create TABLE if not exists age as
SELECT
ad.subject_id
, ad.hadm_id
, ad.admittime
, pa.anchor_age
, pa.anchor_year
, DATETIME_DIFF(ad.admittime, DATETIME(pa.anchor_year, 1, 1, 0, 0, 0), 'YEAR') + pa.anchor_age AS age
FROM mimiciv_hosp.admissions ad
INNER JOIN mimiciv_hosp.patients pa
ON ad.subject_id = pa.subject_id;
icustay_detail
create TABLE if not exists icustay_detail as
SELECT ie.subject_id, ie.hadm_id, ie.stay_id
-- patient level factors
, pat.gender, pat.dod
-- hospital level factors
, adm.admittime, adm.dischtime
, DATETIME_DIFF(adm.dischtime, adm.admittime, 'DAY') as los_hospital
, DATETIME_DIFF(adm.admittime, DATETIME(pat.anchor_year, 1, 1, 0, 0, 0), 'YEAR') + pat.anchor_age as admission_age
, adm.race
, adm.hospital_expire_flag
, DENSE_RANK() OVER (PARTITION BY adm.subject_id ORDER BY adm.admittime) AS hospstay_seq
, CASE
WHEN DENSE_RANK() OVER (PARTITION BY adm.subject_id ORDER BY adm.admittime) = 1 THEN True
ELSE False END AS first_hosp_stay
-- icu level factors
, ie.intime as icu_intime, ie.outtime as icu_outtime
, ROUND(DATETIME_DIFF(ie.outtime, ie.intime, 'HOUR')/24.0, 2) as los_icu
, DENSE_RANK() OVER (PARTITION BY ie.hadm_id ORDER BY ie.intime) AS icustay_seq
-- first ICU stay *for the current hospitalization*
, CASE
WHEN DENSE_RANK() OVER (PARTITION BY ie.hadm_id ORDER BY ie.intime) = 1 THEN True
ELSE False END AS first_icu_stay
FROM mimiciv_icu.icustays ie
INNER JOIN mimiciv_hosp.admissions adm
ON ie.hadm_id = adm.hadm_id
INNER JOIN mimiciv_hosp.patients pat
ON ie.subject_id = pat.subject_id;
icustay_times
-- 3. icustay_times
create TABLE if not exists icustay_times as
WITH t1 AS
(
select ce.stay_id
, min(charttime) as intime_hr
, max(charttime) as outtime_hr
FROM mimiciv_icu.chartevents ce
-- only look at heart rate
where ce.itemid = 220045
group by ce.stay_id
)
-- add in subject_id/hadm_id
select
ie.subject_id, ie.hadm_id, ie.stay_id
, t1.intime_hr
, t1.outtime_hr
FROM mimiciv_icu.icustays ie
left join t1
on ie.stay_id = t1.stay_id;
icustay_hourly
-- 4. icustay_hourly
-- 它的主要目的是为每位患者计算一个时间范围(从进入 ICU 前 24 小时到离开 ICU),并生成一个包含这些小时数的数组。
create TABLE if not exists icustay_hourly as
with all_hours as
(
select
it.stay_id
-- ceiling the intime to the nearest hour by adding 59 minutes then truncating
-- note thart we truncate by parsing as string, rather than using DATETIME_TRUNC
-- this is done to enable compatibility with psql
, PARSE_DATETIME(
'%Y-%m-%d %H:00:00',
FORMAT_DATETIME(
'%Y-%m-%d %H:00:00',
DATETIME_ADD(it.intime_hr, INTERVAL '59' MINUTE)
)) AS endtime
-- create integers for each charttime in hours from admission
-- so 0 is admission time, 1 is one hour after admission, etc, up to ICU disch
-- we allow 24 hours before ICU admission (to grab labs before admit)
, GENERATE_ARRAY(-24, CEIL(DATETIME_DIFF(it.outtime_hr, it.intime_hr, 'HOUR'))::INTEGER) as hrs
--hrs 是一个数组,包含从进入 ICU 前 24 小时到离开 ICU 的每个小时数。
from icustay_times it
)
SELECT stay_id
, CAST(hr AS BIGINT) as hr
, DATETIME_ADD(endtime, interval '1 HOUR' * (CAST(hr AS bigint))) as endtime
FROM all_hours
CROSS JOIN UNNEST(array[all_hours.hrs]) AS hr;
weight_durations
-- 5. weight_durations
create TABLE if not exists weight_durations as
WITH wt_stg as
(
SELECT
c.stay_id
, c.charttime
, case when c.itemid = 226512 then 'admit'
else 'daily' end as weight_type
-- TODO: eliminate obvious outliers if there is a reasonable weight
, c.valuenum as weight
FROM mimiciv_icu.chartevents c
WHERE c.valuenum IS NOT NULL
AND c.itemid in
(
226512 -- Admit Wt
, 224639 -- Daily Weight
)
AND c.valuenum > 0
)
-- assign ascending row number
, wt_stg1 as
(
select
stay_id
, charttime
, weight_type
, weight
, ROW_NUMBER() OVER (partition by stay_id, weight_type order by charttime) as rn
from wt_stg
WHERE weight IS NOT NULL
)
-- change charttime to intime for the first admission weight recorded
, wt_stg2 AS
(
SELECT
wt_stg1.stay_id
, ie.intime, ie.outtime
, wt_stg1.weight_type
, case when wt_stg1.weight_type = 'admit' and wt_stg1.rn = 1
then DATETIME_SUB(ie.intime, INTERVAL '2' HOUR)
else wt_stg1.charttime end as starttime
, wt_stg1.weight
from wt_stg1
INNER JOIN mimiciv_icu.icustays ie
on ie.stay_id = wt_stg1.stay_id
)
, wt_stg3 as
(
select
stay_id
, intime, outtime
, starttime
, coalesce(
LEAD(starttime) OVER (PARTITION BY stay_id ORDER BY starttime),
DATETIME_ADD(outtime, INTERVAL '2' HOUR)
) as endtime
, weight
, weight_type
from wt_stg2
)
-- this table is the start/stop times from admit/daily weight in charted data
, wt1 as
(
select
stay_id
, starttime
, coalesce(endtime,
LEAD(starttime) OVER (partition by stay_id order by starttime),
-- impute ICU discharge as the end of the final weight measurement
-- plus a 2 hour "fuzziness" window
DATETIME_ADD(outtime, INTERVAL '2' HOUR)
) as endtime
, weight
, weight_type
from wt_stg3
)
-- if the intime for the patient is < the first charted daily weight
-- then we will have a "gap" at the start of their stay
-- to prevent this, we look for these gaps and backfill the first weight
-- this adds (153255-149657)=3598 rows, meaning this fix helps for up to 3598 stay_id
, wt_fix as
(
select ie.stay_id
-- we add a 2 hour "fuzziness" window
, DATETIME_SUB(ie.intime, INTERVAL '2' HOUR) as starttime
, wt.starttime as endtime
, wt.weight
, wt.weight_type
from mimiciv_icu.icustays ie
inner join
-- the below subquery returns one row for each unique stay_id
-- the row contains: the first starttime and the corresponding weight
(
SELECT wt1.stay_id, wt1.starttime, wt1.weight
, weight_type
, ROW_NUMBER() OVER (PARTITION BY wt1.stay_id ORDER BY wt1.starttime) as rn
FROM wt1
) wt
ON ie.stay_id = wt.stay_id
AND wt.rn = 1
and ie.intime < wt.starttime
)
-- add the backfill rows to the main weight table
SELECT
wt1.stay_id
, wt1.starttime
, wt1.endtime
, wt1.weight
, wt1.weight_type
FROM wt1
UNION ALL
SELECT
wt_fix.stay_id
, wt_fix.starttime
, wt_fix.endtime
, wt_fix.weight
, wt_fix.weight_type
FROM wt_fix;
bg
-- 6. bg
-- The aim of this query is to pivot entries related to blood gases
-- which were found in LABEVENTS
create TABLE if not exists bg as
WITH bg AS
(
select
-- specimen_id only ever has 1 measurement for each itemid
-- so, we may simply collapse rows using MAX()
MAX(subject_id) AS subject_id
, MAX(hadm_id) AS hadm_id
, MAX(charttime) AS charttime
-- specimen_id *may* have different storetimes, so this is taking the latest
, MAX(storetime) AS storetime
, le.specimen_id
, MAX(CASE WHEN itemid = 52028 THEN value ELSE NULL END) AS specimen
, MAX(CASE WHEN itemid = 50801 THEN valuenum ELSE NULL END) AS aado2
, MAX(CASE WHEN itemid = 50802 THEN valuenum ELSE NULL END) AS baseexcess
, MAX(CASE WHEN itemid = 50803 THEN valuenum ELSE NULL END) AS bicarbonate
, MAX(CASE WHEN itemid = 50804 THEN valuenum ELSE NULL END) AS totalco2
, MAX(CASE WHEN itemid = 50805 THEN valuenum ELSE NULL END) AS carboxyhemoglobin
, MAX(CASE WHEN itemid = 50806 THEN valuenum ELSE NULL END) AS chloride
, MAX(CASE WHEN itemid = 50808 THEN valuenum ELSE NULL END) AS calcium
, MAX(CASE WHEN itemid = 50809 and valuenum <= 10000 THEN valuenum ELSE NULL END) AS glucose
, MAX(CASE WHEN itemid = 50810 and valuenum <= 100 THEN valuenum ELSE NULL END) AS hematocrit
, MAX(CASE WHEN itemid = 50811 THEN valuenum ELSE NULL END) AS hemoglobin
, MAX(CASE WHEN itemid = 50813 and valuenum <= 10000 THEN valuenum ELSE NULL END) AS lactate
, MAX(CASE WHEN itemid = 50814 THEN valuenum ELSE NULL END) AS methemoglobin
, MAX(CASE WHEN itemid = 50815 THEN valuenum ELSE NULL END) AS o2flow
-- fix a common unit conversion error for fio2
-- atmospheric o2 is 20.89%, so any value <= 20 is unphysiologic
-- usually this is a misplaced O2 flow measurement
, MAX(CASE WHEN itemid = 50816 THEN
CASE
WHEN valuenum > 20 AND valuenum <= 100 THEN valuenum
WHEN valuenum > 0.2 AND valuenum <= 1.0 THEN valuenum*100.0
ELSE NULL END
ELSE NULL END) AS fio2
, MAX(CASE WHEN itemid = 50817 AND valuenum <= 100 THEN valuenum ELSE NULL END) AS so2
, MAX(CASE WHEN itemid = 50818 THEN valuenum ELSE NULL END) AS pco2
, MAX(CASE WHEN itemid = 50819 THEN valuenum ELSE NULL END) AS peep
, MAX(CASE WHEN itemid = 50820 THEN valuenum ELSE NULL END) AS ph
, MAX(CASE WHEN itemid = 50821 THEN valuenum ELSE NULL END) AS po2
, MAX(CASE WHEN itemid = 50822 THEN valuenum ELSE NULL END) AS potassium
, MAX(CASE WHEN itemid = 50823 THEN valuenum ELSE NULL END) AS requiredo2
, MAX(CASE WHEN itemid = 50824 THEN valuenum ELSE NULL END) AS sodium
, MAX(CASE WHEN itemid = 50825 THEN valuenum ELSE NULL END) AS temperature
, MAX(CASE WHEN itemid = 50807 THEN value ELSE NULL END) AS comments
FROM mimiciv_hosp.labevents le
where le.ITEMID in
-- blood gases
(
52028 -- specimen
, 50801 -- aado2
, 50802 -- base excess
, 50803 -- bicarb
, 50804 -- calc tot co2
, 50805 -- carboxyhgb
, 50806 -- chloride
-- , 52390 -- chloride, WB CL-
, 50807 -- comments
, 50808 -- free calcium
, 50809 -- glucose
, 50810 -- hct
, 50811 -- hgb
, 50813 -- lactate
, 50814 -- methemoglobin
, 50815 -- o2 flow
, 50816 -- fio2
, 50817 -- o2 sat
, 50818 -- pco2
, 50819 -- peep
, 50820 -- pH
, 50821 -- pO2
, 50822 -- potassium
-- , 52408 -- potassium, WB K+
, 50823 -- required O2
, 50824 -- sodium
-- , 52411 -- sodium, WB NA +
, 50825 -- temperature
)
GROUP BY le.specimen_id
)
, stg_spo2 as
(
select subject_id, charttime
-- avg here is just used to group SpO2 by charttime
, AVG(valuenum) as SpO2
FROM mimiciv_icu.chartevents
where ITEMID = 220277 -- O2 saturation pulseoxymetry
and valuenum > 0 and valuenum <= 100
group by subject_id, charttime
)
, stg_fio2 as
(
select subject_id, charttime
-- pre-process the FiO2s to ensure they are between 21-100%
, max(
case
when valuenum > 0.2 and valuenum <= 1
then valuenum * 100
-- improperly input data - looks like O2 flow in litres
when valuenum > 1 and valuenum < 20
then null
when valuenum >= 20 and valuenum <= 100
then valuenum
else null end
) as fio2_chartevents
FROM mimiciv_icu.chartevents
where ITEMID = 223835 -- Inspired O2 Fraction (FiO2)
and valuenum > 0 and valuenum <= 100
group by subject_id, charttime
)
, stg2 as
(
select bg.*
, ROW_NUMBER() OVER (partition by bg.subject_id, bg.charttime order by s1.charttime DESC) as lastRowSpO2
, s1.spo2
from bg
left join stg_spo2 s1
-- same hospitalization
on bg.subject_id = s1.subject_id
-- spo2 occurred at most 2 hours before this blood gas
and s1.charttime between DATETIME_SUB(bg.charttime, INTERVAL '2'HOUR) and bg.charttime
where bg.po2 is not null
)
, stg3 as
(
select bg.*
, ROW_NUMBER() OVER (partition by bg.subject_id, bg.charttime order by s2.charttime DESC) as lastRowFiO2
, s2.fio2_chartevents
-- create our specimen prediction
, 1/(1+exp(-(-0.02544
+ 0.04598 * po2
+ coalesce(-0.15356 * spo2 , -0.15356 * 97.49420 + 0.13429)
+ coalesce( 0.00621 * fio2_chartevents , 0.00621 * 51.49550 + -0.24958)
+ coalesce( 0.10559 * hemoglobin , 0.10559 * 10.32307 + 0.05954)
+ coalesce( 0.13251 * so2 , 0.13251 * 93.66539 + -0.23172)
+ coalesce(-0.01511 * pco2 , -0.01511 * 42.08866 + -0.01630)
+ coalesce( 0.01480 * fio2 , 0.01480 * 63.97836 + -0.31142)
+ coalesce(-0.00200 * aado2 , -0.00200 * 442.21186 + -0.01328)
+ coalesce(-0.03220 * bicarbonate , -0.03220 * 22.96894 + -0.06535)
+ coalesce( 0.05384 * totalco2 , 0.05384 * 24.72632 + -0.01405)
+ coalesce( 0.08202 * lactate , 0.08202 * 3.06436 + 0.06038)
+ coalesce( 0.10956 * ph , 0.10956 * 7.36233 + -0.00617)
+ coalesce( 0.00848 * o2flow , 0.00848 * 7.59362 + -0.35803)
))) as specimen_prob
from stg2 bg
left join stg_fio2 s2
-- same patient
on bg.subject_id = s2.subject_id
-- fio2 occurred at most 4 hours before this blood gas
and s2.charttime between DATETIME_SUB(bg.charttime, INTERVAL '4' HOUR) and bg.charttime
AND s2.fio2_chartevents > 0
where bg.lastRowSpO2 = 1 -- only the row with the most recent SpO2 (if no SpO2 found lastRowSpO2 = 1)
)
select
stg3.subject_id
, stg3.hadm_id
, stg3.charttime
-- raw data indicating sample type
, specimen
-- prediction of specimen for obs missing the actual specimen
, case
when specimen is not null then specimen
when specimen_prob > 0.75 then 'ART.'
else null end as specimen_pred
, specimen_prob
-- oxygen related parameters
, so2
, po2
, pco2
, fio2_chartevents, fio2
, aado2
-- also calculate AADO2
, case
when po2 is null
OR pco2 is null
THEN NULL
WHEN fio2 IS NOT NULL
-- multiple by 100 because fio2 is in a % but should be a fraction
THEN (fio2/100) * (760 - 47) - (pco2/0.8) - po2
WHEN fio2_chartevents IS NOT NULL
THEN (fio2_chartevents/100) * (760 - 47) - (pco2/0.8) - po2
else null
end as aado2_calc
, case
when PO2 is null
THEN NULL
WHEN fio2 IS NOT NULL
-- multiply by 100 because fio2 is in a % but should be a fraction
then 100 * PO2/fio2
WHEN fio2_chartevents IS NOT NULL
-- multiply by 100 because fio2 is in a % but should be a fraction
then 100 * PO2/fio2_chartevents
else null
end as pao2fio2ratio
-- acid-base parameters
, ph, baseexcess
, bicarbonate, totalco2
-- blood count parameters
, hematocrit
, hemoglobin
, carboxyhemoglobin
, methemoglobin
-- chemistry
, chloride, calcium
, temperature
, potassium, sodium
, lactate
, glucose
-- ventilation stuff that's sometimes input
-- , intubated, tidalvolume, ventilationrate, ventilator
-- , peep, o2flow
-- , requiredo2
from stg3
where lastRowFiO2 = 1 -- only the most recent FiO2
;
blood_differential
-- 7. blood_differential
-- For reference, some common unit conversions:
-- 10^9/L == K/uL == 10^3/uL
create TABLE if not exists blood_differential as
WITH blood_diff AS
(
SELECT
MAX(subject_id) AS subject_id
, MAX(hadm_id) AS hadm_id
, MAX(charttime) AS charttime
, le.specimen_id
-- create one set of columns for percentages, and one set of columns for counts
-- we harmonize all count units into K/uL == 10^9/L
-- counts have an "_abs" suffix, percentages do not
-- absolute counts
, MAX(CASE WHEN itemid in (51300, 51301, 51755) THEN valuenum ELSE NULL END) AS wbc
, MAX(CASE WHEN itemid = 52069 THEN valuenum ELSE NULL END) AS basophils_abs
-- 52073 in K/uL, 51199 in #/uL
, MAX(CASE WHEN itemid = 52073 THEN valuenum WHEN itemid = 51199 THEN valuenum / 1000.0 ELSE NULL END) AS eosinophils_abs
-- 51133 in K/uL, 52769 in #/uL
, MAX(CASE WHEN itemid = 51133 THEN valuenum WHEN itemid = 52769 THEN valuenum / 1000.0 ELSE NULL END) AS lymphocytes_abs
-- 52074 in K/uL, 51253 in #/uL
, MAX(CASE WHEN itemid = 52074 THEN valuenum WHEN itemid = 51253 THEN valuenum / 1000.0 ELSE NULL END) AS monocytes_abs
, MAX(CASE WHEN itemid = 52075 THEN valuenum ELSE NULL END) AS neutrophils_abs
-- convert from #/uL to K/uL
, MAX(CASE WHEN itemid = 51218 THEN valuenum / 1000.0 ELSE NULL END) AS granulocytes_abs
-- percentages, equal to cell count / white blood cell count
, MAX(CASE WHEN itemid = 51146 THEN valuenum ELSE NULL END) AS basophils
, MAX(CASE WHEN itemid = 51200 THEN valuenum ELSE NULL END) AS eosinophils
, MAX(CASE WHEN itemid in (51244, 51245) THEN valuenum ELSE NULL END) AS lymphocytes
, MAX(CASE WHEN itemid = 51254 THEN valuenum ELSE NULL END) AS monocytes
, MAX(CASE WHEN itemid = 51256 THEN valuenum ELSE NULL END) AS neutrophils
-- other cell count percentages
, MAX(CASE WHEN itemid = 51143 THEN valuenum ELSE NULL END) AS atypical_lymphocytes
, MAX(CASE WHEN itemid = 51144 THEN valuenum ELSE NULL END) AS bands
, MAX(CASE WHEN itemid = 52135 THEN valuenum ELSE NULL END) AS immature_granulocytes
, MAX(CASE WHEN itemid = 51251 THEN valuenum ELSE NULL END) AS metamyelocytes
, MAX(CASE WHEN itemid = 51257 THEN valuenum ELSE NULL END) AS nrbc
-- utility flags which determine whether imputation is possible
, CASE
-- WBC is available
WHEN MAX(CASE WHEN itemid in (51300, 51301, 51755) THEN valuenum ELSE NULL END) > 0
-- and we have at least one percentage from the diff
-- sometimes the entire diff is 0%, which looks like bad data
AND SUM(CASE WHEN itemid IN (51146, 51200, 51244, 51245, 51254, 51256) THEN valuenum ELSE NULL END) > 0
THEN 1 ELSE 0 END AS impute_abs
FROM mimiciv_hosp.labevents le
WHERE le.itemid IN
(
51146, -- basophils
52069, -- Absolute basophil count
51199, -- Eosinophil Count
51200, -- Eosinophils
52073, -- Absolute Eosinophil count
51244, -- Lymphocytes
51245, -- Lymphocytes, Percent
51133, -- Absolute Lymphocyte Count
52769, -- Absolute Lymphocyte Count
51253, -- Monocyte Count
51254, -- Monocytes
52074, -- Absolute Monocyte Count
51256, -- Neutrophils
52075, -- Absolute Neutrophil Count
51143, -- Atypical lymphocytes
51144, -- Bands (%)
51218, -- Granulocyte Count
52135, -- Immature granulocytes (%)
51251, -- Metamyelocytes
51257, -- Nucleated Red Cells
-- wbc totals measured in K/uL
51300, 51301, 51755
-- 52220 (wbcp) is percentage
-- below are point of care tests which are extremely infrequent and usually low quality
-- 51697, -- Neutrophils (mmol/L)
-- below itemid do not have data as of MIMIC-IV v1.0
-- 51536, -- Absolute Lymphocyte Count
-- 51537, -- Absolute Neutrophil
-- 51690, -- Lymphocytes
-- 52151, -- NRBC
)
AND valuenum IS NOT NULL
-- differential values cannot be negative
AND valuenum >= 0
GROUP BY le.specimen_id
)
SELECT
subject_id, hadm_id, charttime, specimen_id
, wbc
-- impute absolute count if percentage & WBC is available
, ROUND((CASE
WHEN basophils_abs IS NULL AND basophils IS NOT NULL AND impute_abs = 1
THEN basophils * wbc
ELSE basophils_abs
END)::numeric, 4) AS basophils_abs
, ROUND(CASE
WHEN eosinophils_abs IS NULL AND eosinophils IS NOT NULL AND impute_abs = 1
THEN eosinophils * wbc
ELSE eosinophils_abs
END::numeric, 4) AS eosinophils_abs
, ROUND(CASE
WHEN lymphocytes_abs IS NULL AND lymphocytes IS NOT NULL AND impute_abs = 1
THEN lymphocytes * wbc
ELSE lymphocytes_abs
END::numeric, 4) AS lymphocytes_abs
, ROUND((CASE
WHEN monocytes_abs IS NULL AND monocytes IS NOT NULL AND impute_abs = 1
THEN monocytes * wbc
ELSE monocytes_abs
END)::numeric, 4) AS monocytes_abs
, ROUND((CASE
WHEN neutrophils_abs IS NULL AND neutrophils IS NOT NULL AND impute_abs = 1
THEN neutrophils * wbc
ELSE neutrophils_abs
END)::numeric, 4) AS neutrophils_abs
, basophils
, eosinophils
, lymphocytes
, monocytes
, neutrophils
-- impute bands/blasts?
, atypical_lymphocytes
, bands
, immature_granulocytes
, metamyelocytes
, nrbc
FROM blood_diff
;
cardiac_marker
-- 8. cardiac_marker
-- begin query that extracts the data
create TABLE if not exists cardiac_marker as
SELECT
MAX(subject_id) AS subject_id
, MAX(hadm_id) AS hadm_id
, MAX(charttime) AS charttime
, le.specimen_id
-- convert from itemid into a meaningful column
, MAX(CASE WHEN itemid = 51002 THEN value ELSE NULL END) AS troponin_i
, MAX(CASE WHEN itemid = 51003 THEN value ELSE NULL END) AS troponin_t
, MAX(CASE WHEN itemid = 50911 THEN valuenum ELSE NULL END) AS ck_mb
FROM mimiciv_hosp.labevents le
WHERE le.itemid IN
(
-- 51002, -- Troponin I (troponin-I is not measured in MIMIC-IV)
-- 52598, -- Troponin I, point of care, rare/poor quality
51003, -- Troponin T
50911 -- Creatinine Kinase, MB isoenzyme
)
GROUP BY le.specimen_id
;
chemistry
-- 9.chemistry
-- extract chemistry labs
-- excludes point of care tests (very rare)
-- blood gas measurements are *not* included in this query
-- instead they are in bg.sql
create TABLE if not exists chemistry as
SELECT
MAX(subject_id) AS subject_id
, MAX(hadm_id) AS hadm_id
, MAX(charttime) AS charttime
, le.specimen_id
-- convert from itemid into a meaningful column
, MAX(CASE WHEN itemid = 50862 AND valuenum <= 10 THEN valuenum ELSE NULL END) AS albumin
, MAX(CASE WHEN itemid = 50930 AND valuenum <= 10 THEN valuenum ELSE NULL END) AS globulin
, MAX(CASE WHEN itemid = 50976 AND valuenum <= 20 THEN valuenum ELSE NULL END) AS total_protein
, MAX(CASE WHEN itemid = 50868 AND valuenum <= 10000 THEN valuenum ELSE NULL END) AS aniongap
, MAX(CASE WHEN itemid = 50882 AND valuenum <= 10000 THEN valuenum ELSE NULL END) AS bicarbonate
, MAX(CASE WHEN itemid = 51006 AND valuenum <= 300 THEN valuenum ELSE NULL END) AS bun
, MAX(CASE WHEN itemid = 50893 AND valuenum <= 10000 THEN valuenum ELSE NULL END) AS calcium
, MAX(CASE WHEN itemid = 50902 AND valuenum <= 10000 THEN valuenum ELSE NULL END) AS chloride
, MAX(CASE WHEN itemid = 50912 AND valuenum <= 150 THEN valuenum ELSE NULL END) AS creatinine
, MAX(CASE WHEN itemid = 50931 AND valuenum <= 10000 THEN valuenum ELSE NULL END) AS glucose
, MAX(CASE WHEN itemid = 50983 AND valuenum <= 200 THEN valuenum ELSE NULL END) AS sodium
, MAX(CASE WHEN itemid = 50971 AND valuenum <= 30 THEN valuenum ELSE NULL END) AS potassium
FROM mimiciv_hosp.labevents le
WHERE le.itemid IN
(
-- comment is: LABEL | CATEGORY | FLUID | NUMBER OF ROWS IN LABEVENTS
50862, -- ALBUMIN | CHEMISTRY | BLOOD | 146697
50930, -- Globulin
50976, -- Total protein
50868, -- ANION GAP | CHEMISTRY | BLOOD | 769895
-- 52456, -- Anion gap, point of care test
50882, -- BICARBONATE | CHEMISTRY | BLOOD | 780733
50893, -- Calcium
50912, -- CREATININE | CHEMISTRY | BLOOD | 797476
-- 52502, Creatinine, point of care
50902, -- CHLORIDE | CHEMISTRY | BLOOD | 795568
50931, -- GLUCOSE | CHEMISTRY | BLOOD | 748981
-- 52525, Glucose, point of care
50971, -- POTASSIUM | CHEMISTRY | BLOOD | 845825
-- 52566, -- Potassium, point of care
50983, -- SODIUM | CHEMISTRY | BLOOD | 808489
-- 52579, -- Sodium, point of care
51006 -- UREA NITROGEN | CHEMISTRY | BLOOD | 791925
-- 52603, Urea, point of care
)
AND valuenum IS NOT NULL
-- lab values cannot be 0 and cannot be negative
-- .. except anion gap.
AND (valuenum > 0 OR itemid = 50868)
GROUP BY le.specimen_id
;
coagulation
-- 10.coagulation
create TABLE if not exists coagulation as
SELECT
MAX(subject_id) AS subject_id
, MAX(hadm_id) AS hadm_id
, MAX(charttime) AS charttime
, le.specimen_id
-- convert from itemid into a meaningful column
, MAX(CASE WHEN itemid = 51196 THEN valuenum ELSE NULL END) AS d_dimer
, MAX(CASE WHEN itemid = 51214 THEN valuenum ELSE NULL END) AS fibrinogen
, MAX(CASE WHEN itemid = 51297 THEN valuenum ELSE NULL END) AS thrombin
, MAX(CASE WHEN itemid = 51237 THEN valuenum ELSE NULL END) AS inr
, MAX(CASE WHEN itemid = 51274 THEN valuenum ELSE NULL END) AS pt
, MAX(CASE WHEN itemid = 51275 THEN valuenum ELSE NULL END) AS ptt
FROM mimiciv_hosp.labevents le
WHERE le.itemid IN
(
-- 51149, 52750, 52072, 52073 -- Bleeding Time, no data as of MIMIC-IV v0.4
51196, -- D-Dimer
51214, -- Fibrinogen
-- 51280, 52893, -- Reptilase Time, no data as of MIMIC-IV v0.4
-- 51281, 52161, -- Reptilase Time Control, no data as of MIMIC-IV v0.4
51297, -- thrombin
51237, -- INR
51274, -- PT
51275 -- PTT
)
AND valuenum IS NOT NULL
GROUP BY le.specimen_id
;
complete_blood_count
-- 11. complete_blood_count
-- begin query that extracts the data
create TABLE if not exists complete_blood_count as
SELECT
MAX(subject_id) AS subject_id
, MAX(hadm_id) AS hadm_id
, MAX(charttime) AS charttime
, le.specimen_id
-- convert from itemid into a meaningful column
, MAX(CASE WHEN itemid = 51221 THEN valuenum ELSE NULL END) AS hematocrit
, MAX(CASE WHEN itemid = 51222 THEN valuenum ELSE NULL END) AS hemoglobin
, MAX(CASE WHEN itemid = 51248 THEN valuenum ELSE NULL END) AS mch
, MAX(CASE WHEN itemid = 51249 THEN valuenum ELSE NULL END) AS mchc
, MAX(CASE WHEN itemid = 51250 THEN valuenum ELSE NULL END) AS mcv
, MAX(CASE WHEN itemid = 51265 THEN valuenum ELSE NULL END) AS platelet
, MAX(CASE WHEN itemid = 51279 THEN valuenum ELSE NULL END) AS rbc
, MAX(CASE WHEN itemid = 51277 THEN valuenum ELSE NULL END) AS rdw
, MAX(CASE WHEN itemid = 52159 THEN valuenum ELSE NULL END) AS rdwsd
, MAX(CASE WHEN itemid = 51301 THEN valuenum ELSE NULL END) AS wbc
FROM mimiciv_hosp.labevents le
WHERE le.itemid IN
(
51221, -- hematocrit
51222, -- hemoglobin
51248, -- MCH
51249, -- MCHC
51250, -- MCV
51265, -- platelets
51279, -- RBC
51277, -- RDW
52159, -- RDW SD
51301 -- WBC
)
AND valuenum IS NOT NULL
-- lab values cannot be 0 and cannot be negative
AND valuenum > 0
GROUP BY le.specimen_id
;
creatinine_baseline
-- 12.creatinine_baseline
-- This query extracts the serum creatinine baselines of adult patients on each hospital admission.
-- The baseline is determined by the following rules:
-- i. if the lowest creatinine value during this admission is normal (<1.1), then use the value
-- ii. if the patient is diagnosed with chronic kidney disease (CKD), then use the lowest creatinine value during the admission, although it may be rather large.
-- iii. Otherwise, we estimate the baseline using the Simplified MDRD Formula:
-- eGFR = 186 × Scr^(-1.154) × Age^(-0.203) × 0.742Female
-- Let eGFR = 75. Scr = [ 75 / 186 / Age^(-0.203) / (0.742Female) ] ^ (1/-1.154)
create TABLE if not exists creatinine_baseline as
WITH p as
(
SELECT
ag.subject_id
, ag.hadm_id
, ag.age
, p.gender
, CASE WHEN p.gender='F' THEN
POWER(75.0 / 186.0 / POWER(ag.age, -0.203) / 0.742, -1/1.154)
ELSE
POWER(75.0 / 186.0 / POWER(ag.age, -0.203), -1/1.154)
END
AS MDRD_est --估算的肾小球滤过率
FROM age ag
LEFT JOIN mimiciv_hosp.patients p
ON ag.subject_id = p.subject_id
WHERE ag.age >= 18
)
, lab as
(
SELECT
hadm_id
, MIN(creatinine) AS scr_min
FROM chemistry
GROUP BY hadm_id
)
, ckd as
(
SELECT hadm_id, MAX(1) AS CKD_flag
FROM mimiciv_hosp.diagnoses_icd
WHERE
(
SUBSTR(icd_code, 1, 3) = '585'
--筛选出 icd_code 字段以 '585' 开头并使用的是 ICD-9 版本的记录
AND
icd_version = 9
)
OR
(
SUBSTR(icd_code, 1, 3) = 'N18'
AND
icd_version = 10
)
GROUP BY 1
)
SELECT
p.hadm_id
, p.gender
, p.age
, lab.scr_min
, COALESCE(ckd.ckd_flag, 0) AS ckd
, p.MDRD_est
, CASE
WHEN lab.scr_min<=1.1 THEN scr_min
WHEN ckd.ckd_flag=1 THEN scr_min
ELSE MDRD_est END AS scr_baseline
FROM p
LEFT JOIN lab
ON p.hadm_id = lab.hadm_id
LEFT JOIN ckd
ON p.hadm_id = ckd.hadm_id
;
enzyme
-- 13.enzyme
-- begin query that extracts the data
create TABLE if not exists enzyme as
SELECT
MAX(subject_id) AS subject_id
, MAX(hadm_id) AS hadm_id
, MAX(charttime) AS charttime
, le.specimen_id
-- convert from itemid into a meaningful column
, MAX(CASE WHEN itemid = 50861 THEN valuenum ELSE NULL END) AS alt
, MAX(CASE WHEN itemid = 50863 THEN valuenum ELSE NULL END) AS alp
, MAX(CASE WHEN itemid = 50878 THEN valuenum ELSE NULL END) AS ast
, MAX(CASE WHEN itemid = 50867 THEN valuenum ELSE NULL END) AS amylase
, MAX(CASE WHEN itemid = 50885 THEN valuenum ELSE NULL END) AS bilirubin_total
, MAX(CASE WHEN itemid = 50883 THEN valuenum ELSE NULL END) AS bilirubin_direct
, MAX(CASE WHEN itemid = 50884 THEN valuenum ELSE NULL END) AS bilirubin_indirect
, MAX(CASE WHEN itemid = 50910 THEN valuenum ELSE NULL END) AS ck_cpk
, MAX(CASE WHEN itemid = 50911 THEN valuenum ELSE NULL END) AS ck_mb
, MAX(CASE WHEN itemid = 50927 THEN valuenum ELSE NULL END) AS ggt
, MAX(CASE WHEN itemid = 50954 THEN valuenum ELSE NULL END) AS ld_ldh
FROM mimiciv_hosp.labevents le
WHERE le.itemid IN
(
50861, -- Alanine transaminase (ALT)
50863, -- Alkaline phosphatase (ALP)
50878, -- Aspartate transaminase (AST)
50867, -- Amylase
50885, -- total bili
50884, -- indirect bili
50883, -- direct bili
50910, -- ck_cpk
50911, -- CK-MB
50927, -- Gamma Glutamyltransferase (GGT)
50954 -- ld_ldh
)
AND valuenum IS NOT NULL
-- lab values cannot be 0 and cannot be negative
AND valuenum > 0
GROUP BY le.specimen_id
;
gcs
-- 14.gcs
-- This query extracts the Glasgow Coma Scale, a measure of neurological function.
-- The query has a few special rules:
-- (1) The verbal component can be set to 0 if the patient is ventilated.
-- This is corrected to 5 - the overall GCS is set to 15 in these cases.
-- (2) Often only one of three components is documented. The other components
-- are carried forward.
-- ITEMIDs used:
-- METAVISION
-- 223900 GCS - Verbal Response
-- 223901 GCS - Motor Response
-- 220739 GCS - Eye Opening
-- Note:
-- The GCS for sedated patients is defaulted to 15 in this code.
-- This is in line with how the data is meant to be collected.
-- e.g., from the SAPS II publication:
-- For sedated patients, the Glasgow Coma Score before sedation was used.
-- This was ascertained either from interviewing the physician who ordered the sedation,
-- or by reviewing the patient's medical record.
create TABLE if not exists gcs as
with base as
(
select
subject_id
, ce.stay_id, ce.charttime
-- pivot each value into its own column
, max(case when ce.ITEMID = 223901 then ce.valuenum else null end) as GCSMotor
, max(case
when ce.ITEMID = 223900 and ce.VALUE = 'No Response-ETT' then 0
when ce.ITEMID = 223900 then ce.valuenum
else null
end) as GCSVerbal
, max(case when ce.ITEMID = 220739 then ce.valuenum else null end) as GCSEyes
-- convert the data into a number, reserving a value of 0 for ET/Trach
, max(case
-- endotrach/vent is assigned a value of 0
-- flag it here to later parse specially
when ce.ITEMID = 223900 and ce.VALUE = 'No Response-ETT' then 1 -- metavision
else 0 end)
as endotrachflag
, ROW_NUMBER ()
OVER (PARTITION BY ce.stay_id ORDER BY ce.charttime ASC) as rn
from mimiciv_icu.chartevents ce
-- Isolate the desired GCS variables
where ce.ITEMID in
(
-- GCS components, Metavision
223900, 223901, 220739
)
group by ce.subject_id, ce.stay_id, ce.charttime
)
, gcs as (
select b.*
, b2.GCSVerbal as GCSVerbalPrev
, b2.GCSMotor as GCSMotorPrev
, b2.GCSEyes as GCSEyesPrev
-- Calculate GCS, factoring in special case when they are intubated and prev vals
-- note that the coalesce are used to implement the following if:
-- if current value exists, use it
-- if previous value exists, use it
-- otherwise, default to normal
, case
-- replace GCS during sedation with 15
when b.GCSVerbal = 0
then 15
when b.GCSVerbal is null and b2.GCSVerbal = 0
then 15
-- if previously they were intub, but they aren't now, do not use previous GCS values
when b2.GCSVerbal = 0
then
coalesce(b.GCSMotor,6)
+ coalesce(b.GCSVerbal,5)
+ coalesce(b.GCSEyes,4)
-- otherwise, add up score normally, imputing previous value if none available at current time
else
coalesce(b.GCSMotor,coalesce(b2.GCSMotor,6))
+ coalesce(b.GCSVerbal,coalesce(b2.GCSVerbal,5))
+ coalesce(b.GCSEyes,coalesce(b2.GCSEyes,4))
end as GCS
from base b
-- join to itself within 6 hours to get previous value
left join base b2
on b.stay_id = b2.stay_id
and b.rn = b2.rn+1
and b2.charttime > DATETIME_ADD(b.charttime, INTERVAL '6' HOUR)
--检查 b2.charttime 是否晚于 b.charttime 之后的6小时。
)
-- combine components with previous within 6 hours
-- filter down to cohort which is not excluded
-- truncate charttime to the hour
, gcs_stg as
(
select
subject_id
, gs.stay_id, gs.charttime
, GCS
, coalesce(GCSMotor,GCSMotorPrev) as GCSMotor
, coalesce(GCSVerbal,GCSVerbalPrev) as GCSVerbal
, coalesce(GCSEyes,GCSEyesPrev) as GCSEyes
, case when coalesce(GCSMotor,GCSMotorPrev) is null then 0 else 1 end
+ case when coalesce(GCSVerbal,GCSVerbalPrev) is null then 0 else 1 end
+ case when coalesce(GCSEyes,GCSEyesPrev) is null then 0 else 1 end
as components_measured
, EndoTrachFlag
from gcs gs
)
-- priority is:
-- (i) complete data, (ii) non-sedated GCS, (iii) lowest GCS, (iv) charttime
, gcs_priority as
(
select
subject_id
, stay_id
, charttime
, gcs
, gcsmotor
, gcsverbal
, gcseyes
, EndoTrachFlag
, ROW_NUMBER() over
(
PARTITION BY stay_id, charttime
ORDER BY components_measured DESC, endotrachflag, gcs, charttime DESC
) as rn
from gcs_stg
)
select
gs.subject_id
, gs.stay_id
, gs.charttime
, GCS AS gcs
, GCSMotor AS gcs_motor
, GCSVerbal AS gcs_verbal
, GCSEyes AS gcs_eyes
, EndoTrachFlag AS gcs_unable
from gcs_priority gs
where rn = 1
;
原创文章(本站视频密码:66668888),作者:xujunzju,如若转载,请注明出处:https://zyicu.cn/?p=20172