Data Model – Canonical Patient Bundle¶
Overview¶
The Canonical Patient Bundle is the single source of truth for all patient oncology data in Entheory.AI. It is a comprehensive JSON document that aggregates data from multiple hospital systems into a unified, validated structure.
Design Principles¶
- Single Source of Truth: One bundle per patient contains ALL clinical data
- Append-Only: Historical data is never deleted, only marked as `superseded`
- Provenance: Every data element includes source system and timestamp
- Validation: Strictly validated against JSON schema before persistence
- FHIR-Aligned: Structure maps cleanly to FHIR R4 resources
File Organization¶
src/data/patients/
├─ ABHA-12345678901/
│ ├─ bundle.json # Current canonical bundle
│ ├─ bundle.backup.json # Previous version (for rollback)
│ └─ bundle.pending.json # Partial data (validation failed)
├─ ABHA-98765432102/
│ └─ bundle.json
└─ ...
src/data/processed_patients.json # Aggregated cache for patient list
Bundle Schema (Top-Level)¶
/**
 * Top-level shape of the canonical patient bundle: identifying metadata,
 * one property per clinical data section, and bundle-level provenance.
 */
interface PatientBundle {
  /* ---------- Metadata ---------- */

  /** Bundle version string, e.g. "1.0". */
  bundleVersion: string;
  /** Internal ID, e.g. "case_001". */
  patientId: string;
  /** ABHA ID, e.g. "ABHA-12345678901". */
  abhaId: string;
  /** ISO 8601 timestamp. */
  lastUpdated: string;

  /* ---------- Core data sections ---------- */

  demographics: Demographics;
  cancer: CancerData;
  vitals: Vitals[];
  labs: LabResult[];
  imaging: ImagingStudy[];
  pathology: PathologyReport[];
  genomics: GenomicsData;
  therapy: TreatmentLine[];
  medications: Medication[];
  allergies: Allergy[];
  /** OCR outputs. */
  documents: Document[];
  /** ASR outputs. */
  transcripts: Transcript[];
  timeline: TimelineEvent[];

  /* ---------- Provenance & audit ---------- */

  provenance: Provenance;
}
Section Schemas¶
1. Demographics¶
/**
 * Patient identity and contact details: name, date of birth, gender,
 * contact info, identifiers, and optional emergency-contact and
 * insurance records.
 */
interface Demographics {
  name: {
    firstName: string;
    middleName?: string;
    lastName: string;
    /** Computed; used for display. */
    fullName: string;
  };

  /** Format: YYYY-MM-DD. */
  dateOfBirth: string;
  /** Computed from the date of birth. */
  age: number;
  gender: "Male" | "Female" | "Other" | "Unknown";

  contactInfo: {
    phone?: string;
    email?: string;
    address?: {
      line1: string;
      line2?: string;
      city: string;
      state: string;
      pincode: string;
      /** MVP: India only. */
      country: "India";
    };
  };

  identifiers: {
    /** Primary identifier. */
    abhaId: string;
    /** Hospital Medical Record Number. */
    hospitalMrn: string;
    /** Last 4 digits only (privacy). */
    aadharLastFour?: string;
  };

  emergencyContact?: {
    name: string;
    relationship: string;
    phone: string;
  };

  insurance?: {
    type: "CGHS" | "ECHS" | "Ayushman Bharat" | "Private" | "Self-Pay";
    policyNumber?: string;
    validUntil?: string;
  };
}
2. Cancer Data (Core)¶
/**
 * Core oncology record: primary site, histology, staging, grade,
 * diagnosis details, and optional metastasis, biomarker and
 * performance-status data.
 */
interface CancerData {
  /** E.g. "Breast", "Lung", "Colorectal". */
  primarySite: string;
  /** E.g. "Invasive Ductal Carcinoma", "Adeno". */
  histology: string;

  stage: {
    /** E.g. "IIA", "IIIB". */
    clinical?: string;
    pathologic?: string;
    /** TNM components, e.g. t: "T2", n: "N1", m: "M0". */
    tnm?: { t: string; n: string; m: string };
  };

  grade?: "G1" | "G2" | "G3" | "GX";
  /** Format: YYYY-MM-DD. */
  diagnosisDate: string;
  diagnosisSource: "Biopsy" | "Imaging" | "Clinical" | "Unknown";

  metastases?: {
    present: boolean;
    /** E.g. ["Liver", "Bone"]. */
    sites: string[];
  };

  /** Key prognostic/predictive markers, keyed by an arbitrary string. */
  biomarkers?: Record<
    string,
    {
      /** E.g. "ER", "PR", "HER2", "PD-L1". */
      name: string;
      /** E.g. "Positive", "Negative", "95%". */
      value: string;
      date: string;
      /** Report ID or system. */
      source: string;
    }
  >;

  performanceStatus?: {
    /** ECOG scale. */
    ecog?: 0 | 1 | 2 | 3 | 4 | 5;
    /** Range 0-100. */
    karnofsky?: number;
    assessmentDate: string;
  };
}
3. Vitals¶
/**
 * One vital-signs record. Only the ID, date and source are required;
 * each individual measurement is optional.
 */
interface Vitals {
  /** E.g. "vital_2024-12-03_001". */
  vitalId: string;
  /** Format: YYYY-MM-DD. */
  date: string;
  /** Format: HH:MM. */
  time?: string;

  weight?: { value: number; unit: "kg" };
  height?: { value: number; unit: "cm" };
  /** Computed: weight(kg) / (height(m)^2). */
  bmi?: number;

  bloodPressure?: {
    /** In mmHg. */
    systolic: number;
    diastolic: number;
  };
  /** In bpm. */
  heartRate?: number;
  /** In breaths/min. */
  respiratoryRate?: number;
  temperature?: { value: number; unit: "C" | "F" };
  /** Range 0-100%. */
  spO2?: number;

  /** E.g. "EMR-HIMS", "Manual Entry". */
  source: string;
  /** Nurse/physician name. */
  recordedBy?: string;
}
4. Lab Results¶
/**
 * A single laboratory test result, with an optional delta against a
 * previous result and the provenance of the incoming message.
 */
interface LabResult {
  /** E.g. "lab_2024-12-03_oru_123". */
  labId: string;
  /** E.g. "Hemoglobin", "Creatinine". */
  testName: string;
  /** LOINC code if available. */
  loincCode?: string;

  /** Numeric or textual result, e.g. 12.5 or "Positive". */
  value: number | string;
  /** E.g. "g/dL", "mg/dL". */
  unit?: string;
  /** E.g. "12-16 g/dL". */
  referenceRange?: string;
  status: "normal" | "abnormal" | "critical" | "unknown";
  /** High, Low, Critical High/Low. */
  abnormalFlag?: "H" | "L" | "HH" | "LL";

  /** Format: YYYY-MM-DD. */
  date: string;
  time?: string;
  category: "Hematology" | "Chemistry" | "Tumor Marker" | "Coagulation" | "Urinalysis" | "Other";

  /** Change from the previous value. */
  delta?: {
    value: number;
    percent: number;
    direction: "up" | "down" | "stable";
    /** labId of the previous test. */
    comparedTo: string;
  };

  provenance: {
    /** E.g. "LIS-HOSPITALX". */
    source: string;
    /** Control ID from the HL7 message. */
    hl7MessageId?: string;
    receivedAt: string;
  };
}
Example Lab Categories:
- Hematology: CBC, WBC, Hemoglobin, Platelets
- Chemistry: BUN, Creatinine, ALT, AST, Bilirubin, Albumin
- Tumor Markers: CA-125, CA 15-3, CEA, PSA, AFP, CA 19-9
- Coagulation: PT, INR, aPTT
- Urinalysis: Specific gravity, protein, glucose
5. Imaging Studies¶
/**
 * One imaging study: identification, modality, radiologist findings,
 * optional structured measurements, asset availability and provenance.
 */
interface ImagingStudy {
  /** DICOM Study Instance UID. */
  studyId: string;
  studyDate: string;
  modality: "CT" | "MRI" | "PET" | "X-Ray" | "Ultrasound" | "Mammography" | "PET-CT";
  /** E.g. "Chest", "Abdomen", "Brain". */
  bodyPart: string;
  /** E.g. "CT Chest with Contrast". */
  description?: string;
  /** Radiologist impression (text). */
  findings?: string;

  /** Structured data, if available. */
  measurements?: {
    /** RECIST target lesions. */
    recist?: {
      targetLesions: Array<{
        lesionId: string;
        location: string;
        /** In mm. */
        diameter: number;
      }>;
      sumOfDiameters: number;
    };
    /** PET SUVmax value. */
    suvmax?: number;
  };

  /** DICOM file links. */
  assets?: {
    manifestPath?: string;
    assetCount?: number;
    /** Range 0.0-1.0 (% of assets available). */
    completeness: number;
    /** Link to PACS viewer. */
    viewerUrl?: string;
  };

  status: "complete" | "incomplete" | "preliminary";

  provenance: {
    /** E.g. "PACS-HOSPITALX". */
    source: string;
    receivedAt: string;
  };
}
6. Pathology Reports¶
/**
 * A pathology report: specimen, diagnosis, and optional biomarker,
 * margin and lymph-node findings, plus the free-text report and
 * provenance.
 */
interface PathologyReport {
  reportId: string;
  reportDate: string;
  reportType: "Biopsy" | "Surgical Pathology" | "Cytology";

  specimen: {
    /** E.g. "Breast lumpectomy". */
    type: string;
    /** E.g. "Left breast, upper outer quadrant". */
    site: string;
    collectionDate?: string;
  };

  diagnosis: {
    /** E.g. "Invasive Ductal Carcinoma". */
    primary: string;
    histology?: string;
    grade?: "G1" | "G2" | "G3" | "GX";
  };

  /** Biomarker results, keyed by an arbitrary string. */
  biomarkers?: Record<
    string,
    {
      /** E.g. "ER", "PR", "HER2", "Ki67". */
      name: string;
      /** E.g. "Positive (95%)". */
      result: string;
      /** E.g. "IHC", "FISH". */
      method?: string;
    }
  >;

  margins?: {
    status: "Negative" | "Positive" | "Close";
    /** E.g. "2mm". */
    closestMargin?: string;
  };

  lymphNodes?: { examined: number; positive: number };

  /** Free text from the pathologist. */
  fullReportText?: string;

  provenance: {
    /** E.g. "Pathology-HOSPITALX". */
    source: string;
    pathologist?: string;
    receivedAt: string;
  };
}
7. Genomics Data¶
/**
 * Genomic test results: somatic mutations, genomic biomarkers
 * (TMB, MSI, PD-L1), copy-number variations and provenance.
 */
interface GenomicsData {
  testDate: string;
  testType: "NGS Panel" | "WES" | "WGS" | "Single Gene" | "Other";
  /** E.g. "FoundationOne CDx", "OncoPanel". */
  panelName?: string;

  somaticMutations?: Array<{
    /** E.g. "EGFR", "KRAS", "TP53". */
    gene: string;
    /** HGVS notation, e.g. "NM_005228.3:c.2235_2249del". */
    variant: string;
    /** Variant Allele Frequency (0-100%). */
    vaf: number;
    classification: "Pathogenic" | "Likely Pathogenic" | "VUS" | "Likely Benign" | "Benign";
    /** Clinically actionable per guidelines. */
    actionable: boolean;
  }>;

  biomarkers?: {
    /** Tumor Mutational Burden. */
    tmb?: {
      /** In mutations/megabase. */
      value: number;
      interpretation: "High" | "Intermediate" | "Low";
    };
    /** Microsatellite Instability. */
    msi?: { status: "MSI-H" | "MSS" };
    pdl1?: {
      /** Tumor Proportion Score (%). */
      tps?: number;
      /** Combined Positive Score. */
      cps?: number;
      method: "IHC 22C3" | "IHC SP263" | "Other";
    };
  };

  copyNumberVariations?: Array<{
    gene: string;
    type: "Amplification" | "Deletion";
    copyNumber?: number;
  }>;

  /** Path to the JSON or PDF report. */
  fullReportPath?: string;

  provenance: {
    /** E.g. "Genomics-Lab-X". */
    source: string;
    receivedAt: string;
  };
}
8. Treatment Lines¶
/**
 * One line of therapy: regimen, dates, status, cycle counts, response
 * assessment, adverse events and provenance.
 */
interface TreatmentLine {
  lineId: string;
  /** 1st line, 2nd line, etc. */
  lineNumber: number;
  intent: "Curative" | "Adjuvant" | "Neoadjuvant" | "Palliative";
  /** E.g. "FOLFOX", "Pembrolizumab", "AC-T". */
  regimenName: string;
  /** E.g. ["5-FU", "Oxaliplatin", "Leucovorin"]. */
  drugs?: string[];

  startDate: string;
  endDate?: string;
  status: "Planned" | "Ongoing" | "Completed" | "Discontinued";
  /** E.g. "Toxicity", "Progression", "Patient Request". */
  discontinuationReason?: string;

  cycles?: { planned?: number; completed?: number };

  response?: {
    assessment: "CR" | "PR" | "SD" | "PD" | "Not Assessed";
    assessmentDate: string;
    method: "RECIST" | "Clinical" | "Imaging";
  };

  adverseEvents?: Array<{
    event: string;
    /** CTCAE grade. */
    grade: 1 | 2 | 3 | 4 | 5;
    date: string;
  }>;

  provenance: {
    source: string;
    enteredBy?: string;
  };
}
9. Medications & Allergies¶
/**
 * A medication entry: drug identity, dosing details, category,
 * date range, status and prescriber.
 */
interface Medication {
  medicationId: string;
  name: string;
  genericName?: string;

  /** E.g. "500mg". */
  dosage?: string;
  route?: "Oral" | "IV" | "IM" | "Subcutaneous" | "Other";
  /** E.g. "BID", "TID", "weekly". */
  frequency?: string;

  category: "Chemotherapy" | "Targeted" | "Immunotherapy" | "Supportive" | "Comorbidity";

  startDate?: string;
  endDate?: string;
  status: "Active" | "Discontinued" | "On Hold";
  prescribedBy?: string;
}
/**
 * A recorded allergy: allergen, reaction, severity, and optional
 * onset date and reporter.
 */
interface Allergy {
  allergyId: string;
  /** Drug name or substance. */
  allergen: string;
  /** E.g. "Anaphylaxis", "Rash", "Nausea". */
  reaction?: string;
  severity: "Mild" | "Moderate" | "Severe" | "Life-Threatening";
  onsetDate?: string;
  reportedBy?: string;
}
10. Documents (OCR Outputs)¶
/**
 * An uploaded document together with its OCR output.
 *
 * NOTE(review): this name matches the DOM global `Document`; confirm the
 * declaring file is a module so the two declarations do not merge.
 */
interface Document {
  documentId: string;
  type: "Discharge Summary" | "Lab Report" | "Imaging Report" | "Consult Note" | "Other";

  /** S3 path to the original PDF. */
  filePath: string;
  /** SHA-256 hash of the file. */
  hash: string;

  /** OCR output. */
  extractedText: string;
  language: "en" | "hi-IN";
  /** E.g. "tesseract-5.3.0". */
  ocrEngine: string;
  /** E.g. "lstm". */
  ocrMode?: string;
  /** Range 0.0-1.0. */
  confidence: number;

  uploadDate: string;
  uploadedBy?: string;
  /** True if confidence <0.70. */
  needsReview: boolean;
}
11. Transcripts (ASR Outputs)¶
/**
 * An audio recording together with its ASR output, including the
 * consent flag and optional word-level timestamps.
 */
interface Transcript {
  transcriptId: string;
  /** S3 path to the audio file. */
  audioPath: string;
  /** Full text. */
  transcript: string;
  language: "en-IN" | "hi-IN";
  /** E.g. "whisper-large-v3". */
  asrModel: string;
  confidence: number;
  /** E.g. "00:04:32". */
  duration: string;
  recordingDate: string;
  /** Patient consent obtained. */
  consentFlag: boolean;

  /** Word-level timestamps (optional). */
  timestamps?: Array<{
    start: number;
    end: number;
    word: string;
  }>;
}
12. Timeline Events¶
/**
 * A single entry on the patient timeline, optionally linked to a
 * record in another bundle section.
 */
interface TimelineEvent {
  eventId: string;
  eventType: "Diagnosis" | "Admission" | "Discharge" | "Procedure" | "Treatment Start" | "Lab" | "Imaging" | "Note";

  date: string;
  time?: string;

  /** E.g. "Chemotherapy Cycle 1 Started". */
  title: string;
  description?: string;

  /** Reference to labs[].labId, imaging[].studyId, etc. */
  linkedDataId?: string;
  linkedDataType?: "lab" | "imaging" | "pathology" | "genomics" | "treatment" | "document";

  source: string;
}
13. Provenance & Audit¶
/**
 * Bundle-level provenance and audit trail: creation and update times,
 * contributing data sources, and an optional history of updates.
 */
interface Provenance {
  /** First bundle creation. */
  createdAt: string;
  /** Most recent update. */
  lastUpdated: string;

  dataSources: Array<{
    /** E.g. "EMR-HOSPITALX", "LIS-HOSPITALX". */
    name: string;
    type: "HL7" | "FHIR" | "JSON" | "Manual Entry";
    lastSync?: string;
    /** How many records came from this source. */
    recordCount?: number;
  }>;

  updateHistory?: Array<{
    timestamp: string;
    /** System or user. */
    updatedBy: string;
    changeType: "Ingestion" | "Manual Edit" | "Correction";
    changeDescription: string;
  }>;
}
Validation Rules¶
Required Fields¶
- `bundleVersion`, `patientId`, `abhaId`, `lastUpdated`
- `demographics.name.fullName`
- `demographics.dateOfBirth`
- `demographics.gender`
- `cancer.primarySite`, `cancer.diagnosisDate`
Data Integrity¶
- All dates in `YYYY-MM-DD` format
- All timestamps in ISO 8601 format
- ABHA ID: 12-digit numeric string
- Email: Valid email format (if provided)
- Phone: 10-digit Indian mobile format (if provided)
Referential Integrity¶
- `timeline[].linkedDataId` must reference a valid item in the corresponding array
- `labs[].delta.comparedTo` must reference a valid `labId`
Example Bundle (Simplified)¶
{
"bundleVersion": "1.0",
"patientId": "case_001",
"abhaId": "ABHA-12345678901",
"lastUpdated": "2024-12-03T10:15:30Z",
"demographics": {
"name": {
"firstName": "Priya",
"lastName": "Sharma",
"fullName": "Priya Sharma"
},
"dateOfBirth": "1975-06-15",
"age": 49,
"gender": "Female",
"identifiers": {
"abhaId": "ABHA-12345678901",
"hospitalMrn": "MRN-H1-98765"
}
},
"cancer": {
"primarySite": "Breast",
"histology": "Invasive Ductal Carcinoma",
"stage": {
"pathologic": "IIB",
"tnm": { "t": "T2", "n": "N1", "m": "M0" }
},
"grade": "G2",
"diagnosisDate": "2024-01-15",
"biomarkers": {
"ER": { "name": "ER", "value": "Positive (95%)", "date": "2024-01-20", "source": "path_report_001" },
"PR": { "name": "PR", "value": "Positive (80%)", "date": "2024-01-20", "source": "path_report_001" },
"HER2": { "name": "HER2", "value": "Negative", "date": "2024-01-20", "source": "path_report_001" }
}
},
"labs": [
{
"labId": "lab_2024-12-01_001",
"testName": "Hemoglobin",
"value": 12.8,
"unit": "g/dL",
"referenceRange": "12-16 g/dL",
"status": "normal",
"date": "2024-12-01",
"category": "Hematology",
"delta": {
"value": -0.2,
"percent": -1.5,
"direction": "down",
"comparedTo": "lab_2024-11-15_001"
},
"provenance": {
"source": "LIS-HOSPITALX",
"hl7MessageId": "ORU-20241201-001",
"receivedAt": "2024-12-01T08:30:00Z"
}
}
],
"therapy": [
{
"lineId": "tx_line_1",
"lineNumber": 1,
"intent": "Adjuvant",
"regimenName": "AC-T",
"drugs": ["Doxorubicin", "Cyclophosphamide", "Paclitaxel"],
"startDate": "2024-03-01",
"endDate": "2024-08-15",
"status": "Completed",
"cycles": { "planned": 8, "completed": 8 },
"response": {
"assessment": "CR",
"assessmentDate": "2024-09-01",
"method": "Imaging"
}
}
],
"provenance": {
"createdAt": "2024-01-15T12:00:00Z",
"lastUpdated": "2024-12-03T10:15:30Z",
"dataSources": [
{ "name": "EMR-HOSPITALX", "type": "HL7", "lastSync": "2024-12-03T10:00:00Z", "recordCount": 45 },
{ "name": "LIS-HOSPITALX", "type": "HL7", "lastSync": "2024-12-01T08:30:00Z", "recordCount": 128 },
{ "name": "PACS-HOSPITALX", "type": "JSON", "lastSync": "2024-11-28T14:00:00Z", "recordCount": 12 }
]
}
}
FHIR Mapping¶
| Bundle Section | FHIR Resource |
|---|---|
| `demographics` | Patient |
| `cancer` | Condition |
| `vitals` | Observation (category: vital-signs) |
| `labs` | Observation (category: laboratory) |
| `imaging` | ImagingStudy |
| `pathology` | DiagnosticReport (category: PAT) |
| `genomics` | DiagnosticReport (category: GE) + Observation (variants) |
| `therapy` | MedicationStatement, Procedure |
| `medications` | MedicationStatement |
| `allergies` | AllergyIntolerance |
| `timeline` | (No direct mapping, derived view) |
Document Owner: Data Architect / Backend Lead
Last Updated: 2024-12-03
Related: High-Level Architecture | APIs