# JSON in Data Science and Machine Learning

JSON plays a central role in modern data science and has become the preferred format for data exchange, configuration, and model storage.
## Why JSON in Data Science?

### Advantages for Data Science
- Flexibility - supports nested and complex data structures
- Readability - easy to inspect and debug
- Interoperability - works with all major tools
- API compatibility - the standard for REST APIs
- Version control - text-based, so Git can track and diff it
### Typical Use Cases

- Experiment and hyperparameter configuration
- Data exchange with REST APIs and microservices
- Dataset metadata and annotations (e.g. the COCO format)
- Logging training runs and metrics
- Feature store definitions
- JSONL files for large datasets
## JSON with Python for Data Science

### Basic JSON Operations
```python
import json

# Load a JSON file
with open('data.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Parse a JSON string
json_string = '{"name": "Experiment", "accuracy": 0.95}'
experiment = json.loads(json_string)

# Write results to a JSON file
results = {
    "model": "Random Forest",
    "accuracy": 0.95,
    "precision": 0.93
}
with open('results.json', 'w', encoding='utf-8') as file:
    json.dump(results, file, indent=2)
```
### Serializing NumPy Arrays
```python
import json

import numpy as np

class NumpyEncoder(json.JSONEncoder):
    """Encoder that converts NumPy types to JSON-serializable Python types."""
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        return super().default(obj)

# Example
data = {
    "features": np.array([1, 2, 3, 4, 5]),
    "mean": np.float64(3.5),
    "count": np.int64(5)
}
json_str = json.dumps(data, cls=NumpyEncoder, indent=2)
print(json_str)
```
### Pandas DataFrames and JSON
```python
from io import StringIO

import pandas as pd

# Create a DataFrame
df = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie'],
    'alter': [25, 30, 35],
    'stadt': ['Berlin', 'München', 'Hamburg']
})

# DataFrame to JSON (different orientations)

# 1. Records format (list of objects)
json_records = df.to_json(orient='records')
print(json_records)
# [{"name":"Alice","alter":25,"stadt":"Berlin"},
#  {"name":"Bob","alter":30,"stadt":"München"},
#  {"name":"Charlie","alter":35,"stadt":"Hamburg"}]

# 2. Columns format (nested by column)
json_columns = df.to_json(orient='columns')

# 3. Index format
json_index = df.to_json(orient='index')

# JSON back to a DataFrame; wrap the string in StringIO, since
# passing literal JSON strings to read_json is deprecated
df_from_json = pd.read_json(StringIO(json_records), orient='records')
```
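Real-world JSON from APIs is often nested rather than flat. pandas provides `pd.json_normalize` for flattening such structures into columns; a minimal sketch with made-up records:

```python
import pandas as pd

# Hypothetical nested API-style records
records = [
    {"id": 1, "user": {"name": "Alice", "city": "Berlin"}, "scores": {"acc": 0.95}},
    {"id": 2, "user": {"name": "Bob", "city": "München"}, "scores": {"acc": 0.91}},
]

# Nested objects become dotted column names:
# id, user.name, user.city, scores.acc
df = pd.json_normalize(records)
print(df.columns.tolist())
```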
## ML Model Configurations

### Experiment Configuration
```json
{
  "experiment": {
    "name": "image_classification_v1",
    "version": "1.0.0",
    "created_at": "2026-01-26T10:00:00Z"
  },
  "model": {
    "type": "convolutional_neural_network",
    "architecture": "ResNet50",
    "pretrained": true,
    "freeze_layers": 15
  },
  "hyperparameters": {
    "learning_rate": 0.001,
    "batch_size": 32,
    "epochs": 100,
    "optimizer": "adam",
    "loss_function": "categorical_crossentropy"
  },
  "data": {
    "train_path": "/data/train",
    "validation_path": "/data/val",
    "test_path": "/data/test",
    "augmentation": {
      "rotation_range": 20,
      "width_shift_range": 0.2,
      "height_shift_range": 0.2,
      "horizontal_flip": true
    }
  },
  "callbacks": {
    "early_stopping": {
      "monitor": "val_loss",
      "patience": 10,
      "restore_best_weights": true
    },
    "checkpoint": {
      "filepath": "models/best_model.h5",
      "save_best_only": true
    }
  }
}
```
### Loading the Configuration with Python
```python
import json
from dataclasses import dataclass
from typing import Any, Dict

@dataclass
class ExperimentConfig:
    name: str
    version: str
    model_type: str
    hyperparameters: Dict[str, Any]

    @classmethod
    def from_json(cls, config_path: str) -> "ExperimentConfig":
        with open(config_path, 'r', encoding='utf-8') as f:
            config = json.load(f)
        return cls(
            name=config['experiment']['name'],
            version=config['experiment']['version'],
            model_type=config['model']['type'],
            hyperparameters=config['hyperparameters']
        )

# Usage
config = ExperimentConfig.from_json('experiment_config.json')
print(f"Experiment: {config.name}")
print(f"Learning rate: {config.hyperparameters['learning_rate']}")
```
## Storing Training Results

### Experiment Tracking
```python
import json
import time
from datetime import datetime

class ExperimentTracker:
    def __init__(self, experiment_name: str):
        self.experiment_name = experiment_name
        self.start_time = time.time()
        self.results = {
            "experiment": experiment_name,
            "start_time": datetime.now().isoformat(),
            "epochs": [],
            "final_metrics": {}
        }

    def log_epoch(self, epoch: int, metrics: dict):
        epoch_data = {
            "epoch": epoch,
            "timestamp": datetime.now().isoformat(),
            "metrics": metrics
        }
        self.results["epochs"].append(epoch_data)

    def finalize(self, final_metrics: dict):
        self.results["final_metrics"] = final_metrics
        self.results["end_time"] = datetime.now().isoformat()
        self.results["duration_seconds"] = time.time() - self.start_time

        # Save to a timestamped file
        filename = f"{self.experiment_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(self.results, f, indent=2)
        return filename

# Example usage
tracker = ExperimentTracker("image_classifier_v1")

# Simulate training
for epoch in range(1, 6):
    metrics = {
        "train_loss": 0.5 - (epoch * 0.05),
        "train_accuracy": 0.7 + (epoch * 0.04),
        "val_loss": 0.6 - (epoch * 0.04),
        "val_accuracy": 0.65 + (epoch * 0.05)
    }
    tracker.log_epoch(epoch, metrics)

# Finish the experiment
final_metrics = {
    "best_val_accuracy": 0.89,
    "test_accuracy": 0.87,
    "parameters": 1250000
}
tracker.finalize(final_metrics)
```
## JSON for ML APIs

### Model Serving with FastAPI
```python
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import joblib
import numpy as np

app = FastAPI()

# Load the trained model
model = joblib.load('model.pkl')

class PredictionRequest(BaseModel):
    features: list[float]
    model_version: str = "1.0"

class PredictionResponse(BaseModel):
    prediction: float
    confidence: float
    model_version: str

@app.post("/predict", response_model=PredictionResponse)
async def predict(request: PredictionRequest):
    try:
        # Prepare features as a single-row 2D array
        features = np.array(request.features).reshape(1, -1)

        # Predict class and confidence
        prediction = model.predict(features)[0]
        probability = model.predict_proba(features)[0]
        confidence = float(max(probability))

        return PredictionResponse(
            prediction=float(prediction),
            confidence=confidence,
            model_version=request.model_version
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

# Batch prediction
class BatchPredictionRequest(BaseModel):
    instances: list[list[float]]

@app.post("/predict/batch")
async def batch_predict(request: BatchPredictionRequest):
    try:
        features = np.array(request.instances)
        predictions = model.predict(features)
        return {
            "predictions": predictions.tolist(),
            "count": len(predictions)
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
```
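From the client's perspective, the contract is plain JSON over HTTP. A minimal sketch of calling the endpoint with `requests`, assuming the service runs locally on port 8000 (uvicorn's default):

```python
import requests

# Hypothetical local deployment; adjust host and port as needed
url = "http://localhost:8000/predict"
payload = {
    "features": [5.1, 3.5, 1.4, 0.2],
    "model_version": "1.0"
}

response = requests.post(url, json=payload, timeout=5)
response.raise_for_status()

result = response.json()  # parsed JSON response body
print(result["prediction"], result["confidence"])
```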
## Dataset Metadata

### COCO Format for Computer Vision
```json
{
  "info": {
    "description": "Custom Object Detection Dataset",
    "version": "1.0",
    "year": 2026,
    "date_created": "2026-01-26"
  },
  "licenses": [
    {
      "id": 1,
      "name": "MIT License",
      "url": "https://opensource.org/licenses/MIT"
    }
  ],
  "images": [
    {
      "id": 1,
      "file_name": "image001.jpg",
      "width": 1920,
      "height": 1080,
      "date_captured": "2026-01-15"
    }
  ],
  "annotations": [
    {
      "id": 1,
      "image_id": 1,
      "category_id": 1,
      "bbox": [100, 150, 200, 300],
      "area": 60000,
      "segmentation": [],
      "iscrowd": 0
    }
  ],
  "categories": [
    {
      "id": 1,
      "name": "person",
      "supercategory": "human"
    }
  ]
}
```
### Processing Dataset Metadata
```python
import json
from typing import Dict, List

class DatasetMetadata:
    def __init__(self, metadata_path: str):
        with open(metadata_path, 'r', encoding='utf-8') as f:
            self.metadata = json.load(f)

    def get_images(self) -> List[Dict]:
        return self.metadata['images']

    def get_annotations_for_image(self, image_id: int) -> List[Dict]:
        return [
            ann for ann in self.metadata['annotations']
            if ann['image_id'] == image_id
        ]

    def get_category_name(self, category_id: int) -> str:
        categories = {
            cat['id']: cat['name']
            for cat in self.metadata['categories']
        }
        return categories.get(category_id, 'unknown')

    def statistics(self) -> Dict:
        return {
            "total_images": len(self.metadata['images']),
            "total_annotations": len(self.metadata['annotations']),
            "categories": len(self.metadata['categories'])
        }

# Usage
dataset = DatasetMetadata('annotations.json')
print(dataset.statistics())
```
## JSON for Feature Engineering

### Feature Store Schema
```json
{
  "feature_group": {
    "name": "customer_features",
    "version": "v1",
    "created_at": "2026-01-26T10:00:00Z",
    "features": [
      {
        "name": "customer_lifetime_value",
        "type": "float",
        "description": "Total value of customer purchases",
        "statistics": {
          "min": 0.0,
          "max": 50000.0,
          "mean": 2500.0,
          "std": 5000.0
        }
      },
      {
        "name": "days_since_last_purchase",
        "type": "integer",
        "description": "Days since customer's last purchase",
        "statistics": {
          "min": 0,
          "max": 365,
          "mean": 45,
          "std": 30
        }
      },
      {
        "name": "customer_segment",
        "type": "categorical",
        "description": "Customer segmentation category",
        "categories": ["bronze", "silver", "gold", "platinum"]
      }
    ]
  },
  "transformations": [
    {
      "name": "normalize_clv",
      "type": "standardization",
      "feature": "customer_lifetime_value"
    },
    {
      "name": "encode_segment",
      "type": "one_hot_encoding",
      "feature": "customer_segment"
    }
  ]
}
```
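A schema like this only pays off once code consumes it. Below is a minimal sketch that reads the stored statistics and applies the declared standardization to a DataFrame column, assuming the schema above is saved as `feature_group.json` (a hypothetical file name):

```python
import json

import pandas as pd

# Hypothetical file name matching the schema above
with open('feature_group.json', 'r', encoding='utf-8') as f:
    schema = json.load(f)

# Index feature definitions by name for easy lookup
features = {feat['name']: feat for feat in schema['feature_group']['features']}

df = pd.DataFrame({'customer_lifetime_value': [1200.0, 300.0, 9800.0]})

# Apply the declared standardization using the stored statistics,
# so training and serving share identical parameters
# (only the standardization transform is handled in this sketch)
for t in schema['transformations']:
    if t['type'] == 'standardization':
        stats = features[t['feature']]['statistics']
        df[t['feature']] = (df[t['feature']] - stats['mean']) / stats['std']

print(df)
```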
## JSON Lines (JSONL) for Big Data

### The JSONL Format
```python
import json

# Write JSONL: one JSON object per line
data = [
    {"id": 1, "text": "First line", "label": 1},
    {"id": 2, "text": "Second line", "label": 0},
    {"id": 3, "text": "Third line", "label": 1}
]

with open('data.jsonl', 'w', encoding='utf-8') as f:
    for item in data:
        f.write(json.dumps(item, ensure_ascii=False) + '\n')

# Read JSONL lazily, one record at a time
def read_jsonl(filepath: str):
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            yield json.loads(line.strip())

# Usage
for record in read_jsonl('data.jsonl'):
    print(record)
```
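When the file fits in memory, pandas can also read JSONL directly; for larger files, `chunksize` yields DataFrames piece by piece:

```python
import pandas as pd

# lines=True tells pandas to parse one JSON object per line
df = pd.read_json('data.jsonl', lines=True)
print(df.head())

# Chunked reading for files too big for a single DataFrame
with pd.read_json('data.jsonl', lines=True, chunksize=2) as reader:
    for chunk in reader:
        print(len(chunk))
```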
### Streaming with JSONL
```python
import json
from typing import Callable, Dict, Iterator

class JSONLProcessor:
    def __init__(self, input_path: str, output_path: str):
        self.input_path = input_path
        self.output_path = output_path

    def process_stream(self, transform_fn: Callable[[Dict], Dict]):
        """Process JSONL in streaming mode, one record at a time."""
        with open(self.input_path, 'r', encoding='utf-8') as infile, \
             open(self.output_path, 'w', encoding='utf-8') as outfile:
            for line in infile:
                record = json.loads(line.strip())
                transformed = transform_fn(record)
                outfile.write(json.dumps(transformed) + '\n')

    def filter_stream(self, filter_fn: Callable[[Dict], bool]) -> Iterator[Dict]:
        """Filter JSONL in streaming mode."""
        with open(self.input_path, 'r', encoding='utf-8') as f:
            for line in f:
                record = json.loads(line.strip())
                if filter_fn(record):
                    yield record

# Example
processor = JSONLProcessor('input.jsonl', 'output.jsonl')

# Apply a transformation
def add_processed_flag(record):
    record['processed'] = True
    return record

processor.process_stream(add_processed_flag)

# Filter
filtered = processor.filter_stream(lambda r: r.get('label') == 1)
for record in filtered:
    print(record)
```
## Best Practices

### 1. Efficient Serialization
```python
import json

import orjson  # third-party package, a faster drop-in for dumps/loads

# Standard library json
data = {"large": "dataset" * 1000}
json_str = json.dumps(data)

# orjson is faster and returns bytes rather than str
orjson_bytes = orjson.dumps(data)
orjson_str = orjson_bytes.decode('utf-8')
```
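Rather than taking the speedup on faith, it is easy to measure on your own payloads. A small timing sketch with the standard library's `timeit`; absolute numbers depend on the data shape and the machine:

```python
import json
import timeit

import orjson

# A hypothetical record-heavy payload
data = {"rows": [{"id": i, "value": i * 0.5} for i in range(10_000)]}

# Time 100 serializations of the same payload with each library
std = timeit.timeit(lambda: json.dumps(data), number=100)
fast = timeit.timeit(lambda: orjson.dumps(data), number=100)

print(f"json:   {std:.3f}s")
print(f"orjson: {fast:.3f}s")
```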
### 2. Schema Validation
```python
from jsonschema import validate, ValidationError

schema = {
    "type": "object",
    "properties": {
        "model_name": {"type": "string"},
        "accuracy": {"type": "number", "minimum": 0, "maximum": 1},
        "epochs": {"type": "integer", "minimum": 1}
    },
    "required": ["model_name", "accuracy"]
}

data = {
    "model_name": "ResNet50",
    "accuracy": 0.95,
    "epochs": 100
}

try:
    validate(instance=data, schema=schema)
    print("Validation successful")
except ValidationError as e:
    print(f"Validation error: {e.message}")
```
### 3. Compression for Large Files
```python
import gzip
import json

# Write with compression
data = {"large": "dataset"}
with gzip.open('data.json.gz', 'wt', encoding='utf-8') as f:
    json.dump(data, f)

# Read with compression
with gzip.open('data.json.gz', 'rt', encoding='utf-8') as f:
    loaded_data = json.load(f)
```
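Compression combines naturally with JSONL, since gzip can be read and written line by line; you never hold the whole file in memory. A short sketch using the hypothetical file name `data.jsonl.gz`:

```python
import gzip
import json

# Stream-write compressed JSONL, one record per line
records = [{"id": i, "label": i % 2} for i in range(3)]
with gzip.open('data.jsonl.gz', 'wt', encoding='utf-8') as f:
    for rec in records:
        f.write(json.dumps(rec) + '\n')

# Stream-read it back one record at a time
with gzip.open('data.jsonl.gz', 'rt', encoding='utf-8') as f:
    for line in f:
        print(json.loads(line))
```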
## Summary

JSON is indispensable in data science for:
- Configuration management - hyperparameters and experiment setups
- Data exchange - APIs and microservices
- Metadata - dataset descriptions and annotations
- Storing results - training logs and metrics
- Feature engineering - feature store definitions

With the right tools and best practices, JSON is a powerful format for data science workflows.