
# JSON in Data Science and Machine Learning

JSON plays a central role in modern data science and machine learning: it is the preferred format for data exchange, configuration, and model storage.

## Why JSON in Data Science?

### Advantages for Data Science

  • Flexibility - supports nested and complex data structures (see the example below)
  • Readability - easy to inspect and debug
  • Interoperability - works with all major data science tools
  • API compatibility - the standard format for REST APIs
  • Version control - text-based, so Git can track changes
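As a small illustration of the flexibility point, a single record can freely nest scalars, lists, and objects (the field names here are invented for the example):

```json
{
  "run_id": "exp-042",
  "metrics": { "accuracy": 0.95, "loss": 0.18 },
  "tags": ["baseline", "resnet"],
  "hardware": { "gpu": "A100", "count": 2 }
}
```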

### Typical Use Cases

  • Data exchange between systems
  • Configuration of ML models
  • Storage of training results
  • API responses from ML services
  • Metadata for datasets

## JSON with Python for Data Science

### Basic JSON Operations

```python
import json
import pandas as pd
import numpy as np

# Load a JSON file
with open('data.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Parse a JSON string
json_string = '{"name": "Experiment", "accuracy": 0.95}'
experiment = json.loads(json_string)

# Write data to a JSON file
results = {
    "model": "Random Forest",
    "accuracy": 0.95,
    "precision": 0.93
}

with open('results.json', 'w', encoding='utf-8') as file:
    json.dump(results, file, indent=2)
```

### Serializing NumPy Arrays

NumPy types are not JSON-serializable out of the box, so a custom encoder converts them to native Python types first:

```python
import json
import numpy as np

class NumpyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        return super().default(obj)

# Example
data = {
    "features": np.array([1, 2, 3, 4, 5]),
    "mean": np.float64(3.5),
    "count": np.int64(5)
}

json_str = json.dumps(data, cls=NumpyEncoder, indent=2)
print(json_str)
```
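Note that the round trip is not symmetric: `json.loads` returns plain lists and floats, so arrays have to be rebuilt explicitly. A minimal sketch:

```python
import json
import numpy as np

json_str = '{"features": [1, 2, 3, 4, 5], "mean": 3.5}'

# json.loads yields plain Python lists; rebuild the array by hand
decoded = json.loads(json_str)
features = np.array(decoded["features"])  # back to an np.ndarray
```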

### Pandas DataFrames and JSON

```python
import pandas as pd
from io import StringIO

# Create a DataFrame
df = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie'],
    'alter': [25, 30, 35],
    'stadt': ['Berlin', 'München', 'Hamburg']
})

# DataFrame to JSON (different orientations)

# 1. Records format (list of objects)
json_records = df.to_json(orient='records', indent=2)
print(json_records)
# [
#   {"name":"Alice","alter":25,"stadt":"Berlin"},
#   {"name":"Bob","alter":30,"stadt":"München"},
#   {"name":"Charlie","alter":35,"stadt":"Hamburg"}
# ]

# 2. Columns format (nested by column)
json_columns = df.to_json(orient='columns', indent=2)

# 3. Index format
json_index = df.to_json(orient='index', indent=2)

# JSON back to a DataFrame (recent pandas versions expect
# file-like input, so wrap literal strings in StringIO)
df_from_json = pd.read_json(StringIO(json_records), orient='records')
```

## ML Model Configurations

### Experiment Configuration

```json
{
  "experiment": {
    "name": "image_classification_v1",
    "version": "1.0.0",
    "created_at": "2026-01-26T10:00:00Z"
  },
  "model": {
    "type": "convolutional_neural_network",
    "architecture": "ResNet50",
    "pretrained": true,
    "freeze_layers": 15
  },
  "hyperparameters": {
    "learning_rate": 0.001,
    "batch_size": 32,
    "epochs": 100,
    "optimizer": "adam",
    "loss_function": "categorical_crossentropy"
  },
  "data": {
    "train_path": "/data/train",
    "validation_path": "/data/val",
    "test_path": "/data/test",
    "augmentation": {
      "rotation_range": 20,
      "width_shift_range": 0.2,
      "height_shift_range": 0.2,
      "horizontal_flip": true
    }
  },
  "callbacks": {
    "early_stopping": {
      "monitor": "val_loss",
      "patience": 10,
      "restore_best_weights": true
    },
    "checkpoint": {
      "filepath": "models/best_model.h5",
      "save_best_only": true
    }
  }
}
```

### Loading the Configuration with Python

```python
import json
from dataclasses import dataclass
from typing import Dict, Any

@dataclass
class ExperimentConfig:
    name: str
    version: str
    model_type: str
    hyperparameters: Dict[str, Any]

    @classmethod
    def from_json(cls, config_path: str):
        with open(config_path, 'r') as f:
            config = json.load(f)
        return cls(
            name=config['experiment']['name'],
            version=config['experiment']['version'],
            model_type=config['model']['type'],
            hyperparameters=config['hyperparameters']
        )

# Usage
config = ExperimentConfig.from_json('experiment_config.json')
print(f"Experiment: {config.name}")
print(f"Learning Rate: {config.hyperparameters['learning_rate']}")
```

## Storing Training Results

### Experiment Tracking

```python
import json
import time
from datetime import datetime

class ExperimentTracker:
    def __init__(self, experiment_name: str):
        self.experiment_name = experiment_name
        self.start_time = time.time()
        self.results = {
            "experiment": experiment_name,
            "start_time": datetime.now().isoformat(),
            "epochs": [],
            "final_metrics": {}
        }

    def log_epoch(self, epoch: int, metrics: dict):
        epoch_data = {
            "epoch": epoch,
            "timestamp": datetime.now().isoformat(),
            "metrics": metrics
        }
        self.results["epochs"].append(epoch_data)

    def finalize(self, final_metrics: dict):
        self.results["final_metrics"] = final_metrics
        self.results["end_time"] = datetime.now().isoformat()
        self.results["duration_seconds"] = time.time() - self.start_time

        # Save to a timestamped file
        filename = f"{self.experiment_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        with open(filename, 'w') as f:
            json.dump(self.results, f, indent=2)
        return filename

# Example usage
tracker = ExperimentTracker("image_classifier_v1")

# Simulate training
for epoch in range(1, 6):
    metrics = {
        "train_loss": 0.5 - (epoch * 0.05),
        "train_accuracy": 0.7 + (epoch * 0.04),
        "val_loss": 0.6 - (epoch * 0.04),
        "val_accuracy": 0.65 + (epoch * 0.05)
    }
    tracker.log_epoch(epoch, metrics)

# Finish the experiment
final_metrics = {
    "best_val_accuracy": 0.89,
    "test_accuracy": 0.87,
    "parameters": 1250000
}
tracker.finalize(final_metrics)
```

## JSON for ML APIs

### Model Serving with FastAPI

```python
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import numpy as np
import joblib

app = FastAPI()

# Load the trained model
model = joblib.load('model.pkl')

class PredictionRequest(BaseModel):
    features: list[float]
    model_version: str = "1.0"

class PredictionResponse(BaseModel):
    prediction: float
    confidence: float
    model_version: str

@app.post("/predict", response_model=PredictionResponse)
async def predict(request: PredictionRequest):
    try:
        # Prepare features
        features = np.array(request.features).reshape(1, -1)

        # Predict
        prediction = model.predict(features)[0]
        probability = model.predict_proba(features)[0]
        confidence = float(max(probability))

        return PredictionResponse(
            prediction=float(prediction),
            confidence=confidence,
            model_version=request.model_version
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

# Batch prediction
class BatchPredictionRequest(BaseModel):
    instances: list[list[float]]

@app.post("/predict/batch")
async def batch_predict(request: BatchPredictionRequest):
    try:
        features = np.array(request.instances)
        predictions = model.predict(features)
        return {
            "predictions": predictions.tolist(),
            "count": len(predictions)
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
```
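Clients exchange plain JSON with this service. A minimal sketch using the `requests` library (host and port are assumptions; adjust them to your deployment):

```python
import requests

# Hypothetical local endpoint for the FastAPI app above
response = requests.post(
    "http://localhost:8000/predict",
    json={"features": [5.1, 3.5, 1.4, 0.2], "model_version": "1.0"}
)
response.raise_for_status()
print(response.json())  # e.g. {"prediction": 0.0, "confidence": 0.97, "model_version": "1.0"}
```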

## Dataset Metadata

### COCO Format for Computer Vision

```json
{
  "info": {
    "description": "Custom Object Detection Dataset",
    "version": "1.0",
    "year": 2026,
    "date_created": "2026-01-26"
  },
  "licenses": [
    {
      "id": 1,
      "name": "MIT License",
      "url": "https://opensource.org/licenses/MIT"
    }
  ],
  "images": [
    {
      "id": 1,
      "file_name": "image001.jpg",
      "width": 1920,
      "height": 1080,
      "date_captured": "2026-01-15"
    }
  ],
  "annotations": [
    {
      "id": 1,
      "image_id": 1,
      "category_id": 1,
      "bbox": [100, 150, 200, 300],
      "area": 60000,
      "segmentation": [],
      "iscrowd": 0
    }
  ],
  "categories": [
    {
      "id": 1,
      "name": "person",
      "supercategory": "human"
    }
  ]
}
```

### Processing Dataset Metadata

```python
import json
from typing import List, Dict

class DatasetMetadata:
    def __init__(self, metadata_path: str):
        with open(metadata_path, 'r') as f:
            self.metadata = json.load(f)

    def get_images(self) -> List[Dict]:
        return self.metadata['images']

    def get_annotations_for_image(self, image_id: int) -> List[Dict]:
        return [
            ann for ann in self.metadata['annotations']
            if ann['image_id'] == image_id
        ]

    def get_category_name(self, category_id: int) -> str:
        categories = {
            cat['id']: cat['name']
            for cat in self.metadata['categories']
        }
        return categories.get(category_id, 'unknown')

    def statistics(self) -> Dict:
        return {
            "total_images": len(self.metadata['images']),
            "total_annotations": len(self.metadata['annotations']),
            "categories": len(self.metadata['categories'])
        }

# Usage
dataset = DatasetMetadata('annotations.json')
print(dataset.statistics())
```

## JSON for Feature Engineering

### Feature Store Schema

```json
{
  "feature_group": {
    "name": "customer_features",
    "version": "v1",
    "created_at": "2026-01-26T10:00:00Z",
    "features": [
      {
        "name": "customer_lifetime_value",
        "type": "float",
        "description": "Total value of customer purchases",
        "statistics": {
          "min": 0.0,
          "max": 50000.0,
          "mean": 2500.0,
          "std": 5000.0
        }
      },
      {
        "name": "days_since_last_purchase",
        "type": "integer",
        "description": "Days since customer's last purchase",
        "statistics": {
          "min": 0,
          "max": 365,
          "mean": 45,
          "std": 30
        }
      },
      {
        "name": "customer_segment",
        "type": "categorical",
        "description": "Customer segmentation category",
        "categories": ["bronze", "silver", "gold", "platinum"]
      }
    ]
  },
  "transformations": [
    {
      "name": "normalize_clv",
      "type": "standardization",
      "feature": "customer_lifetime_value"
    },
    {
      "name": "encode_segment",
      "type": "one_hot_encoding",
      "feature": "customer_segment"
    }
  ]
}
```
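The schema is purely declarative; applying it is left to the pipeline. Below is a minimal sketch (the file name and helper function are our own invention, not a feature-store API) that standardizes a raw value using the stored statistics:

```python
import json

# Hypothetical file containing the schema shown above
with open('feature_store.json') as f:
    schema = json.load(f)

# Map feature names to their stored statistics
stats = {
    feat["name"]: feat.get("statistics")
    for feat in schema["feature_group"]["features"]
}

def standardize(feature_name: str, value: float) -> float:
    """z-score a raw value using the schema's mean/std."""
    s = stats[feature_name]
    return (value - s["mean"]) / s["std"]

print(standardize("customer_lifetime_value", 4000.0))  # (4000 - 2500) / 5000 = 0.3
```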

## JSON Lines (JSONL) for Big Data

### The JSONL Format

```python
import json

# Write JSONL: one JSON object per line
data = [
    {"id": 1, "text": "First line", "label": 1},
    {"id": 2, "text": "Second line", "label": 0},
    {"id": 3, "text": "Third line", "label": 1}
]

with open('data.jsonl', 'w', encoding='utf-8') as f:
    for item in data:
        f.write(json.dumps(item, ensure_ascii=False) + '\n')

# Read JSONL lazily, one record at a time
def read_jsonl(filepath: str):
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            yield json.loads(line.strip())

# Usage
for record in read_jsonl('data.jsonl'):
    print(record)
```
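pandas reads this format directly as well; passing `lines=True` to `read_json` loads a JSONL file into a DataFrame:

```python
import pandas as pd

# lines=True tells pandas to parse one JSON object per line
df = pd.read_json('data.jsonl', lines=True)
print(df.head())
```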

### Streaming with JSONL

```python
import json
from typing import Iterator, Dict

class JSONLProcessor:
    def __init__(self, input_path: str, output_path: str):
        self.input_path = input_path
        self.output_path = output_path

    def process_stream(self, transform_fn):
        """Process a JSONL file in streaming mode."""
        with open(self.input_path, 'r') as infile, \
             open(self.output_path, 'w') as outfile:
            for line in infile:
                record = json.loads(line.strip())
                transformed = transform_fn(record)
                outfile.write(json.dumps(transformed) + '\n')

    def filter_stream(self, filter_fn) -> Iterator[Dict]:
        """Filter a JSONL file in streaming mode."""
        with open(self.input_path, 'r') as f:
            for line in f:
                record = json.loads(line.strip())
                if filter_fn(record):
                    yield record

# Example
processor = JSONLProcessor('input.jsonl', 'output.jsonl')

# Apply a transformation
def add_processed_flag(record):
    record['processed'] = True
    return record

processor.process_stream(add_processed_flag)

# Filter
filtered = processor.filter_stream(lambda r: r.get('label') == 1)
for record in filtered:
    print(record)
```

## Best Practices

### 1. Efficient Serialization

```python
import json
import orjson  # faster third-party alternative

# Standard library JSON
data = {"large": "dataset" * 1000}
json_str = json.dumps(data)

# orjson (faster; returns bytes rather than str)
orjson_bytes = orjson.dumps(data)
orjson_str = orjson_bytes.decode('utf-8')
```
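To verify the speedup on your own data, a quick `timeit` comparison works (the numbers depend on payload and machine):

```python
import json
import timeit

import orjson

payload = {"values": list(range(10_000)), "label": "benchmark"}

t_json = timeit.timeit(lambda: json.dumps(payload), number=1_000)
t_orjson = timeit.timeit(lambda: orjson.dumps(payload), number=1_000)
print(f"json:   {t_json:.3f}s")
print(f"orjson: {t_orjson:.3f}s")
```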

### 2. Schema Validation

```python
from jsonschema import validate, ValidationError

schema = {
    "type": "object",
    "properties": {
        "model_name": {"type": "string"},
        "accuracy": {"type": "number", "minimum": 0, "maximum": 1},
        "epochs": {"type": "integer", "minimum": 1}
    },
    "required": ["model_name", "accuracy"]
}

data = {
    "model_name": "ResNet50",
    "accuracy": 0.95,
    "epochs": 100
}

try:
    validate(instance=data, schema=schema)
    print("Validation successful")
except ValidationError as e:
    print(f"Validation error: {e.message}")
```

### 3. Compression for Large Files

```python
import json
import gzip

# Write with compression
data = {"large": "dataset"}
with gzip.open('data.json.gz', 'wt', encoding='utf-8') as f:
    json.dump(data, f)

# Read with compression
with gzip.open('data.json.gz', 'rt', encoding='utf-8') as f:
    loaded_data = json.load(f)
```

## Summary

JSON is indispensable in data science for:

  • Configuration management - hyperparameters and experiment setups
  • Data exchange - APIs and microservices
  • Metadata - dataset descriptions and annotations
  • Result storage - training logs and metrics
  • Feature engineering - feature store definitions

With the right tools and best practices, JSON is a powerful format for data science workflows.
