← Back to Blog

JSON in Data Science: Python and Pandas Guide

Complete guide to JSON in data science workflows. Learn to process JSON with Python, Pandas, and integrate into ML pipelines.

Michael Rodriguez · 13 min read · programming
M

Michael Rodriguez

API & Security Engineer

Michael is an API engineer and security specialist with over 7 years of experience building RESTful services, data conversion pipelines, and authentication systems. He writes practical guides on JSON Web Tokens, API debugging strategies, data science applications of JSON, and modern AI tooling workflows including MCP and JSON-RPC.

REST APIsJWT & SecurityData ScienceJSON PathMCP / AI ToolingAPI Debugging
13 min read

JSON in Data Science

JSON is ubiquitous in data science for API data, NoSQL databases, and configuration files.

Loading JSON with Pandas

import pandas as pd

# Simple JSON file

df = pd.read_json('data.json')

# JSON Lines format

df = pd.read_json('data.jsonl', lines=True)

# From string

df = pd.read_json(json_string)

Handling Nested JSON

import pandas as pd

# Nested data

data = {

"users": [

{"name": "Alice", "address": {"city": "NYC"}},

{"name": "Bob", "address": {"city": "LA"}}

]

}

# Normalize nested structure

df = pd.json_normalize(data['users'])

# Columns: name, address.city

Loading from APIs

import requests

import pandas as pd

response = requests.get('https://api.example.com/data')

data = response.json()

df = pd.DataFrame(data)

Handling Missing Data

data = [

{"name": "Alice", "age": 30, "email": "alice@example.com"},

{"name": "Bob", "age": 25}, # No email

]

df = pd.DataFrame(data)

df['email'] = df['email'].fillna('unknown')

JSON Lines for Large Files

# Process in chunks

chunks = pd.read_json('large.jsonl', lines=True, chunksize=10000)

for chunk in chunks:

process(chunk)

Streaming with ijson

import ijson

def process_large_json(filename):

with open(filename, 'rb') as f:

for item in ijson.items(f, 'items.item'):

yield item

for record in process_large_json('data.json'):

print(record)

Data Science Workflows

API to Analysis Pipeline

import requests

import pandas as pd

import matplotlib.pyplot as plt

# 1. Fetch data from API

response = requests.get('https://api.github.com/repos/python/cpython/issues?state=closed&per_page=100')

issues = response.json()

# 2. Normalize nested JSON

df = pd.json_normalize(

issues,

meta=['id', 'number', 'title', 'state', 'created_at', 'closed_at',

['user', 'login'], ['user', 'type']],

errors='ignore'

)

# 3. Data cleaning

df['created_at'] = pd.to_datetime(df['created_at'])

df['closed_at'] = pd.to_datetime(df['closed_at'])

df['resolution_time'] = (df['closed_at'] - df['created_at']).dt.total_seconds() / 3600 # hours

# 4. Analysis

avg_resolution = df['resolution_time'].mean()

print(f"Average resolution time: {avg_resolution:.1f} hours")

# 5. Visualization

df['resolution_time'].hist(bins=50)

plt.xlabel('Resolution Time (hours)')

plt.ylabel('Count')

plt.savefig('issue_resolution.png')

Real-World Data Science Scenarios

Social Media Sentiment Analysis

import pandas as pd

from textblob import TextBlob

# Load tweets from JSON

df = pd.read_json('tweets.json', lines=True)

# Extract sentiment

df['sentiment'] = df['text'].apply(lambda x: TextBlob(x).sentiment.polarity)

df['sentiment_category'] = pd.cut(

df['sentiment'],

bins=[-1, -0.1, 0.1, 1],

labels=['negative', 'neutral', 'positive']

)

# Analyze by hashtag

sentiment_by_tag = df.groupby('hashtags')['sentiment'].mean().sort_values(ascending=False)

print(sentiment_by_tag.head(10))

E-commerce Analytics

import pandas as pd

import json

# Load orders from Shopify API export

with open('orders.json') as f:

orders = json.load(f)

# Flatten nested order structure

orders_df = pd.json_normalize(

orders,

record_path=['line_items'],

meta=['id', 'order_number', 'created_at', 'total_price',

'financial_status', 'fulfillment_status',

['customer', 'id'], ['customer', 'email'],

['shipping_address', 'country']],

meta_prefix='order_',

record_prefix='item_'

)

# Revenue analysis

orders_df['item_revenue'] = orders_df['item_price'].astype(float) * orders_df['item_quantity']

# Top products

top_products = orders_df.groupby('item_name').agg({

'item_quantity': 'sum',

'item_revenue': 'sum'

}).sort_values('item_revenue', ascending=False).head(20)

# Export for stakeholders

top_products.to_excel('top_products_2026.xlsx')

Data Analysis

# Basic analysis

df.info()

df.describe()

df['status'].value_counts()

# Group by

summary = df.groupby('category').agg({

'price': ['mean', 'sum', 'count'],

'rating': 'mean'

})

# Time series analysis

df['date'] = pd.to_datetime(df['timestamp'])

df.set_index('date', inplace=True)

monthly_sales = df.resample('M')['revenue'].sum()

Exporting Results

# To JSON

df.to_json('output.json', orient='records', indent=2)

# To Excel

df.to_excel('output.xlsx', index=False)

# To CSV

df.to_csv('output.csv', index=False)

# To Parquet (more efficient)

df.to_parquet('output.parquet')

ML Pipeline Integration

Feature Engineering from JSON

import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import StandardScaler

import json

# Load training data from API

df = pd.json_normalize(training_data_json)

# Feature engineering

df['account_age_days'] = (pd.Timestamp.now() - pd.to_datetime(df['created_at'])).dt.days

df['avg_transaction'] = df['total_spent'] / df['transaction_count']

# Encode categorical variables

df_encoded = pd.get_dummies(df, columns=['country', 'subscription_tier'])

# Split data

X = df_encoded.drop(['churn', 'user_id', 'email'], axis=1)

y = df_encoded['churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model

model = RandomForestClassifier(n_estimators=100)

model.fit(X_train, y_train)

# Save model metadata as JSON

metadata = {

"model_type": "RandomForestClassifier",

"features": list(X.columns),

"feature_count": len(X.columns),

"samples_train": len(X_train),

"samples_test": len(X_test),

"accuracy": float(model.score(X_test, y_test)),

"trained_at": pd.Timestamp.now().isoformat(),

"feature_importance": dict(zip(X.columns, model.feature_importances_.tolist()))

}

with open('model_metadata.json', 'w') as f:

json.dump(metadata, f, indent=2)

Production ML Serving

from flask import Flask, request, jsonify

import joblib

import pandas as pd

import json

app = Flask(__name__)

model = joblib.load('model.pkl')

# Load feature metadata

with open('model_metadata.json') as f:

metadata = json.load(f)

@app.route('/predict', methods=['POST'])

def predict():

# Receive JSON input

data = request.json

# Convert to DataFrame

df = pd.json_normalize(data)

# Validate features match training

if list(df.columns) != metadata['features']:

return jsonify({'error': 'Feature mismatch'}), 400

# Predict

predictions = model.predict(df).tolist()

probabilities = model.predict_proba(df).tolist()

# Return JSON response

return jsonify({

'predictions': predictions,

'probabilities': probabilities,

'model_version': metadata['trained_at']

})

Complex Transformations

Nested Arrays with Record Path

import pandas as pd

# Complex nested structure

data = {

"users": [

{

"id": 1,

"name": "Alice",

"orders": [

{"order_id": 101, "items": [{"sku": "A1", "qty": 2}, {"sku": "B2", "qty": 1}]},

{"order_id": 102, "items": [{"sku": "C3", "qty": 5}]}

]

}

]

}

# Extract all items from all orders from all users

df = pd.json_normalize(

data['users'],

record_path=['orders', 'items'],

meta=['id', 'name', ['orders', 'order_id']],

errors='ignore'

)

# Result: id, name, orders.order_id, sku, qty

JSON to Time Series

import pandas as pd

# Server metrics JSON

metrics = pd.read_json('server_metrics.jsonl', lines=True)

metrics['timestamp'] = pd.to_datetime(metrics['timestamp'])

metrics.set_index('timestamp', inplace=True)

# Resample to 5-minute intervals

metrics_5min = metrics.resample('5T').agg({

'cpu_percent': 'mean',

'memory_mb': 'mean',

'requests': 'sum'

})

# Detect anomalies

metrics_5min['cpu_anomaly'] = (

metrics_5min['cpu_percent'] > metrics_5min['cpu_percent'].mean() + 2 * metrics_5min['cpu_percent'].std()

)

Big Data with Dask

For datasets larger than RAM:

import dask.dataframe as dd

# Read large JSON files (100GB+)

ddf = dd.read_json('large_dataset/*.json', lines=True)

# Lazy evaluation - only computes when needed

result = ddf.groupby('category')['sales'].sum().compute()

# Parallel processing

ddf['processed'] = ddf['raw_value'].map(expensive_function)

ddf.to_parquet('output/', partition_on='date')

Best Practices

  • Validate JSON before processing with try/except or jsonschema
  • Use UTF-8 encoding to handle international characters
  • Handle missing data appropriately with .fillna() or .dropna()
  • Stream large files using lines=True and chunksize
  • Document data schemas for reproducibility
  • Use pd.json_normalize() for nested structures
  • Convert dates explicitly with pd.to_datetime()
  • Save intermediate results to avoid re-downloading
  • Use Parquet for efficient storage (10x smaller than JSON)
  • Profile memory usage with df.memory_usage(deep=True)
Performance Tips

    # Use orient='records' for better performance
    

    df.to_json('output.json', orient='records')

    # Compression for storage

    df.to_json('output.json.gz', compression='gzip') # 80% size reduction

    # Specify dtypes to reduce memory

    df = pd.read_json('data.json', dtype={'id': 'int32', 'value': 'float32'})

    # Use categorical for repeated strings

    df['category'] = df['category'].astype('category') # 90% memory reduction

    Performance Benchmarks

    | Dataset Size | Format | Load Time | Memory |

    |--------------|--------|-----------|--------|

    | 100MB | JSON | 3.2s | 450MB |

    | 100MB | JSON (gzip) | 8.7s | 450MB |

    | 100MB | Parquet | 0.4s | 120MB |

    | 1GB | JSON lines | 35s | 2.1GB |

    | 1GB | Parquet | 2.8s | 680MB |

    Conclusion

    JSON is essential in modern data science workflows for API integration, configuration, and data exchange. Master pd.json_normalize() for nested data, ijson for streaming large files, and Parquet for efficient storage. For production ML pipelines, save model metadata as JSON for reproducibility and version tracking.

    Share:

    Related Articles