# 데이터 과학에서의 JSON: 실용 가이드와 분석 기법

JSON은 데이터 과학에서 널리 사용되는 데이터 형식입니다. API 데이터, 로그 파일, NoSQL 데이터베이스 등에서 JSON을 다루는 방법을 배워보세요.

왜 데이터 과학에서 JSON을 사용하나요?

JSON의 장점

API 통합: 대부분의 웹 API가 JSON 반환

유연성: 중첩 구조와 다양한 타입 지원

NoSQL: MongoDB 등에서 기본 포맷

로그: 구조화된 로그 형식

교환: 플랫폼 간 데이터 교환

일반적인 사용 사례

소셜 미디어 API 데이터
IoT 센서 데이터
웹 스크래핑 결과
설정 및 메타데이터
ML 모델 입출력

Python과 JSON

JSON 읽기

import json

# Load a JSON document from disk; the file is opened explicitly as UTF-8.
with open('data.json', mode='r', encoding='utf-8') as source:
    data = json.load(source)

# Parse JSON held in a string (this rebinds `data`, replacing what was
# loaded from the file above).
json_string = '{"name": "홍길동", "age": 30}'
data = json.loads(json_string)

print(data['name'])  # prints: 홍길동

JSON 쓰기

import json

# Sample record made only of JSON-serialisable values.
data = dict(name="홍길동", age=30, scores=[95, 87, 92])

# Persist to disk. ensure_ascii=False writes the Hangul characters as-is
# instead of \uXXXX escapes, so the file is opened as UTF-8.
with open('output.json', mode='w', encoding='utf-8') as sink:
    json.dump(data, sink, ensure_ascii=False, indent=2)

# Serialise to an in-memory string with the same formatting options.
json_string = json.dumps(data, ensure_ascii=False, indent=2)
print(json_string)

Pandas로 JSON 다루기

JSON을 DataFrame으로

from io import StringIO

import pandas as pd

# Read a JSON file straight into a DataFrame.
df = pd.read_json('data.json')

# Parse JSON text held in a string. Recent pandas versions deprecate passing
# literal JSON directly to read_json, so the text is wrapped in a file-like
# StringIO object.
json_string = '''
[
  {"name": "홍길동", "age": 30, "city": "서울"},
  {"name": "김철수", "age": 25, "city": "부산"}
]
'''
df = pd.read_json(StringIO(json_string))

print(df)
#     name  age city
# 0  홍길동   30   서울
# 1  김철수   25   부산

중첩된 JSON 처리

import pandas as pd
from pandas import json_normalize

# Two records, each with a nested "address" object and a "scores" array.
data = [
    {
        "name": "홍길동",
        "age": 30,
        "address": {
            "city": "서울",
            "zipCode": "12345"
        },
        "scores": [95, 87, 92]
    },
    {
        "name": "김철수",
        "age": 25,
        "address": {
            "city": "부산",
            "zipCode": "54321"
        },
        "scores": [88, 91, 85]
    }
]

# Flatten: nested objects become dotted column names, arrays stay as lists.
df = pd.json_normalize(data)
print(df)
#     name  age address.city address.zipCode        scores
# 0  홍길동   30           서울           12345  [95, 87, 92]
# 1  김철수   25           부산           54321  [88, 91, 85]

# Explode the "scores" array into one row per score, carrying the listed
# parent fields along as metadata columns. A nested field is addressed by
# its key path (['address', 'city'] -> column "address.city").
df = pd.json_normalize(
    data,
    record_path=['scores'],
    meta=['name', 'age', ['address', 'city']],
)

DataFrame을 JSON으로

import pandas as pd

# Small two-row sample frame.
df = pd.DataFrame(dict(
    name=['홍길동', '김철수'],
    age=[30, 25],
    city=['서울', '부산'],
))

# Serialise to a JSON string, one object per row. force_ascii=False keeps
# the Hangul text unescaped.
json_string = df.to_json(force_ascii=False, orient='records')
print(json_string)
# [{"name":"홍길동","age":30,"city":"서울"},{"name":"김철수","age":25,"city":"부산"}]

# Write the same records to a file, pretty-printed.
df.to_json('output.json', force_ascii=False, orient='records', indent=2)

# Available `orient` layouts:
#   'records' -> [{column: value, ...}, {...}]
#   'split'   -> {"columns": [...], "index": [...], "data": [...]}
#   'index'   -> {index: {column: value}}
#   'columns' -> {column: {index: value}}
#   'values'  -> [[...], [...]]

API 데이터 분석

REST API에서 데이터 가져오기

import requests
import pandas as pd

# Call the API. The timeout keeps the script from hanging on an unresponsive
# server, and raise_for_status() turns 4xx/5xx responses into exceptions
# instead of letting an error payload be parsed as data.
response = requests.get('https://api.example.com/users', timeout=10)
response.raise_for_status()
data = response.json()

# Convert to a DataFrame
# (assumes the endpoint returns a list of flat objects — TODO confirm).
df = pd.DataFrame(data)

# Basic exploration
print(df.describe())
df.info()  # info() prints its own report and returns None, so it is not wrapped in print()
print(df.head())

페이지네이션 처리

import requests
import pandas as pd

def fetch_all_pages(base_url, timeout=10):
    """Collect every page of a paginated JSON endpoint into one DataFrame.

    Pages are requested with a 1-based ``page`` query parameter. Each
    response is expected to be an object with an ``items`` list and,
    optionally, a boolean ``hasMore`` flag.

    Args:
        base_url: Endpoint URL. It may already contain a query string; the
            ``page`` parameter is appended by requests rather than by string
            concatenation.
        timeout: Per-request timeout in seconds.

    Returns:
        A DataFrame built from the concatenated ``items`` of all pages.

    Raises:
        requests.exceptions.RequestException: On connection problems,
            timeouts, or non-2xx responses.
        KeyError: If a response has no ``items`` key.
    """
    all_data = []
    page = 1

    while True:
        response = requests.get(base_url, params={'page': page}, timeout=timeout)
        response.raise_for_status()
        data = response.json()

        items = data['items']
        if not items:
            break

        all_data.extend(items)

        # Stop when the server says there is nothing more. If the flag is
        # absent, keep going until an empty page is returned.
        if not data.get('hasMore', True):
            break

        page += 1

    return pd.DataFrame(all_data)


df = fetch_all_pages('https://api.example.com/users')

에러 처리

import requests
import pandas as pd
import time

def safe_api_call(url, max_retries=3):
    """GET ``url`` and return the decoded JSON body, retrying on failure.

    Args:
        url: Endpoint to request.
        max_retries: Maximum number of attempts.

    Returns:
        The parsed JSON response, or None when ``max_retries`` is zero or
        negative (no attempt is made).

    Raises:
        requests.exceptions.RequestException: Re-raised from the final
            attempt when every attempt has failed.
    """
    for attempt in range(max_retries):
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            print(f'시도 {attempt + 1} 실패: {e}')
            if attempt == max_retries - 1:
                # Out of attempts: let the caller see the last error.
                raise
            time.sleep(2 ** attempt)  # exponential backoff: 1s, 2s, 4s, ...

    return None


data = safe_api_call('https://api.example.com/data')
# Compare against None explicitly so a valid but empty response ([] or {})
# still produces an (empty) DataFrame.
if data is not None:
    df = pd.DataFrame(data)

복잡한 JSON 구조 분석

깊게 중첩된 데이터

import pandas as pd
from pandas import json_normalize

# Deeply nested JSON: company -> departments -> employees
data = {
    "company": {
        "name": "테크회사",
        "departments": [
            {
                "name": "개발팀",
                "employees": [
                    {
                        "name": "홍길동",
                        "role": "개발자",
                        "projects": ["프로젝트A", "프로젝트B"]
                    },
                    {
                        "name": "김철수",
                        "role": "디자이너",
                        "projects": ["프로젝트C"]
                    }
                ]
            }
        ]
    }
}

# Extract one row per employee. record_path is the key path down to the
# employee list; each meta entry is a key path to a parent-level field that
# is repeated on every row (columns "company.name" and
# "company.departments.name").
df = json_normalize(
    data,
    record_path=['company', 'departments', 'employees'],
    meta=[
        ['company', 'name'],
        ['company', 'departments', 'name']
    ]
)
print(df)

JSON Lines (JSONL) 처리

import json

import pandas as pd

# Read a JSONL file (each line is one JSON object)
df = pd.read_json('data.jsonl', lines=True)

# Or parse it manually, line by line
data = []
with open('data.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        if line.strip():  # json.loads would fail on a blank line
            data.append(json.loads(line))

df = pd.DataFrame(data)

# Save as JSONL
df.to_json('output.jsonl', orient='records', lines=True, force_ascii=False)

데이터 변환과 정제

타입 변환

import pandas as pd

df = pd.read_json('data.json')

# Parse the date column into datetime values
df['date'] = pd.to_datetime(df['date'])

# Convert to a categorical dtype
df['category'] = df['category'].astype('category')

# Convert to numeric; errors='coerce' turns unparseable values into NaN
df['value'] = pd.to_numeric(df['value'], errors='coerce')
print(df.dtypes)

결측값 처리

# Count missing values per column
print(df.isnull().sum())

# Replace missing values (NaN) with None
# NOTE(review): on numeric columns pandas may keep NaN here instead of None —
# confirm against the pandas version in use.
df = df.where(pd.notnull(df), None)

# Fill missing ages with the column mean. Assign the result back rather than
# calling fillna(inplace=True) on the selected column: the chained in-place
# form is deprecated and does not update df under copy-on-write.
df['age'] = df['age'].fillna(df['age'].mean())

# Drop rows that have no email
df.dropna(subset=['email'], inplace=True)

데이터 정규화

from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Min-max normalization
scaler = MinMaxScaler()
df['score_normalized'] = scaler.fit_transform(df[['score']])

# Standardization
scaler = StandardScaler()
df['score_standardized'] = scaler.fit_transform(df[['score']])

시계열 데이터

JSON 시계열 분석

import pandas as pd

# Time-series JSON records
data = [
    {"timestamp": "2026-01-01T00:00:00Z", "value": 100},
    {"timestamp": "2026-01-01T01:00:00Z", "value": 105},
    {"timestamp": "2026-01-01T02:00:00Z", "value": 98}
]

df = pd.DataFrame(data)
df['timestamp'] = pd.to_datetime(df['timestamp'])
df.set_index('timestamp', inplace=True)

# Resampling. The lowercase 'h' alias is used because the uppercase 'H'
# alias is deprecated in recent pandas releases.
hourly = df.resample('h').mean()
daily = df.resample('D').sum()

# Moving average over a 3-row window (the first two rows are NaN)
df['moving_avg'] = df['value'].rolling(window=3).mean()
print(df)

IoT 센서 데이터

import pandas as pd
import matplotlib.pyplot as plt

# IoT readings as JSON-style records
sensors = [
    {
        "sensor_id": "temp_01",
        "timestamp": "2026-01-16T10:00:00Z",
        "temperature": 22.5,
        "humidity": 45
    },
    # ... more readings
]

df = pd.DataFrame(sensors)
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Group by sensor
grouped = df.groupby('sensor_id')

# Per-sensor summary statistics
stats = grouped.agg({
    'temperature': ['mean', 'min', 'max', 'std'],
    'humidity': ['mean', 'min', 'max']
})
print(stats)

# Plot temperature over time
df.set_index('timestamp')['temperature'].plot()
plt.title('온도 변화')
plt.show()

대용량 JSON 처리

청크 단위 읽기

import pandas as pd

# Large JSONL file: read it in fixed-size chunks instead of all at once
chunk_size = 1000
chunks = []

# With chunksize set, read_json returns a reader that yields DataFrames.
# Using it as a context manager closes the underlying file when done.
with pd.read_json('large_data.jsonl', lines=True, chunksize=chunk_size) as reader:
    for chunk in reader:
        # Process each chunk: keep only the rows of interest
        processed = chunk[chunk['value'] > 100]
        chunks.append(processed)

# Combine the filtered chunks
df = pd.concat(chunks, ignore_index=True)

메모리 효율적 처리

import json

import pandas as pd


def process_large_json(filename):
    """Stream a JSONL file and keep only the records whose value exceeds 100.

    The file is read one line at a time, and only the ``id`` and ``value``
    fields of matching records are retained, so the whole file is never held
    in memory at once.

    Args:
        filename: Path to a UTF-8 JSONL file (one JSON object per line).

    Returns:
        A DataFrame with ``id`` and ``value`` columns for the matching
        records (an empty DataFrame if none match).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``value``, or a matching record lacks
            ``id``.
    """
    results = []

    with open(filename, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i % 10000 == 0:
                print(f'처리 중: {i}줄')

            if not line.strip():
                continue  # skip blank lines instead of failing in json.loads

            item = json.loads(line)

            # Extract only the fields that are needed
            if item['value'] > 100:
                results.append({
                    'id': item['id'],
                    'value': item['value']
                })

    return pd.DataFrame(results)

# Run the streaming filter on the large JSONL file and keep the result.
df = process_large_json('large_data.jsonl')

Dask 사용 (병렬 처리)

import dask.dataframe as dd

# Load a large JSONL file with Dask
ddf = dd.read_json('large_data.jsonl', lines=True)

# Parallel computation: filter, then materialise the result with compute()
result = ddf[ddf['value'] > 100].compute()

# Aggregation
stats = ddf.groupby('category')['value'].mean().compute()

머신러닝 파이프라인

피처 추출

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# JSON records that contain text
data = [
    {"id": 1, "text": "머신러닝을 공부합니다", "label": "tech"},
    {"id": 2, "text": "요리를 좋아합니다", "label": "hobby"}
]

df = pd.DataFrame(data)

# TF-IDF features
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['text'])

# Convert to a DataFrame with one column per extracted term
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=vectorizer.get_feature_names_out()
)
print(tfidf_df)

모델 학습 데이터

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Load the training data from JSON
df = pd.read_json('training_data.json')

# Separate features and target
X = df.drop('label', axis=1)
y = df['label']

# Train/test split (20% held out; fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train the model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Evaluate
score = model.score(X_test, y_test)
print(f'정확도: {score:.2f}')

모델 결과 저장

import json
import numpy as np

# Predictions
predictions = model.predict(X_test)

# Build a JSON-serialisable summary. NumPy scalars and arrays are converted
# to plain Python floats and lists because json cannot serialise them.
results = {
    "model": "RandomForest",
    "accuracy": float(score),
    "predictions": predictions.tolist(),
    "feature_importance": {
        feature: float(importance)
        for feature, importance in zip(X.columns, model.feature_importances_)
    }
}

# Write as UTF-8 with ensure_ascii=False so non-ASCII feature names or labels
# stay readable in the output file.
with open('results.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2, ensure_ascii=False)

데이터 시각화

JSON 데이터 시각화

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the JSON data
df = pd.read_json('data.json')

# Histogram
plt.figure(figsize=(10, 6))
df['value'].hist(bins=30)
plt.title('값 분포')
plt.xlabel('값')
plt.ylabel('빈도')
plt.show()

# Box plot
plt.figure(figsize=(10, 6))
df.boxplot(column='value', by='category')
plt.title('카테고리별 값 분포')
plt.show()

# Correlation heatmap. numeric_only=True restricts the correlation to numeric
# columns; without it, recent pandas versions raise on non-numeric columns
# such as 'category'.
plt.figure(figsize=(12, 8))
sns.heatmap(df.corr(numeric_only=True), annot=True, cmap='coolwarm')
plt.title('상관관계 히트맵')
plt.show()

Plotly로 인터랙티브 차트

import pandas as pd
import plotly.express as px

df = pd.read_json('data.json')

# Interactive scatter plot
fig = px.scatter(
    df,
    x='feature1',
    y='feature2',
    color='category',
    size='value',
    hover_data=['name']
)
fig.show()

# Time-series line chart
fig = px.line(df, x='date', y='value', title='시간별 변화')
fig.show()

NoSQL 데이터베이스

MongoDB

from pymongo import MongoClient
import pandas as pd

# Connect to MongoDB
client = MongoClient('mongodb://localhost:27017/')
db = client['mydb']
collection = db['users']

# Insert a JSON-style document
user = {
    "name": "홍길동",
    "age": 30,
    "tags": ["python", "data"]
}
collection.insert_one(user)

# Query (age >= 25) and convert the matching documents to a DataFrame
cursor = collection.find({"age": {"$gte": 25}})
df = pd.DataFrame(list(cursor))
print(df)

Elasticsearch

from elasticsearch import Elasticsearch
import pandas as pd

# Connect to Elasticsearch
es = Elasticsearch(['http://localhost:9200'])

# Query
query = {
    "query": {
        "match": {
            "category": "tech"
        }
    }
}

# NOTE(review): newer elasticsearch-py releases may prefer dedicated keyword
# arguments over `body=` — confirm against the installed client version.
result = es.search(index="articles", body=query)

# Convert the hits to a DataFrame
hits = result['hits']['hits']
df = pd.DataFrame([hit['_source'] for hit in hits])
print(df)

모범 사례

1. 데이터 검증

import pandas as pd
from jsonschema import validate, ValidationError

# Expected shape: an array of objects, each with a string "name" and a
# non-negative numeric "age".
schema = {
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "name": {"type": "string"},
            "age": {"type": "number", "minimum": 0}
        },
        "required": ["name", "age"]
    }
}

# Validate first; build the DataFrame only when validation succeeds
try:
    validate(instance=data, schema=schema)
except ValidationError as e:
    print(f'검증 실패: {e.message}')
else:
    df = pd.DataFrame(data)

2. 에러 처리

import pandas as pd
import json


def safe_read_json(filename):
    """Read a JSON file into a DataFrame, falling back to JSON Lines.

    The file is first parsed as a regular JSON document. If that fails with
    a parsing error, it is retried as JSON Lines (one object per line).

    Args:
        filename: Path of the file to read.

    Returns:
        The parsed DataFrame, or None if the file is neither valid JSON nor
        valid JSON Lines.
    """
    try:
        return pd.read_json(filename)
    except ValueError as e:
        print(f'JSON 파싱 오류: {e}')

    # Retry as line-delimited JSON. Only parsing errors are swallowed, so
    # unrelated problems (e.g. KeyboardInterrupt) are no longer hidden the
    # way a bare `except:` would hide them.
    try:
        return pd.read_json(filename, lines=True)
    except ValueError:
        return None

3. 성능 최적화

# Specify dtypes up front to reduce memory use
df = pd.read_json(
    'data.json',
    dtype={
        'id': 'int32',
        'category': 'category',
        'value': 'float32'
    }
)

# Keep only the columns that are needed (where possible)
df = df[['id', 'name', 'value']]

결론

JSON은 데이터 과학 워크플로우의 핵심입니다. Pandas, NumPy, scikit-learn과 함께 사용하면 강력한 데이터 분석 파이프라인을 구축할 수 있습니다.

핵심 요약:

✅ Pandas로 JSON을 DataFrame으로 변환
✅ json_normalize로 중첩 데이터 평탄화
✅ API 데이터 효율적 처리
✅ 대용량 데이터는 청크 단위 처리
✅ 적절한 검증과 에러 처리

지금 바로 JSON Simplify에서 JSON 데이터를 분석해보세요!

Big JSON Team

왜 데이터 과학에서 JSON을 사용하나요?

JSON의 장점

일반적인 사용 사례

Python과 JSON

JSON 읽기

JSON 쓰기

Pandas로 JSON 다루기

JSON을 DataFrame으로

중첩된 JSON 처리

DataFrame을 JSON으로

API 데이터 분석

REST API에서 데이터 가져오기

페이지네이션 처리

에러 처리

복잡한 JSON 구조 분석

깊게 중첩된 데이터

JSON Lines (JSONL) 처리

데이터 변환과 정제

타입 변환

결측값 처리

데이터 정규화

시계열 데이터

JSON 시계열 분석

IoT 센서 데이터

대용량 JSON 처리

청크 단위 읽기

메모리 효율적 처리

Dask 사용 (병렬 처리)

머신러닝 파이프라인

피처 추출

모델 학습 데이터

모델 결과 저장

데이터 시각화

JSON 데이터 시각화

Plotly로 인터랙티브 차트

NoSQL 데이터베이스

MongoDB

Elasticsearch

모범 사례

1. 데이터 검증

2. 에러 처리

3. 성능 최적화

결론

관련 글

Python과 JSON: 완벽한 가이드 2026

JSON을 Excel로 변환: 완벽한 가이드 2026

대용량 JSON 다루기: 성능 최적화와 실용 기법

Read in English