Skip to content

Instantly share code, notes, and snippets.

@vinicius-oa
Created December 10, 2025 13:53
Show Gist options
  • Select an option

  • Save vinicius-oa/c71f1843532b5518cb9002795993e1a6 to your computer and use it in GitHub Desktop.

Select an option

Save vinicius-oa/c71f1843532b5518cb9002795993e1a6 to your computer and use it in GitHub Desktop.
Compare two JSON files
import json
import sys
import hashlib
from typing import Any, List, Tuple
def load_json_file(filepath):
    """Open ``filepath`` as UTF-8 text and return the JSON value parsed from it."""
    with open(filepath, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed
def sort_json_recursively(obj):
    """
    Return a canonically ordered copy of a parsed JSON value.

    Dictionaries are rebuilt with their items in sorted order. Lists have
    their elements canonicalised first and are then reordered by each
    element's JSON text, so arrays of objects end up in a consistent order.
    Any other value is returned unchanged.
    """
    if isinstance(obj, list):
        canonical_items = [sort_json_recursively(element) for element in obj]
        try:
            return sorted(
                canonical_items,
                key=lambda element: json.dumps(element, sort_keys=True),
            )
        except (TypeError, KeyError):
            # An element could not be serialised for the sort key:
            # keep the (already canonicalised) elements in their given order.
            return canonical_items

    if isinstance(obj, dict):
        ordered = {}
        for key, value in sorted(obj.items()):
            ordered[key] = sort_json_recursively(value)
        return ordered

    return obj
def normalize_json(data):
    """Return compact JSON text for ``data`` after sorting it recursively."""
    canonical = sort_json_recursively(data)
    compact_separators = (',', ':')
    return json.dumps(canonical, sort_keys=True, separators=compact_separators)
def hash_json(data):
    """Return the SHA-256 hex digest of the normalized JSON text of ``data``."""
    payload = normalize_json(data).encode()
    digest = hashlib.sha256()
    digest.update(payload)
    return digest.hexdigest()
def find_differences(obj1, obj2, path="root"):
    """
    Walk two parsed JSON values in parallel and collect their differences.

    Returns a list of dicts. Every entry carries 'path' (location, starting
    at ``path``) and 'type' (one of 'value_diff', 'type_diff',
    'missing_in_first', 'missing_in_second', 'list_length_diff',
    'extra_in_first', 'extra_in_second') plus the values, type names or
    lengths relevant to that kind. An empty list means no difference.
    """
    # None on either side is settled here, before any type comparison.
    if obj1 is None or obj2 is None:
        if obj1 is None and obj2 is None:
            return []
        return [{
            'path': path,
            'type': 'value_diff',
            'value1': obj1,
            'value2': obj2,
        }]

    # Exact type comparison: mismatched types are reported without descending.
    if type(obj1) != type(obj2):
        return [{
            'path': path,
            'type': 'type_diff',
            'type1': type(obj1).__name__,
            'type2': type(obj2).__name__,
            'value1': obj1,
            'value2': obj2,
        }]

    if isinstance(obj1, dict):
        found = []
        for key in sorted(set(obj1) | set(obj2)):
            child_path = f"{path}.{key}"
            in_first = key in obj1
            in_second = key in obj2
            if in_first and in_second:
                found.extend(find_differences(obj1[key], obj2[key], child_path))
            elif in_second:
                found.append({
                    'path': child_path,
                    'type': 'missing_in_first',
                    'value2': obj2[key],
                })
            else:
                found.append({
                    'path': child_path,
                    'type': 'missing_in_second',
                    'value1': obj1[key],
                })
        return found

    if isinstance(obj1, list):
        found = []
        size1, size2 = len(obj1), len(obj2)
        if size1 != size2:
            found.append({
                'path': path,
                'type': 'list_length_diff',
                'length1': size1,
                'length2': size2,
            })
        # Pairwise comparison over the common prefix.
        for index, (left, right) in enumerate(zip(obj1, obj2)):
            found.extend(find_differences(left, right, f"{path}[{index}]"))
        # At most one of these two ranges is non-empty.
        for index in range(size2, size1):
            found.append({
                'path': f"{path}[{index}]",
                'type': 'extra_in_first',
                'value1': obj1[index],
            })
        for index in range(size1, size2):
            found.append({
                'path': f"{path}[{index}]",
                'type': 'extra_in_second',
                'value2': obj2[index],
            })
        return found

    # Same-typed primitives.
    if obj1 != obj2:
        return [{
            'path': path,
            'type': 'value_diff',
            'value1': obj1,
            'value2': obj2,
        }]
    return []
def format_value(value, max_length=100):
    """
    Render ``value`` as display text.

    Dicts and lists are shown as JSON (non-ASCII characters kept as-is);
    anything else goes through ``str``. Text longer than ``max_length``
    characters is cut to that length and suffixed with "...".
    """
    is_container = isinstance(value, (dict, list))
    text = json.dumps(value, ensure_ascii=False) if is_container else str(value)
    return text if len(text) <= max_length else text[:max_length] + "..."
def print_differences(differences, file1_name, file2_name):
    """
    Print a human-readable report of ``differences`` to standard output.

    ``differences`` is a list of dicts with 'path' and 'type' keys plus the
    per-type fields read below; ``file1_name`` and ``file2_name`` label the
    two sides. Nothing is printed when the list is empty. An entry whose
    type is not recognised gets only its number and location.
    """
    if not differences:
        return

    rule = '=' * 80
    print(f"\n{rule}")
    print(f"Found {len(differences)} difference(s):")
    print(f"{rule}\n")

    for number, diff in enumerate(differences, start=1):
        print(f"Difference #{number}:")
        print(f" Location: {diff['path']}")

        kind = diff['type']
        if kind == 'value_diff':
            print(" Type: Value differs")
            print(f" {file1_name}: {format_value(diff['value1'])}")
            print(f" {file2_name}: {format_value(diff['value2'])}")
        elif kind == 'type_diff':
            print(" Type: Data type differs")
            print(f" {file1_name}: {diff['type1']} = {format_value(diff['value1'])}")
            print(f" {file2_name}: {diff['type2']} = {format_value(diff['value2'])}")
        elif kind == 'missing_in_first':
            print(f" Type: Key/element missing in {file1_name}")
            print(f" {file2_name}: {format_value(diff['value2'])}")
        elif kind == 'missing_in_second':
            print(f" Type: Key/element missing in {file2_name}")
            print(f" {file1_name}: {format_value(diff['value1'])}")
        elif kind == 'list_length_diff':
            print(" Type: List length differs")
            print(f" {file1_name}: {diff['length1']} elements")
            print(f" {file2_name}: {diff['length2']} elements")
        elif kind == 'extra_in_first':
            print(f" Type: Extra element in {file1_name}")
            print(f" Value: {format_value(diff['value1'])}")
        elif kind == 'extra_in_second':
            print(f" Type: Extra element in {file2_name}")
            print(f" Value: {format_value(diff['value2'])}")

        # Blank line separating consecutive entries.
        print()
def compare_json_files(file1, file2, save_normalized=False, show_diff=True):
    """
    Compare two JSON files after order-insensitive normalization.

    Both files are loaded, normalized (dictionary keys and list elements
    sorted recursively) and hashed; progress and results are printed to
    standard output.

    Args:
        file1: Path of the first JSON file.
        file2: Path of the second JSON file.
        save_normalized: When true, write the normalized form of each input
            next to it as ``<path>.normalized.json``.
        show_diff: When true and the files differ, print a detailed list of
            differences computed on the normalized data.

    Returns:
        True when the normalized hashes match, False otherwise.

    Errors raised while opening or parsing the inputs are not handled here
    and propagate to the caller.
    """
    print(f"Loading {file1}...")
    data1 = load_json_file(file1)
    print(f"Loading {file2}...")
    data2 = load_json_file(file2)

    # Normalized copies are needed by the export and by the detailed diff;
    # build them at most once and share them between the two steps.
    normalized = None

    if save_normalized:
        print("\nSaving normalized versions...")
        normalized = (sort_json_recursively(data1), sort_json_recursively(data2))
        for source, content in zip((file1, file2), normalized):
            with open(f"{source}.normalized.json", 'w', encoding='utf-8') as f:
                json.dump(content, f, indent=2, sort_keys=True, ensure_ascii=False)
        print(f"Saved: {file1}.normalized.json")
        print(f"Saved: {file2}.normalized.json")
        print("\n💡 Tip: You can diff these normalized files with a tool like 'diff' or 'meld':")
        print(f" diff {file1}.normalized.json {file2}.normalized.json")

    # Calculate hashes
    hash1 = hash_json(data1)
    hash2 = hash_json(data2)
    print(f"\nFile 1 hash: {hash1}")
    print(f"File 2 hash: {hash2}")

    # Compare
    if hash1 == hash2:
        print("\n✅ JSON files are identical (after normalization)!")
        return True

    print("\n❌ JSON files differ!")
    if show_diff:
        # Find and display differences on normalized data
        print("\n🔍 Analyzing differences (on normalized/sorted data)...")
        if normalized is None:
            normalized = (sort_json_recursively(data1), sort_json_recursively(data2))
        differences = find_differences(*normalized)
        if differences:
            print_differences(differences, file1, file2)
        else:
            # Reached when the serialized text differs although every value
            # compares equal with ==, e.g. 0.0 against -0.0. A hash collision
            # would make the hashes equal, so it cannot lead here.
            print("⚠️ No structural differences found after normalization.")
            print(" (Hashes differ but all values compare equal - e.g. 0.0 vs -0.0 serialize differently)")
    return False
if __name__ == "__main__":
    # Command-line entry point: two positional file paths followed by
    # optional flags. Exits with status 1 on bad usage or when an input
    # cannot be read or parsed.
    if len(sys.argv) < 3:
        print("Usage: python compare_json.py <file1.json> <file2.json> [--save-normalized] [--no-diff]")
        print("\nOptions:")
        print(" --save-normalized Save normalized versions of both files")
        print(" --no-diff Don't show detailed differences")
        sys.exit(1)

    file1 = sys.argv[1]
    file2 = sys.argv[2]
    # Flags are detected anywhere on the command line.
    save_normalized = "--save-normalized" in sys.argv
    show_diff = "--no-diff" not in sys.argv

    try:
        compare_json_files(file1, file2, save_normalized, show_diff)
    except json.JSONDecodeError as e:
        print(f"❌ Error: Invalid JSON - {e}")
        sys.exit(1)
    except OSError as e:
        # OSError covers FileNotFoundError as well as other read failures
        # (permission denied, path is a directory), which would otherwise
        # end the script with a traceback.
        print(f"❌ Error: {e}")
        sys.exit(1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment