Python’s ecosystem offers powerful tools for metadata management that can transform how we handle data across applications. From data validation to standardized schemas, these libraries provide essential functionality for organizing, validating, and enriching information. I’ve worked extensively with these tools and will share practical implementations to help you integrate them into your projects.
Pydantic: Type-Based Data Validation
Pydantic stands out for its ability to enforce data schemas using Python type hints. It validates data with minimal code while providing clear error messages when validation fails.
I often use Pydantic when working with APIs or processing user input. The library makes it simple to define expected data structures and automatically converts and validates incoming data.
Here’s a basic example of how to define a metadata schema:
from pydantic import BaseModel, Field, field_validator
from typing import List, Optional
from datetime import datetime

class ContentMetadata(BaseModel):
    title: str = Field(..., min_length=1, max_length=100)
    description: Optional[str] = Field(None, max_length=500)
    tags: List[str] = []
    created_at: datetime = Field(default_factory=datetime.now)
    version: float = 1.0

    # Pydantic v2 style; in v1 this was the (now deprecated) @validator
    @field_validator('tags')
    @classmethod
    def check_tags(cls, v):
        if len(v) > 10:
            raise ValueError('Maximum 10 tags allowed')
        return v

# Using the model
try:
    metadata = ContentMetadata(
        title="Document Title",
        description="This is a sample document",
        tags=["python", "metadata", "management"]
    )
    print(metadata.model_dump())
except Exception as e:
    print(f"Validation error: {e}")
For more complex scenarios, Pydantic supports nested models, custom validators, and JSON schema generation:
from pydantic import BaseModel, model_validator
from typing import Dict, List, Optional

class Author(BaseModel):
    name: str
    email: Optional[str] = None

class FileMetadata(BaseModel):
    filename: str
    file_type: str
    size_kb: float

    # Cross-field validation (the v2 replacement for @root_validator)
    @model_validator(mode='after')
    def check_file_properties(self):
        if self.size_kb > 10000 and self.file_type not in ('zip', 'tar.gz'):
            raise ValueError('Large files should be compressed')
        return self

class DocumentMetadata(BaseModel):
    title: str
    authors: List[Author]
    keywords: List[str] = []
    file_info: FileMetadata
    custom_fields: Dict[str, str] = {}

# Creating a document with nested metadata
doc_metadata = DocumentMetadata(
    title="Research Paper",
    authors=[
        Author(name="Jane Smith", email="[email protected]"),
        Author(name="John Doe")
    ],
    keywords=["research", "metadata", "python"],
    file_info=FileMetadata(
        filename="research_paper.pdf",
        file_type="pdf",
        size_kb=2540.5
    ),
    custom_fields={"department": "Research", "status": "Draft"}
)
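The same models also generate JSON Schema, which is handy for documenting APIs or driving other validators; in Pydantic v2 it is a single call:

import json

# Emit a JSON Schema describing DocumentMetadata, nested models included
schema = DocumentMetadata.model_json_schema()
print(json.dumps(schema, indent=2))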
In my projects, I’ve found that Pydantic excels at validating configuration files, API payloads, and database records, ensuring data integrity throughout the application lifecycle.
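As a minimal sketch of the configuration use case (the file name and settings fields here are hypothetical):

import json
from pydantic import BaseModel, ValidationError

class AppConfig(BaseModel):
    database_url: str
    max_connections: int = 10
    debug: bool = False

try:
    with open("config.json") as f:  # hypothetical config file
        config = AppConfig(**json.load(f))
    print(f"Loaded config: {config.database_url}")
except ValidationError as e:
    print(f"Invalid configuration: {e}")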
Marshmallow: Object Serialization Framework
Marshmallow provides a flexible framework for converting complex objects to and from Python primitives. This is particularly useful when working with databases, APIs, or file formats that require serialization.
I prefer Marshmallow when I need fine-grained control over how data is serialized and deserialized:
from marshmallow import Schema, fields, validate, post_load
import datetime

class MetadataSchema(Schema):
    id = fields.String()
    title = fields.String(required=True, validate=validate.Length(min=1, max=100))
    description = fields.String(validate=validate.Length(max=500))
    # marshmallow 3 renamed `default` to `dump_default`
    created = fields.DateTime(dump_default=datetime.datetime.now)
    modified = fields.DateTime()
    version = fields.Float(dump_default=1.0)
    tags = fields.List(fields.String(), validate=validate.Length(max=10))

    @post_load
    def make_metadata(self, data, **kwargs):
        return Metadata(**data)

class Metadata:
    def __init__(self, id=None, title=None, description=None, created=None,
                 modified=None, version=1.0, tags=None):
        self.id = id
        self.title = title
        self.description = description
        self.created = created or datetime.datetime.now()
        self.modified = modified
        self.version = version
        self.tags = tags or []

# Serialize object to dictionary
metadata_obj = Metadata(
    id="doc123",
    title="Important Document",
    description="Contains critical information",
    tags=["important", "critical"]
)

schema = MetadataSchema()
result = schema.dump(metadata_obj)
print("Serialized data:", result)

# Deserialize from JSON/dict to object
json_data = {
    "id": "doc456",
    "title": "Meeting Notes",
    "description": "Notes from team meeting",
    "tags": ["meeting", "notes", "team"]
}
loaded_metadata = schema.load(json_data)
print(f"Deserialized object: {loaded_metadata.title}, {loaded_metadata.tags}")
For handling nested data structures, Marshmallow provides powerful tools:
from marshmallow import Schema, fields, validate

class LocationSchema(Schema):
    latitude = fields.Float(required=True)
    longitude = fields.Float(required=True)
    altitude = fields.Float()

class CameraInfoSchema(Schema):
    make = fields.String()
    model = fields.String()
    settings = fields.Dict(keys=fields.String(), values=fields.String())

class ImageMetadataSchema(Schema):
    filename = fields.String(required=True)
    format = fields.String(validate=validate.OneOf(["JPEG", "PNG", "TIFF", "RAW"]))
    dimensions = fields.Tuple((fields.Integer(), fields.Integer()))
    size_bytes = fields.Integer()
    taken_at = fields.DateTime()
    location = fields.Nested(LocationSchema)
    camera = fields.Nested(CameraInfoSchema)

# Example usage
image_data = {
    "filename": "vacation_photo.jpg",
    "format": "JPEG",
    "dimensions": (3840, 2160),
    "size_bytes": 2540000,
    "taken_at": "2023-06-15T14:35:12",
    "location": {
        "latitude": 37.7749,
        "longitude": -122.4194,
        "altitude": 12.5
    },
    "camera": {
        "make": "Canon",
        "model": "EOS 5D Mark IV",
        "settings": {
            "aperture": "f/2.8",
            "shutter_speed": "1/250",
            "iso": "100"
        }
    }
}

schema = ImageMetadataSchema()
result = schema.load(image_data)   # Validates and deserializes
serialized = schema.dump(result)   # Serializes back to primitives
Marshmallow has saved me hours when building APIs that need to transform data between different formats, especially when the data structures are complex or evolve over time.
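One pattern that helps with evolving structures: map legacy wire names onto current attribute names with data_key, and drop fields you no longer track with unknown = EXCLUDE. A minimal sketch (the legacy field names here are hypothetical):

from marshmallow import Schema, fields, EXCLUDE

class LegacyImageSchema(Schema):
    class Meta:
        unknown = EXCLUDE  # silently ignore fields we no longer use

    # data_key maps the old wire name to the current attribute name
    filename = fields.String(data_key="file_name", required=True)
    size_bytes = fields.Integer(data_key="filesize")

old_payload = {"file_name": "scan.jpg", "filesize": 1024, "obsolete_flag": True}
print(LegacyImageSchema().load(old_payload))
# {'filename': 'scan.jpg', 'size_bytes': 1024}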
Dublin Core: Standardized Metadata Elements
The Dublin Core standard provides a common set of metadata elements applicable across many domains. Python libraries implementing this standard help maintain consistency in how resources are described.
A practical implementation using the dcxml library:
from dcxml import simpledc
import xml.etree.ElementTree as ET

# Create Dublin Core metadata; dcxml takes a dict of lists keyed by
# pluralized element names
data = dict(
    titles=['Python Metadata Management'],
    creators=['Jane Developer'],
    subjects=['Python', 'Metadata', 'Libraries'],
    descriptions=['A comprehensive guide to managing metadata with Python'],
    publishers=['Tech Publications'],
    contributors=['John Reviewer'],
    dates=['2023-09-15'],
    types=['Text'],
    formats=['text/html'],
    identifiers=['DOI:10.1234/example.2023'],
    sources=['Original research'],
    languages=['en'],
    relations=['https://example.com/related-resource'],
    coverage=['Global'],
    rights=['CC BY-SA 4.0'],
)

# Convert to Dublin Core XML
xml_string = simpledc.tostring(data)
print(xml_string)
# Parse existing Dublin Core XML
xml_content = '''
<simpledc xmlns:dc="http://purl.org/dc/elements/1.1/">
    <dc:title>Research Data</dc:title>
    <dc:creator>Research Team</dc:creator>
    <dc:date>2023-10-01</dc:date>
    <dc:type>Dataset</dc:type>
    <dc:identifier>dataset-2023-001</dc:identifier>
</simpledc>
'''

parsed_dc = ET.fromstring(xml_content)
for element in parsed_dc:
    tag = element.tag.split('}')[1]  # Remove namespace
    print(f"{tag}: {element.text}")
For more advanced Dublin Core operations, you can use a custom implementation:
from dataclasses import dataclass, field
from typing import List, Optional
import json
import xml.etree.ElementTree as ET

@dataclass
class DublinCore:
    title: str
    creator: List[str] = field(default_factory=list)
    subject: List[str] = field(default_factory=list)
    description: Optional[str] = None
    publisher: Optional[str] = None
    contributor: List[str] = field(default_factory=list)
    date: Optional[str] = None
    type: Optional[str] = None
    format: Optional[str] = None
    identifier: Optional[str] = None
    source: Optional[str] = None
    language: Optional[str] = None
    relation: List[str] = field(default_factory=list)
    coverage: Optional[str] = None
    rights: Optional[str] = None

    def to_dict(self):
        """Convert to dictionary, excluding None values"""
        return {k: v for k, v in self.__dict__.items() if v is not None}

    def to_json(self):
        """Convert to JSON string"""
        return json.dumps(self.to_dict(), indent=2)

    def to_xml(self):
        """Convert to Dublin Core XML"""
        root = ET.Element("metadata")
        # Declare the dc namespace as a literal attribute; ElementTree then
        # serializes the prefixed tags below as-is
        root.set("xmlns:dc", "http://purl.org/dc/elements/1.1/")
        for key, value in self.to_dict().items():
            if isinstance(value, list):
                for item in value:
                    elem = ET.SubElement(root, f"dc:{key}")
                    elem.text = item
            else:
                elem = ET.SubElement(root, f"dc:{key}")
                elem.text = str(value)
        return ET.tostring(root, encoding='utf-8').decode()

    @classmethod
    def from_dict(cls, data):
        """Create DublinCore from dictionary"""
        return cls(**data)

    @classmethod
    def from_json(cls, json_str):
        """Create DublinCore from JSON string"""
        return cls.from_dict(json.loads(json_str))

# Example usage
metadata = DublinCore(
    title="Climate Data Analysis",
    creator=["Climate Research Institute", "Dr. Jane Smith"],
    subject=["climate", "data analysis", "research"],
    description="Analysis of global climate patterns from 1990-2020",
    publisher="Scientific Data Repository",
    date="2023-08-15",
    type="Dataset",
    identifier="doi:10.1234/climate.2023.08",
    language="en",
    rights="CC BY 4.0"
)

print(metadata.to_json())
print("\nXML representation:")
print(metadata.to_xml())
I’ve implemented Dublin Core metadata in digital archives and document management systems where standardized elements made it easier to exchange information with other systems and maintain consistency in metadata tagging.
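When records come back from another system, the XML produced by to_xml can be parsed into a DublinCore again; here's a minimal round-trip sketch using only the standard library:

LIST_FIELDS = {"creator", "subject", "contributor", "relation"}

def dublin_core_from_xml(xml_str):
    """Parse XML produced by DublinCore.to_xml back into a DublinCore"""
    root = ET.fromstring(xml_str)
    data = {}
    for elem in root:
        key = elem.tag.split('}')[-1]  # strip the namespace, if any
        if key in LIST_FIELDS:
            data.setdefault(key, []).append(elem.text)
        else:
            data[key] = elem.text
    return DublinCore(**data)

roundtrip = dublin_core_from_xml(metadata.to_xml())
print(roundtrip.title, roundtrip.creator)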
Exif: Image Metadata Extraction and Manipulation
Python's imaging ecosystem makes it straightforward to read and modify EXIF image metadata; the examples below combine Pillow (for reading) with piexif (for writing), providing access to technical details, camera settings, and geolocation information embedded in images.
Here’s how I typically work with Exif data:
from PIL import Image
from PIL.ExifTags import TAGS, GPSTAGS
import piexif
import piexif.helper

def extract_exif(image_path):
    """Extract and format EXIF data from an image"""
    with Image.open(image_path) as image:
        # _getexif() is a private but long-standing Pillow API (JPEG/TIFF)
        info = image._getexif() if hasattr(image, '_getexif') else None

    # Check if image has EXIF data
    if not info:
        return {"error": "No EXIF data found"}

    exif_data = {}
    for tag, value in info.items():
        decoded = TAGS.get(tag, tag)
        # Special handling for GPS info
        if decoded == 'GPSInfo':
            gps_data = {}
            for gps_tag in value:
                sub_decoded = GPSTAGS.get(gps_tag, gps_tag)
                gps_data[sub_decoded] = value[gps_tag]
            exif_data[decoded] = gps_data
        else:
            exif_data[decoded] = value
    return exif_data

def modify_exif(image_path, output_path, metadata_updates):
    """Modify EXIF data in an image"""
    # Load existing EXIF data
    exif_dict = piexif.load(image_path)
    # Update 0th IFD (main image metadata)
    for key, value in metadata_updates.get('0th', {}).items():
        exif_dict['0th'][key] = value
    # Update Exif IFD (additional metadata)
    for key, value in metadata_updates.get('Exif', {}).items():
        exif_dict['Exif'][key] = value
    # Update GPS IFD
    for key, value in metadata_updates.get('GPS', {}).items():
        exif_dict['GPS'][key] = value
    # Convert to bytes and save a copy with the new EXIF
    exif_bytes = piexif.dump(exif_dict)
    piexif.insert(exif_bytes, image_path, output_path)
    return True
# Example usage
if __name__ == "__main__":
    # Extract and print EXIF data
    image_path = "sample_photo.jpg"
    exif_info = extract_exif(image_path)
    print("Image EXIF data:")
    for key, value in exif_info.items():
        print(f"{key}: {value}")

    # Modify EXIF data
    # See piexif documentation for tag IDs: https://github.com/hMatoba/Piexif
    updates = {
        '0th': {
            piexif.ImageIFD.Copyright: "© 2023 Photography Studio",
            piexif.ImageIFD.Artist: "Jane Photographer"
        },
        'Exif': {
            # The helper prepends the character-set header EXIF requires
            piexif.ExifIFD.UserComment: piexif.helper.UserComment.dump(
                "Edited with Python Exif tools")
        },
        'GPS': {
            # GPS values are (numerator, denominator) rationals:
            # degrees, minutes, seconds
            piexif.GPSIFD.GPSLatitudeRef: "N",
            piexif.GPSIFD.GPSLatitude: ((37, 1), (46, 1), (30, 1)),
            piexif.GPSIFD.GPSLongitudeRef: "W",
            piexif.GPSIFD.GPSLongitude: ((122, 1), (25, 1), (10, 1))
        }
    }
    modify_exif(image_path, "modified_photo.jpg", updates)
    print("EXIF data modified and saved to modified_photo.jpg")
Here’s a more advanced function for batch processing images and extracting structured metadata:
import os
from PIL import Image
from PIL.ExifTags import TAGS, GPSTAGS
from datetime import datetime
import json

def batch_extract_metadata(directory, output_json=None):
    """
    Extract metadata from all images in a directory and optionally save to JSON

    Args:
        directory: Path to directory containing images
        output_json: Optional path to save JSON output

    Returns:
        Dictionary with metadata for all images
    """
    metadata_collection = {}
    supported_formats = {'.jpg', '.jpeg', '.tiff', '.png'}

    for filename in os.listdir(directory):
        file_ext = os.path.splitext(filename)[1].lower()
        if file_ext not in supported_formats:
            continue
        file_path = os.path.join(directory, filename)
        try:
            # Get basic file information
            file_stats = os.stat(file_path)
            metadata = {
                "filename": filename,
                "file_size_kb": file_stats.st_size / 1024,
                "last_modified": datetime.fromtimestamp(file_stats.st_mtime).isoformat(),
                "exif": {}
            }
            # Get image properties
            with Image.open(file_path) as img:
                metadata["dimensions"] = img.size
                metadata["format"] = img.format
                metadata["mode"] = img.mode
                # Extract EXIF if available
                exif = img._getexif() if hasattr(img, '_getexif') else None
                if exif:
                    for tag_id, value in exif.items():
                        tag = TAGS.get(tag_id, tag_id)
                        # Convert bytes to string where needed
                        if isinstance(value, bytes):
                            try:
                                value = value.decode('utf-8')
                            except UnicodeDecodeError:
                                value = str(value)
                        # Handle special cases
                        if tag == 'GPSInfo':
                            gps_data = {}
                            for gps_tag in value:
                                sub_decoded = GPSTAGS.get(gps_tag, gps_tag)
                                gps_data[sub_decoded] = value[gps_tag]
                            # Format GPS coordinates if available
                            if 'GPSLatitude' in gps_data and 'GPSLongitude' in gps_data:
                                try:
                                    lat = _convert_to_degrees(gps_data['GPSLatitude'])
                                    lon = _convert_to_degrees(gps_data['GPSLongitude'])
                                    # Apply reference direction
                                    if gps_data.get('GPSLatitudeRef', 'N') == 'S':
                                        lat = -lat
                                    if gps_data.get('GPSLongitudeRef', 'E') == 'W':
                                        lon = -lon
                                    gps_data['position'] = (lat, lon)
                                except (TypeError, ZeroDivisionError, IndexError):
                                    pass
                            metadata["exif"][tag] = gps_data
                        elif tag in ('DateTime', 'DateTimeOriginal', 'DateTimeDigitized'):
                            # Keep the raw string and add an ISO 8601 variant
                            metadata["exif"][tag] = value
                            try:
                                date_obj = datetime.strptime(value, '%Y:%m:%d %H:%M:%S')
                                metadata["exif"][f"{tag}_ISO"] = date_obj.isoformat()
                            except ValueError:
                                pass
                        else:
                            metadata["exif"][tag] = value
            metadata_collection[filename] = metadata
        except Exception as e:
            metadata_collection[filename] = {"error": str(e)}

    # Save to JSON if requested
    if output_json:
        class ExifJSONEncoder(json.JSONEncoder):
            def default(self, obj):
                if isinstance(obj, datetime):
                    return obj.isoformat()
                try:
                    return float(obj)  # IFDRational and similar numerics
                except (TypeError, ValueError):
                    return str(obj)    # last-resort fallback for exotic EXIF types

        with open(output_json, 'w') as f:
            json.dump(metadata_collection, f, indent=2, cls=ExifJSONEncoder)

    return metadata_collection

def _rational_to_float(r):
    """Handle both Pillow's IFDRational (float()-able) and (num, den) tuples"""
    try:
        return float(r)
    except TypeError:
        return r[0] / r[1]

def _convert_to_degrees(value):
    """Convert GPS coordinates from EXIF (degrees, minutes, seconds) to decimal degrees"""
    degrees = _rational_to_float(value[0])
    minutes = _rational_to_float(value[1]) / 60
    seconds = _rational_to_float(value[2]) / 3600
    return degrees + minutes + seconds

# Example usage
if __name__ == "__main__":
    photo_dir = "vacation_photos"
    metadata = batch_extract_metadata(photo_dir, "photo_metadata.json")
    print(f"Extracted metadata from {len(metadata)} images")
In photography applications, I’ve used these tools to organize photos by location, camera settings, and timestamps, making large photo collections much more searchable and valuable.
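As a small illustration of that kind of organization, here's a sketch that buckets the JSON output of batch_extract_metadata by capture date (assuming the photo_metadata.json produced above):

import json
from collections import defaultdict

with open("photo_metadata.json") as f:
    photos = json.load(f)

by_date = defaultdict(list)
for filename, meta in photos.items():
    # Prefer the EXIF capture time; fall back to the file modification time
    taken = meta.get("exif", {}).get("DateTimeOriginal_ISO") or meta.get("last_modified")
    if taken:
        by_date[taken[:10]].append(filename)  # group by YYYY-MM-DD

for day, files in sorted(by_date.items()):
    print(f"{day}: {len(files)} photo(s)")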
Python-docx: Document Metadata Management
The python-docx library allows reading and writing a Word document's core metadata properties (title, author, keywords, and so on). I've used it to automate document processing workflows and keep metadata consistent. Note that custom document properties are outside its public API, so the examples below stick to core properties.
Here’s how to work with document metadata:
from docx import Document
import datetime

def read_document_metadata(docx_path):
    """Extract metadata from a Word document"""
    doc = Document(docx_path)
    core_props = doc.core_properties
    metadata = {
        "title": core_props.title,
        "author": core_props.author,
        "created": core_props.created,
        "modified": core_props.modified,
        "last_modified_by": core_props.last_modified_by,
        "revision": core_props.revision,
        "category": core_props.category,
        "comments": core_props.comments,
        "keywords": core_props.keywords,
        "subject": core_props.subject,
        "version": core_props.version,
        "language": core_props.language
    }
    # python-docx exposes only these core properties. Custom document
    # properties (docProps/custom.xml) are not part of its public API and
    # would require manipulating the underlying XML part directly.
    return metadata

def update_document_metadata(docx_path, output_path, metadata_updates):
    """Update metadata in a Word document"""
    doc = Document(docx_path)
    core_props = doc.core_properties
    # Update any core properties present in the updates dict
    core_fields = ("title", "author", "category", "comments",
                   "keywords", "subject", "version", "language")
    for name in core_fields:
        if name in metadata_updates:
            setattr(core_props, name, metadata_updates[name])
    # Update last modified automatically
    core_props.last_modified_by = metadata_updates.get("last_modified_by", "Python Script")
    core_props.modified = datetime.datetime.now()
    # Save document with updated metadata
    doc.save(output_path)
    return True
# Example usage
if __name__ == "__main__":
    # Read metadata
    document_path = "report.docx"
    metadata = read_document_metadata(document_path)
    print("Document Metadata:")
    for key, value in metadata.items():
        print(f"{key}: {value}")

    # Update metadata
    updates = {
        "title": "Quarterly Financial Report",
        "author": "Finance Department",
        "keywords": "finance, quarterly, report, 2023",
        "subject": "Q3 2023 Financial Analysis",
        "category": "Finance",
        "comments": "Status: Draft; Confidentiality: Internal",
        "version": "1.2",
        "language": "en-US",
        "last_modified_by": "Metadata Management Script"
    }
    update_document_metadata(document_path, "updated_report.docx", updates)
    print("Metadata updated and saved to updated_report.docx")
For batch processing documents and ensuring consistent metadata across a document library:
import os
import glob
from docx import Document
import pandas as pd
import datetime

def batch_process_document_metadata(directory, pattern="*.docx", updates=None, export_csv=None):
    """
    Process metadata for multiple documents in a directory

    Args:
        directory: Directory containing Word documents
        pattern: File pattern to match (default: "*.docx")
        updates: Dict of metadata updates to apply to all documents
        export_csv: Path to export metadata to CSV file

    Returns:
        DataFrame with metadata for all documents
    """
    files = glob.glob(os.path.join(directory, pattern))
    metadata_records = []

    for file_path in files:
        filename = os.path.basename(file_path)
        try:
            # Read current metadata
            doc = Document(file_path)
            props = doc.core_properties
            record = {
                "filename": filename,
                "path": file_path,
                "title": props.title,
                "author": props.author,
                "created": props.created,
                "modified": props.modified,
                "last_modified_by": props.last_modified_by,
                "revision": props.revision,
                "category": props.category,
                "comments": props.comments,
                "keywords": props.keywords,
                "subject": props.subject
            }
            # Update metadata if requested
            if updates:
                # Only fill in fields that are currently empty
                if "title" in updates and not props.title:
                    props.title = updates["title"]
                if "category" in updates and not props.category:
                    props.category = updates["category"]
                if "keywords" in updates and not props.keywords:
                    props.keywords = updates["keywords"]
                # Apply custom update rules if provided
                if callable(updates.get("custom_function")):
                    updates["custom_function"](doc, props, filename)
                # Save document with updates
                doc.save(file_path)
                # Update record with new values
                record["title"] = props.title
                record["category"] = props.category
                record["keywords"] = props.keywords
                record["modified"] = datetime.datetime.now()
                record["last_modified_by"] = "Batch Processor"
            metadata_records.append(record)
        except Exception as e:
            metadata_records.append({
                "filename": filename,
                "path": file_path,
                "error": str(e)
            })

    # Convert to DataFrame
    df = pd.DataFrame(metadata_records)
    # Export to CSV if requested
    if export_csv:
        df.to_csv(export_csv, index=False)
    return df
# Example with custom update function
def custom_update_logic(doc, props, filename):
    """Custom logic to update document metadata based on filename or content"""
    # Record the department in the category core property, since python-docx
    # does not expose custom document properties
    if filename.startswith("FIN-"):
        props.category = "Finance"
    elif filename.startswith("HR-"):
        props.category = "Human Resources"
    # Record a review date (3 months out) for documents without one
    if not props.comments:
        review_date = datetime.datetime.now() + datetime.timedelta(days=90)
        props.comments = f"Review by: {review_date:%Y-%m-%d}"
# Example usage
if __name__ == "__main__":
    updates = {
        "category": "Company Documents",
        "keywords": "internal, documentation",
        "custom_function": custom_update_logic
    }
    results = batch_process_document_metadata(
        "company_documents",
        updates=updates,
        export_csv="document_metadata.csv"
    )
    print(f"Processed {len(results)} documents")
    print(results.head())
This library has been invaluable for me when working on document management systems where maintaining consistent metadata across hundreds or thousands of documents was critical for search and compliance requirements.
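To give the compliance angle a concrete shape: once batch_process_document_metadata has returned its DataFrame, flagging documents that are missing required fields takes a few lines (a sketch, assuming a required-field policy like the one below and at least some successfully read documents):

# Policy: every document must carry a title, keywords, and a category
required = ["title", "keywords", "category"]
# Unset core properties may come back as empty strings, so check both
incomplete = results[required].isna() | results[required].eq("")
missing = results[incomplete.any(axis=1)]
print(f"{len(missing)} document(s) missing required metadata")
print(missing[["filename"] + required])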
These five Python libraries form a powerful toolkit for metadata management across different types of data and file formats. By combining them, I’ve built robust systems that validate, transform, and standardize metadata, ensuring data assets remain discoverable, usable, and well-documented throughout their lifecycle.
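To make that combination concrete, here's a minimal sketch that validates an incoming record with Pydantic and then emits standardized Dublin Core XML, assuming the DublinCore dataclass from the Dublin Core section is in scope (the record shape is hypothetical):

from typing import List
from pydantic import BaseModel

class RecordIn(BaseModel):
    """Incoming record validated before it enters the archive"""
    title: str
    creators: List[str]
    language: str = "en"

record = RecordIn(title="Sensor Readings", creators=["Field Team"])
# Hand the validated data to the DublinCore dataclass defined earlier
dc = DublinCore(title=record.title, creator=record.creators, language=record.language)
print(dc.to_xml())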