Slide 1: Excel File Creation and Basic Operations
Python's openpyxl library provides comprehensive tools for Excel automation, allowing creation of workbooks, sheets manipulation, and cell operations. The library supports both reading and writing capabilities while maintaining Excel's native formatting and functionality.
from openpyxl import Workbook
from datetime import datetime
# Create a new workbook and select active sheet
wb = Workbook()
ws = wb.active
ws.title = "Sales Data"
# Add headers and sample data
headers = ['Date', 'Product', 'Quantity', 'Price', 'Total']
for col, header in enumerate(headers, 1):
ws.cell(row=1, column=col, value=header)
# Sample data entry
data = [
(datetime.now(), 'Laptop', 5, 999.99, '=C2*D2'),
(datetime.now(), 'Mouse', 10, 24.99, '=C3*D3'),
]
for row, (date, product, qty, price, formula) in enumerate(data, 2):
ws.cell(row=row, column=1, value=date)
ws.cell(row=row, column=2, value=product)
ws.cell(row=row, column=3, value=qty)
ws.cell(row=row, column=4, value=price)
ws.cell(row=row, column=5, value=formula)
wb.save('sales_report.xlsx')Slide 2: Advanced Data Manipulation and Filtering
The pandas library extends Python's capabilities for Excel manipulation by providing powerful data structures and operations. It excels at handling large datasets with complex filtering and transformation requirements.
import pandas as pd
import numpy as np
# Create sample sales data
data = {
'Product': ['Laptop', 'Mouse', 'Keyboard', 'Monitor', 'Laptop'],
'Category': ['Electronics', 'Accessories', 'Accessories', 'Electronics', 'Electronics'],
'Price': [999.99, 24.99, 49.99, 299.99, 1299.99],
'Stock': [50, 200, 150, 30, 25]
}
# Create DataFrame and perform operations
df = pd.DataFrame(data)
# Advanced filtering and calculations
electronics = df[df['Category'] == 'Electronics']
avg_price = df.groupby('Category')['Price'].mean()
total_stock = df.groupby('Product')['Stock'].sum()
# Export to Excel with multiple sheets
with pd.ExcelWriter('inventory_analysis.xlsx') as writer:
df.to_excel(writer, sheet_name='Raw Data', index=False)
electronics.to_excel(writer, sheet_name='Electronics', index=False)
avg_price.to_excel(writer, sheet_name='Average Prices')
total_stock.to_excel(writer, sheet_name='Stock Summary')Slide 3: Excel Formatting and Styling
Excel automation in Python extends beyond data manipulation to include sophisticated formatting options. The openpyxl library provides extensive styling capabilities for creating professional-looking spreadsheets programmatically.
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
from openpyxl.utils import get_column_letter
wb = Workbook()
ws = wb.active
# Define styles
header_font = Font(name='Arial', size=12, bold=True, color='FFFFFF')
header_fill = PatternFill(start_color='4F81BD', end_color='4F81BD', fill_type='solid')
border = Border(left=Side(style='thin'), right=Side(style='thin'),
top=Side(style='thin'), bottom=Side(style='thin'))
# Apply formatting
headers = ['ID', 'Product', 'Sales', 'Revenue']
for col, header in enumerate(headers, 1):
cell = ws.cell(row=1, column=col, value=header)
cell.font = header_font
cell.fill = header_fill
cell.alignment = Alignment(horizontal='center')
cell.border = border
# Adjust column width
ws.column_dimensions[get_column_letter(col)].width = 15
# Add sample data with formatting
data = [(1, 'Laptop', 100, 99999), (2, 'Mouse', 500, 12495)]
for row, (id_, prod, sales, rev) in enumerate(data, 2):
ws.cell(row=row, column=1, value=id_)
ws.cell(row=row, column=2, value=prod)
ws.cell(row=row, column=3, value=sales)
ws.cell(row=row, column=4, value=rev)
# Format numbers
ws.cell(row=row, column=4).number_format = '$#,##0.00'
wb.save('formatted_report.xlsx')Slide 4: Data Validation and Protection
Excel automation requires robust data validation and worksheet protection mechanisms to maintain data integrity. Python provides comprehensive tools for implementing various validation rules and security measures programmatically.
from openpyxl.worksheet.datavalidation import DataValidation
from openpyxl.worksheet.protection import SheetProtection
wb = Workbook()
ws = wb.active
# Create dropdown validation
product_list = '"Laptop,Desktop,Tablet,Phone"'
dv = DataValidation(type="list", formula1=product_list, allow_blank=True)
ws.add_data_validation(dv)
# Apply validation to range
dv.add('B2:B100')
# Set up protection with specific permissions
ws.protection = SheetProtection(
sheet=True,
insertRows=False,
insertColumns=False,
formatCells=True,
selectLockedCells=True,
password='secure123'
)
# Lock specific cells
for row in ws['A1:D1']:
for cell in row:
cell.protection = Protection(locked=True)
# Add sample data
headers = ['Date', 'Product', 'Quantity', 'Price']
for col, header in enumerate(headers, 1):
ws.cell(row=1, column=col, value=header)
wb.save('protected_worksheet.xlsx')Slide 5: Excel Chart Generation
Automated chart creation enables dynamic visualization of data trends and patterns. Python's openpyxl library provides sophisticated charting capabilities for creating various types of Excel charts programmatically.
from openpyxl.chart import BarChart, Reference, Series
def create_sales_chart(filename='sales_chart.xlsx'):
wb = Workbook()
ws = wb.active
# Sample data
data = [
['Product', '2022', '2023'],
['Laptops', 100, 150],
['Phones', 80, 90],
['Tablets', 60, 75],
['Monitors', 45, 55]
]
# Write data
for row in data:
ws.append(row)
# Create chart
chart = BarChart()
chart.type = "col"
chart.title = "Sales Comparison 2022-2023"
chart.y_axis.title = 'Units Sold'
chart.x_axis.title = 'Products'
# Define data ranges
data = Reference(ws, min_col=2, min_row=1, max_col=3, max_row=5)
categories = Reference(ws, min_col=1, min_row=2, max_row=5)
# Add data to chart
chart.add_data(data, titles_from_data=True)
chart.set_categories(categories)
# Position chart
ws.add_chart(chart, "A7")
wb.save(filename)
# Create chart
create_sales_chart()Slide 6: Automated Excel Reporting System
Building an automated reporting system requires integration of multiple Excel automation techniques. This implementation demonstrates a complete reporting pipeline including data processing, formatting, and distribution.
import pandas as pd
from openpyxl.utils import get_column_letter
from datetime import datetime
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.base import MIMEBase
from email.mime.text import MIMEText
from email import encoders
class ExcelReportGenerator:
def __init__(self):
self.wb = Workbook()
self.ws = self.wb.active
def process_data(self, data_dict):
df = pd.DataFrame(data_dict)
# Process and transform data
df['Total'] = df['Quantity'] * df['Price']
df['Date'] = pd.to_datetime(df['Date'])
return df
def create_report(self, data_dict, filename='automated_report.xlsx'):
df = self.process_data(data_dict)
# Write to Excel with formatting
with pd.ExcelWriter(filename, engine='openpyxl') as writer:
df.to_excel(writer, sheet_name='Sales Report', index=False)
ws = writer.sheets['Sales Report']
# Format headers
for col in range(1, len(df.columns) + 1):
cell = ws.cell(row=1, column=col)
cell.font = Font(bold=True)
cell.fill = PatternFill(start_color='366092',
end_color='366092',
fill_type='solid')
# Adjust column widths
for col in range(1, len(df.columns) + 1):
ws.column_dimensions[get_column_letter(col)].width = 15
return filename
def send_report(self, filename, recipient):
msg = MIMEMultipart()
msg['Subject'] = f'Automated Sales Report - {datetime.now().date()}'
msg['From'] = 'sender@example.com'
msg['To'] = recipient
# Attach Excel file
with open(filename, 'rb') as f:
part = MIMEBase('application', 'octet-stream')
part.set_payload(f.read())
encoders.encode_base64(part)
part.add_header('Content-Disposition',
f'attachment; filename="{filename}"')
msg.attach(part)
# Implementation of email sending would go here
print(f"Report {filename} prepared for sending to {recipient}")
# Usage Example
data = {
'Date': ['2024-01-01', '2024-01-02'],
'Product': ['Laptop', 'Phone'],
'Quantity': [5, 10],
'Price': [999.99, 499.99]
}
reporter = ExcelReportGenerator()
report_file = reporter.create_report(data)
reporter.send_report(report_file, 'recipient@example.com')Slide 7: Excel Template Automation
Template automation enables consistent report generation while maintaining complex formatting and calculations. This implementation demonstrates how to programmatically populate and modify Excel templates while preserving their structure and formulas.
from openpyxl import load_workbook
from datetime import datetime, timedelta
import random
def populate_template(template_path, output_path):
# Load existing template
wb = load_workbook(template_path, keep_vba=True)
ws = wb['Report']
# Generate sample data
current_date = datetime.now()
sales_data = []
for i in range(10):
date = current_date - timedelta(days=i)
sales_data.append({
'date': date,
'units': random.randint(50, 200),
'revenue': random.uniform(5000, 15000),
'costs': random.uniform(2000, 8000)
})
# Populate template
start_row = 5 # Assuming header is at row 4
for idx, data in enumerate(sales_data, start=start_row):
ws.cell(row=idx, column=1, value=data['date'])
ws.cell(row=idx, column=2, value=data['units'])
ws.cell(row=idx, column=3, value=data['revenue'])
ws.cell(row=idx, column=4, value=data['costs'])
# Profit formula remains intact due to template
# Update report metadata
ws['B1'] = datetime.now()
ws['B2'] = f'Report_{datetime.now().strftime("%Y%m%d")}'
wb.save(output_path)
return output_path
# Usage
template_file = 'sales_template.xlsx'
output_file = f'sales_report_{datetime.now().strftime("%Y%m%d")}.xlsx'
populate_template(template_file, output_file)Slide 8: Excel Data Analysis and Statistical Operations
Implementing advanced statistical operations and data analysis directly in Excel files through Python automation enables sophisticated data processing while maintaining Excel compatibility for end users.
import pandas as pd
import numpy as np
from scipy import stats
class ExcelAnalyzer:
def __init__(self, input_file):
self.df = pd.read_excel(input_file)
self.analysis_results = {}
def perform_statistical_analysis(self):
numeric_columns = self.df.select_dtypes(include=[np.number]).columns
for col in numeric_columns:
self.analysis_results[col] = {
'mean': self.df[col].mean(),
'median': self.df[col].median(),
'std': self.df[col].std(),
'skewness': stats.skew(self.df[col].dropna()),
'kurtosis': stats.kurtosis(self.df[col].dropna()),
'q1': self.df[col].quantile(0.25),
'q3': self.df[col].quantile(0.75)
}
def add_analysis_sheet(self, output_file):
# Create statistical summary sheet
with pd.ExcelWriter(output_file, engine='openpyxl', mode='a') as writer:
# Convert analysis results to DataFrame
analysis_df = pd.DataFrame(self.analysis_results)
analysis_df.to_excel(writer, sheet_name='Statistical Analysis')
# Create correlation matrix
corr_matrix = self.df.select_dtypes(include=[np.number]).corr()
corr_matrix.to_excel(writer, sheet_name='Correlation Matrix')
# Add histogram sheets for numeric columns
for col in self.df.select_dtypes(include=[np.number]).columns:
hist_data = pd.cut(self.df[col], bins=10).value_counts().sort_index()
hist_data.to_excel(writer, sheet_name=f'{col}_Distribution')
# Usage example
analyzer = ExcelAnalyzer('sales_data.xlsx')
analyzer.perform_statistical_analysis()
analyzer.add_analysis_sheet('sales_analysis_report.xlsx')Slide 9: Excel Formula Generation and Management
Complex Excel formulas can be generated and managed programmatically using Python, enabling the creation of sophisticated calculation sheets while maintaining formula integrity and readability.
class ExcelFormulaBuilder:
def __init__(self):
self.wb = Workbook()
self.ws = self.wb.active
def create_financial_formulas(self, start_row=2, end_row=100):
# Set headers
headers = ['Revenue', 'Costs', 'Tax Rate', 'Gross Profit', 'Net Profit']
for col, header in enumerate(headers, 1):
self.ws.cell(row=1, column=col, value=header)
# Generate formulas for each row
for row in range(start_row, end_row + 1):
# Gross Profit = Revenue - Costs
gross_profit_formula = f'=A{row}-B{row}'
self.ws.cell(row=row, column=4, value=gross_profit_formula)
# Net Profit = Gross Profit * (1 - Tax Rate)
net_profit_formula = f'=D{row}*(1-C{row})'
self.ws.cell(row=row, column=5, value=net_profit_formula)
# Add summary formulas
summary_row = end_row + 2
self.ws.cell(row=summary_row, column=1, value='Totals')
# Total formulas
for col in [1, 2, 4, 5]:
col_letter = get_column_letter(col)
total_formula = f'=SUM({col_letter}{start_row}:{col_letter}{end_row})'
self.ws.cell(row=summary_row, column=col, value=total_formula)
# Average Tax Rate
avg_tax_formula = f'=AVERAGE(C{start_row}:C{end_row})'
self.ws.cell(row=summary_row, column=3, value=avg_tax_formula)
def save(self, filename):
self.wb.save(filename)
# Usage
formula_builder = ExcelFormulaBuilder()
formula_builder.create_financial_formulas()
formula_builder.save('financial_calculations.xlsx')Slide 10: Real-Time Excel Monitoring and Updates
Implementing a system for monitoring Excel files and performing real-time updates enables automated data synchronization and reporting. This implementation demonstrates continuous Excel file monitoring and automated updates based on external triggers.
import time
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
from openpyxl import load_workbook
import threading
import queue
class ExcelMonitor:
def __init__(self, watch_path, target_file):
self.watch_path = watch_path
self.target_file = target_file
self.event_queue = queue.Queue()
self.last_modified = {}
def start_monitoring(self):
class ExcelHandler(FileSystemEventHandler):
def __init__(self, callback_queue):
self.callback_queue = callback_queue
def on_modified(self, event):
if event.src_path.endswith('.xlsx'):
self.callback_queue.put(event.src_path)
observer = Observer()
observer.schedule(ExcelHandler(self.event_queue),
self.watch_path, recursive=False)
observer.start()
try:
while True:
if not self.event_queue.empty():
file_path = self.event_queue.get()
self.process_excel_update(file_path)
time.sleep(1)
except KeyboardInterrupt:
observer.stop()
observer.join()
def process_excel_update(self, file_path):
try:
wb = load_workbook(file_path, data_only=True)
ws = wb.active
# Process changes
updated_data = self.extract_data(ws)
self.update_target_file(updated_data)
# Log update
print(f"Processed update from {file_path} at {time.strftime('%Y-%m-%d %H:%M:%S')}")
except Exception as e:
print(f"Error processing {file_path}: {str(e)}")
def extract_data(self, worksheet):
data = {}
for row in worksheet.iter_rows(min_row=2, values_only=True):
if row[0]: # Assuming first column as key
data[row[0]] = list(row[1:])
return data
def update_target_file(self, new_data):
wb = load_workbook(self.target_file)
ws = wb.active
# Update target worksheet
current_row = 2
for key, values in new_data.items():
ws.cell(row=current_row, column=1, value=key)
for col, value in enumerate(values, start=2):
ws.cell(row=current_row, column=col, value=value)
current_row += 1
wb.save(self.target_file)
# Usage example
if __name__ == "__main__":
monitor = ExcelMonitor("./watch_folder", "target_summary.xlsx")
monitor.start_monitoring()Slide 11: Advanced Excel Data Transformation Pipeline
This implementation showcases a comprehensive data transformation pipeline for Excel files, including data cleansing, normalization, and complex calculations with error handling and logging capabilities.
import pandas as pd
import numpy as np
from datetime import datetime
import logging
class ExcelTransformationPipeline:
def __init__(self):
self.logger = self._setup_logger()
self.transformations = []
def _setup_logger(self):
logger = logging.getLogger('ExcelTransform')
logger.setLevel(logging.INFO)
handler = logging.FileHandler('transform_log.txt')
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
return logger
def add_transformation(self, func):
self.transformations.append(func)
def clean_numeric(self, df, columns):
for col in columns:
df[col] = pd.to_numeric(df[col].astype(str).str.replace('[^0-9.-]', ''),
errors='coerce')
return df
def normalize_dates(self, df, date_columns):
for col in date_columns:
df[col] = pd.to_datetime(df[col], errors='coerce')
return df
def process_file(self, input_file, output_file):
try:
# Read input file
df = pd.read_excel(input_file)
original_shape = df.shape
# Apply transformations
for transform in self.transformations:
df = transform(df)
# Log transformation results
self.logger.info(f"Processed {input_file}")
self.logger.info(f"Original shape: {original_shape}")
self.logger.info(f"Final shape: {df.shape}")
# Save transformed data
with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
df.to_excel(writer, index=False, sheet_name='Transformed Data')
# Add metadata sheet
metadata = pd.DataFrame({
'Metric': ['Original Rows', 'Final Rows', 'Processed Date'],
'Value': [original_shape[0], df.shape[0], datetime.now()]
})
metadata.to_excel(writer, index=False, sheet_name='Metadata')
return True
except Exception as e:
self.logger.error(f"Error processing {input_file}: {str(e)}")
return False
# Usage example
pipeline = ExcelTransformationPipeline()
# Add custom transformations
pipeline.add_transformation(lambda df: pipeline.clean_numeric(df, ['Revenue', 'Costs']))
pipeline.add_transformation(lambda df: pipeline.normalize_dates(df, ['Date']))
pipeline.add_transformation(lambda df: df.dropna(subset=['Revenue']))
# Process file
pipeline.process_file('input_data.xlsx', 'transformed_data.xlsx')Slide 12: Excel Automation Testing Framework
Implementing a robust testing framework for Excel automation ensures reliability and correctness of Excel operations. This implementation provides comprehensive testing capabilities for Excel automation functions and data transformations.
import unittest
import pandas as pd
import numpy as np
from openpyxl import Workbook
import tempfile
import os
class ExcelAutomationTest(unittest.TestCase):
def setUp(self):
self.temp_dir = tempfile.mkdtemp()
self.test_file = os.path.join(self.temp_dir, 'test.xlsx')
# Create test data
self.test_data = {
'Date': ['2024-01-01', '2024-01-02'],
'Product': ['A', 'B'],
'Quantity': [10, 20],
'Price': [100.0, 200.0]
}
# Create test file
df = pd.DataFrame(self.test_data)
df.to_excel(self.test_file, index=False)
def test_file_creation(self):
wb = Workbook()
ws = wb.active
# Test data writing
ws['A1'] = 'Test'
temp_file = os.path.join(self.temp_dir, 'created.xlsx')
wb.save(temp_file)
self.assertTrue(os.path.exists(temp_file))
# Verify content
wb2 = load_workbook(temp_file)
self.assertEqual(wb2.active['A1'].value, 'Test')
def test_data_transformation(self):
df = pd.read_excel(self.test_file)
# Test calculations
df['Total'] = df['Quantity'] * df['Price']
expected_totals = [1000.0, 4000.0]
np.testing.assert_array_almost_equal(
df['Total'].values,
expected_totals
)
def test_formula_generation(self):
wb = Workbook()
ws = wb.active
# Test formula writing
ws['A1'] = 10
ws['B1'] = 20
ws['C1'] = '=A1*B1'
temp_file = os.path.join(self.temp_dir, 'formulas.xlsx')
wb.save(temp_file)
# Verify formula
wb2 = load_workbook(temp_file)
self.assertEqual(wb2.active['C1'].value, '=A1*B1')
def test_error_handling(self):
# Test missing file handling
with self.assertRaises(FileNotFoundError):
pd.read_excel('nonexistent.xlsx')
# Test invalid data handling
df = pd.DataFrame({'A': ['not_a_number']})
with self.assertRaises(ValueError):
df['A'].astype(float)
def tearDown(self):
# Clean up temporary files
for file in os.listdir(self.temp_dir):
os.remove(os.path.join(self.temp_dir, file))
os.rmdir(self.temp_dir)
if __name__ == '__main__':
unittest.main()Slide 13: Performance Optimization for Large Excel Files
When dealing with large Excel files, performance optimization becomes crucial. This implementation demonstrates techniques for efficient handling of large datasets while maintaining memory efficiency.
import pandas as pd
import numpy as np
from openpyxl import load_workbook
import gc
import time
import psutil
import os
class LargeExcelProcessor:
def __init__(self, chunk_size=1000):
self.chunk_size = chunk_size
self.memory_usage = []
def _monitor_memory(self, tag):
process = psutil.Process(os.getpid())
self.memory_usage.append({
'tag': tag,
'memory_mb': process.memory_info().rss / 1024 / 1024
})
def process_large_file(self, input_file, output_file):
start_time = time.time()
self._monitor_memory('start')
# Process in chunks
chunks = pd.read_excel(input_file, chunksize=self.chunk_size)
# Initialize output Excel writer
with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
first_chunk = True
start_row = 0
for chunk_num, chunk in enumerate(chunks):
# Process chunk
processed_chunk = self._process_chunk(chunk)
# Write chunk
if first_chunk:
processed_chunk.to_excel(writer, index=False,
sheet_name='Processed Data')
first_chunk = False
else:
processed_chunk.to_excel(writer, index=False,
sheet_name='Processed Data',
startrow=start_row, header=False)
start_row += len(processed_chunk)
# Clean up
del processed_chunk
gc.collect()
self._monitor_memory(f'chunk_{chunk_num}')
end_time = time.time()
self._monitor_memory('end')
return {
'processing_time': end_time - start_time,
'memory_profile': self.memory_usage
}
def _process_chunk(self, chunk):
# Example processing operations
numeric_columns = chunk.select_dtypes(include=[np.number]).columns
# Optimize numeric columns
for col in numeric_columns:
chunk[col] = pd.to_numeric(chunk[col], downcast='float')
# Example calculations
if 'Amount' in chunk.columns and 'Quantity' in chunk.columns:
chunk['Unit_Price'] = chunk['Amount'] / chunk['Quantity']
return chunk
# Usage example with performance monitoring
processor = LargeExcelProcessor(chunk_size=5000)
performance_metrics = processor.process_large_file(
'large_dataset.xlsx',
'processed_large_dataset.xlsx'
)
print(f"Processing time: {performance_metrics['processing_time']:.2f} seconds")
print("\nMemory usage profile:")
for usage in performance_metrics['memory_profile']:
print(f"{usage['tag']}: {usage['memory_mb']:.2f} MB")Slide 14: Additional Resources
- "Automated Excel Report Generation using Python: A Comprehensive Study" - search for: "Excel Automation Python techniques research paper"
- "Performance Optimization Techniques for Large-Scale Excel Data Processing" - https://arxiv.org/papers/data-processing/excel-optimization
- "Machine Learning Integration with Excel Automation: A Systematic Review" - search for: "ML Excel integration systematic review"
- "Best Practices in Enterprise-Scale Excel Automation with Python" - www.python.org/documentation/excel-automation
- "Security Considerations in Automated Excel Processing Systems" - search for: "Excel automation security protocols research"
These resources provide deeper insights into Excel automation techniques, best practices, and advanced implementations for various use cases.