Data Analysis Agent
Build an autonomous data analyst that explores datasets, creates visualizations, and discovers insights using AISuite's agentic loops with max_turns.
Autonomous Data Exploration
AI agent that independently analyzes data and generates insights
Key Features
- Autonomous Exploration: Uses max_turns to thoroughly analyze data
- Auto-Visualization: Creates relevant charts and plots automatically
- Statistical Analysis: Performs EDA with summary statistics, correlation analysis, and IQR-based outlier detection
- Insight Generation: Identifies patterns and anomalies
Implementation
data_analyst.py
import aisuite as ai
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Any
class DataAnalysisAgent:
    """Autonomous data analyst built on AISuite's ``max_turns`` tool loop.

    Every public method except ``analyze_dataset`` is offered to the model as
    a tool. Tool methods report problems they can detect up front (no dataset
    loaded, unknown column, unsupported plot type) as plain strings instead of
    raising, so the model can read the message and adjust its next call.

    NOTE(review): AISuite appears to derive each tool's schema from the
    method's signature and docstring -- confirm before renaming parameters.
    """

    # Absolute correlation above which a pair of columns is reported.
    CORRELATION_THRESHOLD = 0.7
    # Multiplier applied to the IQR when placing the outlier fences.
    IQR_MULTIPLIER = 1.5
    # Plot types understood by create_visualization().
    SUPPORTED_PLOTS = ("histogram", "scatter", "correlation")

    def __init__(self, model: str = "openai:gpt-4"):
        """Create the AISuite client and start with no dataset loaded.

        Args:
            model: AISuite model identifier in ``provider:model`` form.
        """
        self.client = ai.Client()
        self.model = model
        # Most recently loaded dataset; stays None until load_data() succeeds.
        self.current_df = None

    def load_data(self, filepath: str) -> str:
        """Load dataset for analysis.

        Args:
            filepath: Location of the CSV file to read.

        Returns:
            A one-line summary with the row and column counts.

        Raises:
            Whatever ``pandas.read_csv`` raises for a missing or malformed
            file; the previously loaded dataset is kept in that case.
        """
        self.current_df = pd.read_csv(filepath)
        row_count, col_count = self.current_df.shape
        return f"Loaded dataset with {row_count} rows and {col_count} columns"

    def describe_data(self) -> str:
        """Get statistical summary of the data.

        Returns:
            The ``DataFrame.describe()`` table rendered as text, or
            "No data loaded" when no dataset is available.
        """
        if self.current_df is None:
            return "No data loaded"
        return self.current_df.describe().to_string()

    def check_missing_values(self) -> str:
        """Check for missing values in dataset.

        Returns:
            One line per column that has at least one missing value, a short
            notice when nothing is missing, or "No data loaded".
        """
        if self.current_df is None:
            return "No data loaded"
        null_counts = self.current_df.isnull().sum()
        affected = null_counts[null_counts > 0]
        if affected.empty:
            # An empty Series would otherwise render as "Series([], )".
            return "No missing values found"
        return f"Missing values:\n{affected.to_string()}"

    def create_visualization(self, plot_type: str, x_col: str, y_col: str = None) -> str:
        """Create various types of plots.

        The image is written to ``<plot_type>_plot.png`` in the working
        directory, so a later plot of the same type overwrites an earlier one.

        Args:
            plot_type: One of "histogram", "scatter" or "correlation".
            x_col: Column plotted on the x axis (ignored for "correlation").
            y_col: Column plotted on the y axis; required for "scatter".

        Returns:
            A confirmation naming the saved file, or a message explaining why
            the plot could not be created.
        """
        frame = self.current_df
        if frame is None:
            return "No data loaded"

        # Validate everything before opening a figure so a bad request never
        # leaves an empty image on disk or an unclosed figure in memory.
        if plot_type not in self.SUPPORTED_PLOTS:
            supported = ", ".join(self.SUPPORTED_PLOTS)
            return f"Unknown plot type '{plot_type}'. Supported types: {supported}"

        numeric = None
        if plot_type == "correlation":
            numeric = frame.select_dtypes(include=["number"])
            if numeric.shape[1] == 0:
                return "No numeric columns available for a correlation matrix"
        else:
            if x_col not in frame.columns:
                return f"Column '{x_col}' not found in dataset"
            if plot_type == "scatter":
                if y_col is None:
                    return "Scatter plots require y_col"
                if y_col not in frame.columns:
                    return f"Column '{y_col}' not found in dataset"

        output_path = f"{plot_type}_plot.png"
        plt.figure(figsize=(10, 6))
        try:
            if plot_type == "histogram":
                frame[x_col].hist(bins=30)
                plt.xlabel(x_col)
                plt.ylabel("Frequency")
                plt.title(f"Distribution of {x_col}")
            elif plot_type == "scatter":
                plt.scatter(frame[x_col], frame[y_col])
                plt.xlabel(x_col)
                plt.ylabel(y_col)
                plt.title(f"{x_col} vs {y_col}")
            else:  # "correlation"
                sns.heatmap(numeric.corr(), annot=True, cmap="coolwarm")
                plt.title("Correlation Matrix")
            plt.savefig(output_path)
        finally:
            # Release the figure even when plotting or saving fails.
            plt.close()
        return f"Created {plot_type} plot and saved as {output_path}"

    def find_insights(self) -> str:
        """Analyze data for key insights.

        Reports pairs of numeric columns whose absolute correlation exceeds
        ``CORRELATION_THRESHOLD`` and, per numeric column, how many values lie
        more than ``IQR_MULTIPLIER`` IQRs outside the quartiles.

        Returns:
            A "Key insights:" report with one finding per line,
            "No significant insights found", or "No data loaded".
        """
        if self.current_df is None:
            return "No data loaded"

        insights = []
        numeric = self.current_df.select_dtypes(include=["number"])
        names = list(numeric.columns)

        # Strong pairwise correlations; only the upper triangle is scanned so
        # each pair is reported once and the diagonal is skipped.
        if len(names) > 1:
            corr = numeric.corr()
            strong_pairs = []
            for i, left in enumerate(names):
                for j in range(i + 1, len(names)):
                    coefficient = corr.iloc[i, j]
                    # A NaN coefficient compares False and is skipped.
                    if abs(coefficient) > self.CORRELATION_THRESHOLD:
                        strong_pairs.append(f"{left} and {names[j]}: {coefficient:.2f}")
            if strong_pairs:
                insights.append(f"High correlations found: {', '.join(strong_pairs)}")

        # IQR-based outlier counts per numeric column.
        for name in names:
            values = numeric[name]
            q1 = values.quantile(0.25)
            q3 = values.quantile(0.75)
            spread = q3 - q1
            lower_fence = q1 - self.IQR_MULTIPLIER * spread
            upper_fence = q3 + self.IQR_MULTIPLIER * spread
            outlier_count = ((values < lower_fence) | (values > upper_fence)).sum()
            if outlier_count > 0:
                insights.append(f"{name} has {outlier_count} outliers")

        if not insights:
            return "No significant insights found"
        return "Key insights:\n" + "\n".join(insights)

    def analyze_dataset(self, filepath: str, analysis_goal: str, max_turns: int = 10) -> str:
        """Main analysis function using max_turns for autonomous exploration.

        Args:
            filepath: Path of the CSV file the model is asked to analyze.
            analysis_goal: Plain-language description of what to find out.
            max_turns: Upper bound on the tool-calling turns the model may take.

        Returns:
            The content of the model's final message.
        """
        # Methods exposed to the model as callable tools.
        tools = [
            self.load_data,
            self.describe_data,
            self.check_missing_values,
            self.create_visualization,
            self.find_insights,
        ]

        system_prompt = (
            "You are a data analyst. Use the available functions to:\n"
            "1. Load and explore the dataset\n"
            "2. Check data quality (missing values, outliers)\n"
            "3. Create relevant visualizations\n"
            "4. Find and report insights\n"
            "5. Provide recommendations based on the analysis goal"
        )
        messages = [
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": f"Analyze the dataset at {filepath}. Goal: {analysis_goal}",
            },
        ]

        # AISuite's automatic tool execution takes the callables under
        # ``tools``; together with ``max_turns`` it runs the call/response
        # loop until the model produces a final answer.
        response = self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            tools=tools,
            max_turns=max_turns,
        )
        return response.choices[0].message.content
# Example usage: run one autonomous analysis over a sales dataset and print
# the report the agent returns.
analyst = DataAnalysisAgent()
result = analyst.analyze_dataset(
    "sales_data.csv",
    "Identify factors driving sales and seasonal patterns",
)
print("📊 ANALYSIS REPORT:", result, sep="\n")
Analysis Workflow
1. Load Dataset → Check shape, columns, data types
2. Data Quality → Missing values, duplicates, outliers
3. Statistical Summary → Mean, median, std, quartiles
4. Visualizations → Distributions, correlations, trends
5. Pattern Detection → Clusters, anomalies, relationships
6. Insight Report → Key findings and recommendations
Use Cases
💰 Sales Analysis
Identify trends, seasonality, and top-performing products
👥 Customer Segmentation
Discover customer groups and behavior patterns
🏭 Quality Control
Detect anomalies and quality issues in production data
✨ Powered by AISuite
- Agentic Loops: max_turns enables autonomous multi-step analysis
- Function Orchestration: Agent decides which analysis to perform
- Natural Language: Describe goals in plain English
Extend It
🤖 Add ML Models
Integrate scikit-learn for predictive modeling and clustering
📈 Real-time Dashboards
Connect to Streamlit or Dash for interactive visualizations