Data Analysis Agent

Build an autonomous data analyst that explores datasets, creates visualizations, and discovers insights using AISuite's agentic loops with max_turns.

Autonomous Data Exploration

AI agent that independently analyzes data and generates insights

Key Features

  • Autonomous Exploration: Uses max_turns so the agent can analyze the data over multiple steps (core call pattern sketched below)
  • Auto-Visualization: Creates relevant charts and plots automatically
  • Statistical Analysis: Runs descriptive statistics, correlation checks, and IQR-based outlier detection
  • Insight Generation: Identifies patterns and anomalies
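
All of these features rely on the same AISuite call pattern: plain Python callables are passed as tools, and max_turns lets the model chain tool calls before producing a final answer. A minimal sketch of that pattern (the model ID and the tool are illustrative; the full agent follows below):

import aisuite as ai
import pandas as pd

client = ai.Client()

def count_rows(filepath: str) -> str:
    """Tool: report how many rows a CSV file has"""
    return f"{len(pd.read_csv(filepath))} rows"

# Callables go straight into tools=; max_turns bounds the agentic loop
response = client.chat.completions.create(
    model="openai:gpt-4o",
    messages=[{"role": "user", "content": "How many rows are in sales_data.csv?"}],
    tools=[count_rows],
    max_turns=3
)
print(response.choices[0].message.content)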

Implementation

data_analyst.py
import aisuite as ai
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

class DataAnalysisAgent:
    """Autonomous data analyst using max_turns"""
    
    def __init__(self):
        self.client = ai.Client()
        self.model = "openai:gpt-4"
        self.current_df = None
        
    def load_data(self, filepath: str) -> str:
        """Load dataset for analysis"""
        self.current_df = pd.read_csv(filepath)
        return f"Loaded dataset with {len(self.current_df)} rows and {len(self.current_df.columns)} columns"
    
    def describe_data(self) -> str:
        """Get statistical summary of the data"""
        if self.current_df is None:
            return "No data loaded"
        return self.current_df.describe().to_string()
    
    def check_missing_values(self) -> str:
        """Check for missing values in dataset"""
        if self.current_df is None:
            return "No data loaded"
        missing = self.current_df.isnull().sum()
        missing = missing[missing > 0]
        if missing.empty:
            return "No missing values found"
        return f"Missing values:\n{missing.to_string()}"
    
    def create_visualization(self, plot_type: str, x_col: str = None, y_col: str = None) -> str:
        """Create a histogram, scatter plot, or correlation heatmap"""
        if self.current_df is None:
            return "No data loaded"
        if plot_type not in ("histogram", "scatter", "correlation"):
            return f"Unknown plot type: {plot_type}"
        if plot_type in ("histogram", "scatter") and x_col is None:
            return f"x_col is required for a {plot_type} plot"
        if plot_type == "scatter" and y_col is None:
            return "y_col is required for a scatter plot"
        
        plt.figure(figsize=(10, 6))
        
        if plot_type == "histogram":
            self.current_df[x_col].hist(bins=30)
            plt.xlabel(x_col)
            plt.ylabel("Frequency")
            plt.title(f"Distribution of {x_col}")
        elif plot_type == "scatter":
            plt.scatter(self.current_df[x_col], self.current_df[y_col])
            plt.xlabel(x_col)
            plt.ylabel(y_col)
            plt.title(f"{x_col} vs {y_col}")
        elif plot_type == "correlation":
            # Heatmap over numeric columns only
            corr = self.current_df.select_dtypes(include=['number']).corr()
            sns.heatmap(corr, annot=True, cmap='coolwarm')
            plt.title("Correlation Matrix")
        
        plt.savefig(f"{plot_type}_plot.png")
        plt.close()
        return f"Created {plot_type} plot and saved as {plot_type}_plot.png"
    
    def find_insights(self) -> str:
        """Analyze data for key insights"""
        if self.current_df is None:
            return "No data loaded"
        
        insights = []
        
        # Check correlations
        numeric_cols = self.current_df.select_dtypes(include=['number'])
        if len(numeric_cols.columns) > 1:
            corr = numeric_cols.corr()
            high_corr = []
            for i in range(len(corr.columns)):
                for j in range(i+1, len(corr.columns)):
                    if abs(corr.iloc[i, j]) > 0.7:
                        high_corr.append(f"{corr.columns[i]} and {corr.columns[j]}: {corr.iloc[i, j]:.2f}")
            if high_corr:
                insights.append(f"High correlations found: {', '.join(high_corr)}")
        
        # Check outliers
        for col in numeric_cols.columns:
            Q1 = numeric_cols[col].quantile(0.25)
            Q3 = numeric_cols[col].quantile(0.75)
            IQR = Q3 - Q1
            outliers = ((numeric_cols[col] < Q1 - 1.5 * IQR) | (numeric_cols[col] > Q3 + 1.5 * IQR)).sum()
            if outliers > 0:
                insights.append(f"{col} has {outliers} outliers")
        
        return "Key insights:\n" + "\n".join(insights) if insights else "No significant insights found"
    
    def analyze_dataset(self, filepath: str, analysis_goal: str) -> str:
        """Main analysis function using max_turns for autonomous exploration"""
        
        # Register the agent's methods as tools the model can call
        tools = [
            self.load_data,
            self.describe_data,
            self.check_missing_values,
            self.create_visualization,
            self.find_insights
        ]
        
        # Initial prompt with analysis goal
        messages = [
            {
                "role": "system",
                "content": """You are a data analyst. Use the available functions to:
                1. Load and explore the dataset
                2. Check data quality (missing values, outliers)
                3. Create relevant visualizations
                4. Find and report insights
                5. Provide recommendations based on the analysis goal"""
            },
            {
                "role": "user",
                "content": f"Analyze the dataset at {filepath}. Goal: {analysis_goal}"
            }
        ]
        
        # Let the agent explore autonomously
        response = self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            tools=tools,
            max_turns=10  # Allow up to 10 rounds of tool calls for a thorough analysis
        )
        
        return response.choices[0].message.content

# Example usage
analyst = DataAnalysisAgent()

# Analyze a sales dataset
result = analyst.analyze_dataset(
    filepath="sales_data.csv",
    analysis_goal="Identify factors driving sales and seasonal patterns"
)

print("📊 ANALYSIS REPORT:")
print(result)

Analysis Workflow

1. Load Dataset → Check shape, columns, data types
2. Data Quality → Missing values, duplicates, outliers (duplicate check sketched after this list)
3. Statistical Summary → Mean, median, std, quartiles
4. Visualizations → Distributions, correlations, trends
5. Pattern Detection → Clusters, anomalies, relationships
6. Insight Report → Key findings and recommendations
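
The agent above covers most of this workflow; the duplicate check in step 2 is not implemented. A minimal sketch of such a tool, written as an extra method for DataAnalysisAgent (the method name is illustrative; register self.check_duplicates in the tools list inside analyze_dataset):

    def check_duplicates(self) -> str:
        """Report how many fully duplicated rows the dataset contains"""
        if self.current_df is None:
            return "No data loaded"
        dup_count = int(self.current_df.duplicated().sum())
        if dup_count == 0:
            return "No duplicate rows found"
        return f"Found {dup_count} duplicate rows out of {len(self.current_df)} total rows"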

Use Cases

💰 Sales Analysis

Identify trends, seasonality, and top-performing products

👥 Customer Segmentation

Discover customer groups and behavior patterns

🏭 Quality Control

Detect anomalies and quality issues in production data
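
The same agent handles each of these use cases; only the dataset and the analysis_goal change. Example calls, with placeholder file names:

# Customer segmentation (placeholder file name)
segments = analyst.analyze_dataset(
    filepath="customers.csv",
    analysis_goal="Identify distinct customer groups and describe their behavior"
)

# Quality control (placeholder file name)
quality = analyst.analyze_dataset(
    filepath="production_metrics.csv",
    analysis_goal="Detect anomalies and quality issues in production measurements"
)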

✨ Powered by AISuite

  • Agentic Loops: max_turns enables autonomous multi-step analysis
  • Function Orchestration: Agent decides which analysis to perform
  • Natural Language: Describe goals in plain English
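
Because the agent only references a provider:model string, switching providers is a one-line change. The model IDs below are examples and assume the matching API keys are configured:

analyst = DataAnalysisAgent()

# Any provider:model pair supported by AISuite works here (IDs are illustrative)
analyst.model = "anthropic:claude-3-5-sonnet-20241022"
# analyst.model = "groq:llama-3.1-70b-versatile"

result = analyst.analyze_dataset(
    filepath="sales_data.csv",
    analysis_goal="Identify factors driving sales and seasonal patterns"
)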

Extend It

🤖 Add ML Models

Integrate scikit-learn for predictive modeling and clustering
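
One way to do this, sketched below assuming scikit-learn is installed: add a clustering method to DataAnalysisAgent and register self.cluster_rows as another tool (the method name and parameters are illustrative):

# Extra imports for data_analyst.py
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Extra method for DataAnalysisAgent; add self.cluster_rows to the tools list
def cluster_rows(self, n_clusters: int = 3) -> str:
    """Cluster rows on numeric columns and report cluster sizes"""
    if self.current_df is None:
        return "No data loaded"
    numeric = self.current_df.select_dtypes(include=['number']).dropna()
    if numeric.shape[1] < 2:
        return "Not enough numeric columns to cluster"
    if len(numeric) < n_clusters:
        return "Not enough rows to cluster"
    scaled = StandardScaler().fit_transform(numeric)
    labels = KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit_predict(scaled)
    sizes = pd.Series(labels).value_counts().sort_index()
    return f"Cluster sizes for k={n_clusters}:\n{sizes.to_string()}"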

📈 Real-time Dashboards

Connect to Streamlit or Dash for interactive visualizations
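
A minimal Streamlit wrapper around the agent, assuming streamlit is installed (save as app.py and run `streamlit run app.py`; the file name is arbitrary):

# app.py -- minimal interactive front end (sketch)
import streamlit as st
from data_analyst import DataAnalysisAgent

st.title("Data Analysis Agent")

filepath = st.text_input("CSV path", "sales_data.csv")
goal = st.text_area("Analysis goal", "Identify factors driving sales and seasonal patterns")

if st.button("Run analysis"):
    analyst = DataAnalysisAgent()
    with st.spinner("Agent is exploring the data..."):
        report = analyst.analyze_dataset(filepath, goal)
    st.markdown(report)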