Data Analysis Agent

Build an autonomous data analyst that explores datasets, creates visualizations, and discovers insights using AISuite's agentic loops with max_turns.

Autonomous Data Exploration

AI agent that independently analyzes data and generates insights

Key Features

  • Autonomous Exploration: Uses max_turns so the agent can analyze the data over multiple steps (core call pattern sketched below)
  • Auto-Visualization: Creates relevant charts and plots automatically
  • Statistical Analysis: Runs descriptive statistics, correlation checks, and IQR-based outlier detection
  • Insight Generation: Identifies patterns and anomalies
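
All of these features rely on the same AISuite call pattern: plain Python callables are passed as tools, and max_turns lets the model chain tool calls before producing a final answer. A minimal sketch of that pattern (the model ID and the tool are illustrative; the full agent follows below):

import aisuite as ai
import pandas as pd

client = ai.Client()

def count_rows(filepath: str) -> str:
    """Tool: report how many rows a CSV file has"""
    return f"{len(pd.read_csv(filepath))} rows"

# Callables go straight into tools=; max_turns bounds the agentic loop
response = client.chat.completions.create(
    model="openai:gpt-4o",
    messages=[{"role": "user", "content": "How many rows are in sales_data.csv?"}],
    tools=[count_rows],
    max_turns=3
)
print(response.choices[0].message.content)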

Implementation

data_analyst.py
import aisuite as ai
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

class DataAnalysisAgent:
    """Autonomous data analyst using max_turns"""
    
    def __init__(self):
        self.client = ai.Client()
        self.model = "openai:gpt-4"
        self.current_df = None
        
    def load_data(self, filepath: str) -> str:
        """Load dataset for analysis"""
        self.current_df = pd.read_csv(filepath)
        return f"Loaded dataset with {len(self.current_df)} rows and {len(self.current_df.columns)} columns"
    
    def describe_data(self) -> str:
        """Get statistical summary of the data"""
        if self.current_df is None:
            return "No data loaded"
        return self.current_df.describe().to_string()
    
    def check_missing_values(self) -> str:
        """Check for missing values in dataset"""
        if self.current_df is None:
            return "No data loaded"
        missing = self.current_df.isnull().sum()
        missing = missing[missing > 0]
        if missing.empty:
            return "No missing values found"
        return f"Missing values:\n{missing.to_string()}"
    
    def create_visualization(self, plot_type: str, x_col: str = None, y_col: str = None) -> str:
        """Create a histogram, scatter plot, or correlation heatmap"""
        if self.current_df is None:
            return "No data loaded"
        if plot_type not in ("histogram", "scatter", "correlation"):
            return f"Unknown plot type: {plot_type}"
        if plot_type in ("histogram", "scatter") and x_col is None:
            return f"x_col is required for a {plot_type} plot"
        if plot_type == "scatter" and y_col is None:
            return "y_col is required for a scatter plot"
        
        plt.figure(figsize=(10, 6))
        
        if plot_type == "histogram":
            self.current_df[x_col].hist(bins=30)
            plt.xlabel(x_col)
            plt.ylabel("Frequency")
            plt.title(f"Distribution of {x_col}")
        elif plot_type == "scatter":
            plt.scatter(self.current_df[x_col], self.current_df[y_col])
            plt.xlabel(x_col)
            plt.ylabel(y_col)
            plt.title(f"{x_col} vs {y_col}")
        elif plot_type == "correlation":
            # Heatmap over numeric columns only
            corr = self.current_df.select_dtypes(include=['number']).corr()
            sns.heatmap(corr, annot=True, cmap='coolwarm')
            plt.title("Correlation Matrix")
        
        plt.savefig(f"{plot_type}_plot.png")
        plt.close()
        return f"Created {plot_type} plot and saved as {plot_type}_plot.png"
    
    def find_insights(self) -> str:
        """Analyze data for key insights"""
        if self.current_df is None:
            return "No data loaded"
        
        insights = []
        
        # Check correlations
        numeric_cols = self.current_df.select_dtypes(include=['number'])
        if len(numeric_cols.columns) > 1:
            corr = numeric_cols.corr()
            high_corr = []
            for i in range(len(corr.columns)):
                for j in range(i+1, len(corr.columns)):
                    if abs(corr.iloc[i, j]) > 0.7:
                        high_corr.append(f"{corr.columns[i]} and {corr.columns[j]}: {corr.iloc[i, j]:.2f}")
            if high_corr:
                insights.append(f"High correlations found: {', '.join(high_corr)}")
        
        # Check outliers
        for col in numeric_cols.columns:
            Q1 = numeric_cols[col].quantile(0.25)
            Q3 = numeric_cols[col].quantile(0.75)
            IQR = Q3 - Q1
            outliers = ((numeric_cols[col] < Q1 - 1.5 * IQR) | (numeric_cols[col] > Q3 + 1.5 * IQR)).sum()
            if outliers > 0:
                insights.append(f"{col} has {outliers} outliers")
        
        return "Key insights:\n" + "\n".join(insights) if insights else "No significant insights found"
    
    def analyze_dataset(self, filepath: str, analysis_goal: str) -> str:
        """Main analysis function using max_turns for autonomous exploration"""
        
        # Register the agent's methods as tools the model can call
        tools = [
            self.load_data,
            self.describe_data,
            self.check_missing_values,
            self.create_visualization,
            self.find_insights
        ]
        
        # Initial prompt with analysis goal
        messages = [
            {
                "role": "system",
                "content": """You are a data analyst. Use the available functions to:
                1. Load and explore the dataset
                2. Check data quality (missing values, outliers)
                3. Create relevant visualizations
                4. Find and report insights
                5. Provide recommendations based on the analysis goal"""
            },
            {
                "role": "user",
                "content": f"Analyze the dataset at {filepath}. Goal: {analysis_goal}"
            }
        ]
        
        # Let the agent explore autonomously
        response = self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            tools=tools,
            max_turns=10  # Allow up to 10 rounds of tool calls for a thorough analysis
        )
        
        return response.choices[0].message.content

# Example usage
analyst = DataAnalysisAgent()

# Analyze a sales dataset
result = analyst.analyze_dataset(
    filepath="sales_data.csv",
    analysis_goal="Identify factors driving sales and seasonal patterns"
)

print("📊 ANALYSIS REPORT:")
print(result)

Analysis Workflow

1. Load Dataset → Check shape, columns, data types
2. Data Quality → Missing values, duplicates, outliers (duplicate check sketched after this list)
3. Statistical Summary → Mean, median, std, quartiles
4. Visualizations → Distributions, correlations, trends
5. Pattern Detection → Clusters, anomalies, relationships
6. Insight Report → Key findings and recommendations
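
The agent above covers most of this workflow; the duplicate check in step 2 is not implemented. A minimal sketch of such a tool, written as an extra method for DataAnalysisAgent (the method name is illustrative; register self.check_duplicates in the tools list inside analyze_dataset):

    def check_duplicates(self) -> str:
        """Report how many fully duplicated rows the dataset contains"""
        if self.current_df is None:
            return "No data loaded"
        dup_count = int(self.current_df.duplicated().sum())
        if dup_count == 0:
            return "No duplicate rows found"
        return f"Found {dup_count} duplicate rows out of {len(self.current_df)} total rows"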

Use Cases

💰 Sales Analysis

Identify trends, seasonality, and top-performing products

👥 Customer Segmentation

Discover customer groups and behavior patterns

🏭 Quality Control

Detect anomalies and quality issues in production data
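
The same agent handles each of these use cases; only the dataset and the analysis_goal change. Example calls, with placeholder file names:

# Customer segmentation (placeholder file name)
segments = analyst.analyze_dataset(
    filepath="customers.csv",
    analysis_goal="Identify distinct customer groups and describe their behavior"
)

# Quality control (placeholder file name)
quality = analyst.analyze_dataset(
    filepath="production_metrics.csv",
    analysis_goal="Detect anomalies and quality issues in production measurements"
)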

✨ Powered by AISuite

  • Agentic Loops: max_turns enables autonomous multi-step analysis
  • Function Orchestration: Agent decides which analysis to perform
  • Natural Language: Describe goals in plain English
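
Because the agent only references a provider:model string, switching providers is a one-line change. The model IDs below are examples and assume the matching API keys are configured:

analyst = DataAnalysisAgent()

# Any provider:model pair supported by AISuite works here (IDs are illustrative)
analyst.model = "anthropic:claude-3-5-sonnet-20241022"
# analyst.model = "groq:llama-3.1-70b-versatile"

result = analyst.analyze_dataset(
    filepath="sales_data.csv",
    analysis_goal="Identify factors driving sales and seasonal patterns"
)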

Extend It

🤖 Add ML Models

Integrate scikit-learn for predictive modeling and clustering
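
One way to do this, sketched below assuming scikit-learn is installed: add a clustering method to DataAnalysisAgent and register self.cluster_rows as another tool (the method name and parameters are illustrative):

# Extra imports for data_analyst.py
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Extra method for DataAnalysisAgent; add self.cluster_rows to the tools list
def cluster_rows(self, n_clusters: int = 3) -> str:
    """Cluster rows on numeric columns and report cluster sizes"""
    if self.current_df is None:
        return "No data loaded"
    numeric = self.current_df.select_dtypes(include=['number']).dropna()
    if numeric.shape[1] < 2:
        return "Not enough numeric columns to cluster"
    if len(numeric) < n_clusters:
        return "Not enough rows to cluster"
    scaled = StandardScaler().fit_transform(numeric)
    labels = KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit_predict(scaled)
    sizes = pd.Series(labels).value_counts().sort_index()
    return f"Cluster sizes for k={n_clusters}:\n{sizes.to_string()}"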

📈 Real-time Dashboards

Connect to Streamlit or Dash for interactive visualizations
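
A minimal Streamlit wrapper around the agent, assuming streamlit is installed (save as app.py and run `streamlit run app.py`; the file name is arbitrary):

# app.py -- minimal interactive front end (sketch)
import streamlit as st
from data_analyst import DataAnalysisAgent

st.title("Data Analysis Agent")

filepath = st.text_input("CSV path", "sales_data.csv")
goal = st.text_area("Analysis goal", "Identify factors driving sales and seasonal patterns")

if st.button("Run analysis"):
    analyst = DataAnalysisAgent()
    with st.spinner("Agent is exploring the data..."):
        report = analyst.analyze_dataset(filepath, goal)
    st.markdown(report)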