# activity_categorizer.py
"""
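Activity Categorizer - Categorizes activities based on window titles and applications.

Modified to categorize activities per user requirements:
- Non-work: YouTube, entertainment, social media, idle/lock screens, untitled windows
- Browser: Searches, all email (Gmail and other mail), and general browsing
- Productive: All coding tools, AI assistants, development resources
- Server: Cloud providers, remote access, monitoring, CI/CD, container platforms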
"""
import re
from typing import Dict, List, Optional, Tuple

class ActivityCategorizer:
    """Categorize window activity as non-work, browser, productive, or server.

    Each category is described by a list of regular-expression strings that
    are searched case-insensitively.  The lists are plain instance attributes
    (``non_work_patterns``, ``browser_patterns``, ``productive_patterns``,
    ``server_patterns``) so they can be inspected or extended per instance.

    Categories are tested in a fixed priority order: non-work, browser,
    productive, server, then a handful of low-confidence fallbacks.
    """

    # Substrings of the application name that mark a window as a web browser
    # when a productive pattern matched (forces the allow-list check below).
    _BROWSER_APP_HINTS: Tuple[str, ...] = ("chrome", "firefox", "edge", "safari")

    # Substrings that mark an otherwise unclassified window as a browser.
    _FALLBACK_BROWSER_HINTS: Tuple[str, ...] = (
        "chrome", "firefox", "edge", "safari", "opera", "brave",
    )

    # Substrings that keep a browser window in the productive category.
    _BROWSER_PRODUCTIVE_HINTS: Tuple[str, ...] = (
        "stackoverflow", "stack overflow", "github", "claude", "chatgpt",
        "localhost", "docs.", "api.", "documentation", "tutorial", "cpanel",
        "phpmyadmin",
    )

    # Fallback terms for common productivity apps (mail is handled earlier).
    _PRODUCTIVITY_TERMS: Tuple[str, ...] = ("calendar", "meeting", "document")

    # Ordered (regex, subcategory) rules per category; the first hit wins.
    _SUBCATEGORY_RULES: Dict[str, Tuple[Tuple[str, str], ...]] = {
        "productive": (
            (r"(cursor|vscode|visual studio|intellij|pycharm)", "coding"),
            (r"(claude|chatgpt|bard|copilot)", "ai-assistance"),
            (r"(stack ?overflow|github\.com)", "development-research"),
            (r"(filezilla|winscp|putty|ssh|cpanel|phpmyadmin)", "server-management"),
            (r"(git|github desktop|sourcetree)", "version-control"),
            (r"(mysql|postgres|mongodb|redis)", "database"),
            (r"(figma|photoshop|sketch)", "design"),
            (r"(notion|confluence|jira)", "documentation"),
        ),
        "browser": (
            (r"mail|email|gmail|outlook|yahoo mail|first economy mail", "email"),
            (r"search|[?&]q=", "search"),
        ),
        "server": (
            (r"(aws|ec2|s3)", "aws"),
            (r"(gcp|google cloud)", "gcp"),
            (r"(azure)", "azure"),
            (r"(kubernetes|docker)", "containers"),
        ),
        "non-work": (
            (r"youtube", "entertainment"),
            (r"(netflix|spotify|twitch|disney)", "entertainment"),
            (r"(facebook|twitter|instagram|tiktok)", "social-media"),
            (r"(game|steam|epic)", "gaming"),
        ),
    }

    # Subcategory used when no rule of the category matches.
    _SUBCATEGORY_DEFAULTS: Dict[str, str] = {
        "productive": "general",
        "browser": "general-browsing",
        "server": "general",
        "non-work": "other",
    }

    def __init__(self):
        """Build the per-category regex pattern lists."""
        self.productive_patterns = [
            # IDEs and Code Editors
            r"visual studio", r"vscode", r"vs code", r"cursor", r"sublime", r"atom",
            r"intellij", r"pycharm", r"webstorm", r"phpstorm", r"rubymine",
            r"eclipse", r"netbeans", r"vim", r"emacs", r"notepad\+\+",

            # AI Assistants (moved from browser to productive)
            r"claude", r"chatgpt", r"chat\.openai", r"bard", r"gemini",
            r"copilot", r"perplexity", r"phind", r"you\.com",

            # Development Resources (moved from browser to productive).
            # "stack ?overflow" covers both the domain and the page title
            # "Stack Overflow", which is what window titles actually show.
            r"stack ?overflow", r"github\.com", r"gitlab", r"bitbucket",
            r"developer\.mozilla", r"w3schools", r"codecademy",
            r"freecodecamp", r"udemy", r"coursera", r"pluralsight",

            # Development Tools
            r"git", r"github desktop", r"sourcetree", r"gitkraken",
            r"docker", r"kubernetes", r"postman", r"insomnia",
            r"datagrip", r"dbeaver", r"mysql workbench", r"pgadmin",
            r"mongodb compass", r"redis", r"terminal", r"cmd", r"powershell",
            r"wsl", r"ubuntu", r"bash", r"zsh",

            # File Management & Server Tools
            r"filezilla", r"winscp", r"cyberduck", r"putty", r"ssh",
            r"ftp", r"sftp", r"scp",

            # Control Panels (emphasized per user request)
            r"cpanel", r"plesk", r"whm", r"directadmin", r"webmin",
            r"phpmyadmin", r"adminer",

            # Documentation Sites (moved from browser to productive)
            r"docs\.", r"documentation", r"api\.", r"reference",
            r"tutorial", r"guide", r"manual", r"wiki",

            # Design & Documentation
            r"figma", r"sketch", r"adobe xd", r"photoshop", r"illustrator",
            r"notion", r"obsidian", r"confluence", r"jira", r"trello",
            r"asana", r"monday", r"clickup", r"linear",

            # Communication tools (except Gmail which goes to browser)
            r"slack", r"discord", r"teams", r"zoom", r"meet",

            # Project Files (end-anchored: effective on the title-only text)
            r"\.js$", r"\.ts$", r"\.py$", r"\.php$", r"\.java$", r"\.cs$",
            r"\.cpp$", r"\.c$", r"\.rb$", r"\.go$", r"\.rs$", r"\.swift$",
            r"\.html$", r"\.css$", r"\.scss$", r"\.json$", r"\.xml$", r"\.yaml$",
            r"\.md$", r"\.txt$", r"\.sql$", r"\.sh$", r"\.bat$",

            # Common project folders
            r"timesheet", r"project", r"development", r"workspace",
            r"repos", r"repository", r"src", r"app", r"backend", r"frontend",

            # Localhost development
            r"localhost", r"127\.0\.0\.1", r":3000", r":8000", r":5000", r":4200", r":3001",
        ]

        # Browser patterns - For searches, mail, and general browsing
        self.browser_patterns = [
            # Search engines
            r"google\.com/search", r"bing\.com/search", r"duckduckgo\.com",
            r"google\.co\.in/search", r"google\.com.*[?&]q=",  # Google search queries

            # All email/mail activities (moved from non-work)
            r"gmail\.com", r"mail\.google\.com",
            r"mail", r"email", r"outlook", r"yahoo mail",
            r"first economy mail", r"protonmail", r"thunderbird",
            r"webmail", r"hotmail", r"aol mail", r"icloud mail",
            r"inbox", r"@firsteconomy\.com", r"@gmail\.com", r"@yahoo\.com",
            r"@outlook\.com", r"@hotmail\.com", r"compose mail", r"new message",

            # General web browsing. The lookahead skips the executable name
            # ("chrome.exe") so the app name alone does not force this category.
            r"chrome(?!\.exe)", r"firefox(?!\.exe)", r"edge(?!\.exe)",
            r"safari(?!\.exe)", r"opera(?!\.exe)", r"brave(?!\.exe)",
        ]

        self.server_patterns = [
            # Cloud Providers
            r"aws", r"amazon web services", r"ec2", r"s3", r"lambda",
            r"cloudformation", r"elasticbeanstalk", r"rds", r"dynamodb",

            r"gcp", r"google cloud", r"compute engine", r"cloud storage",
            r"cloud functions", r"bigquery", r"firebase",

            r"azure", r"microsoft azure", r"azure portal", r"azure devops",

            r"digitalocean", r"linode", r"vultr", r"heroku", r"netlify",
            r"vercel", r"cloudflare", r"namecheap", r"godaddy",

            # Server Management
            r"rdp", r"remote desktop", r"vnc", r"teamviewer",
            r"anydesk", r"parsec", r"termius", r"openvpn", r"vpn",

            # Monitoring & Analytics
            r"datadog", r"new relic", r"grafana", r"prometheus",
            r"elastic", r"kibana", r"splunk", r"sentry",

            # CI/CD
            r"jenkins", r"travis", r"circle ?ci", r"gitlab ci", r"github actions",
            r"bitbucket pipelines", r"bamboo", r"teamcity",

            # Container & Orchestration
            r"kubernetes", r"k8s", r"rancher", r"openshift",
            r"portainer", r"container", r"pod", r"cluster",
        ]

        # Non-work patterns to exclude (YouTube and Untitled)
        self.non_work_patterns = [
            # Untitled windows (anchored: effective on the title-only text)
            r"^untitled$", r"^untitled\s*-", r"^blank$", r"^new tab$",

            # YouTube is non-productive
            r"youtube\.com", r"youtube", r"youtu\.be",

            # Entertainment
            r"netflix", r"spotify", r"twitch", r"disney",
            r"hulu", r"prime video", r"music", r"video", r"movie",
            r"game", r"steam", r"epic games", r"origin", r"battle\.net",

            # System idle states
            r"lock screen", r"locked", r"lockapp", r"screensaver",
            r"idle", r"afk", r"away",

            # Social media (except if used for development)
            r"facebook(?!.*developer)", r"twitter(?!.*api)", r"instagram",
            r"tiktok", r"snapchat", r"whatsapp",
        ]

        # Compatibility alias for old code (same list object, not a copy)
        self.non_work_keywords = self.non_work_patterns

    def categorize_activity(self, window_title: str, app_name: str = "") -> Tuple[str, float]:
        """Categorize an activity based on window title and application name.

        Patterns are searched in two texts: the lower-cased
        ``"<title> <app>"`` string and the stripped, lower-cased title on its
        own.  The title-only text is what lets end-anchored patterns such as
        ``^untitled$`` or ``\\.py$`` match; the combined text always ends with
        the app name (or a trailing space), so they could never match there.
        For patterns without anchors or lookaheads, a title match implies a
        combined-text match, so the second text changes nothing for them.

        Args:
            window_title: Title of the active window; ``None`` is treated as "".
            app_name: Name of the owning application; ``None`` is treated as "".

        Returns:
            ``(category, confidence_score)`` where category is one of
            ``"non-work"``, ``"browser"``, ``"productive"`` or ``"server"``.
        """
        title = window_title or ""
        app = app_name or ""

        combined_text = f"{title} {app}".lower()
        title_text = title.strip().lower()
        app_text = app.lower()
        candidates = (combined_text, title_text)

        # Non-work wins over everything else (including Untitled windows).
        if self._matches_any_text(candidates, self.non_work_patterns):
            return "non-work", 0.9

        # Browser patterns next (searches and all mail).
        if self._matches_any_text(candidates, self.browser_patterns):
            return "browser", 0.9

        if self._matches_any_text(candidates, self.productive_patterns):
            # Mail content seen through a productive-looking window is still mail.
            if "@" in combined_text or "inbox" in combined_text:
                return "browser", 0.9
            # Anything that is not running inside a web browser is productive.
            if not any(hint in app_text for hint in self._BROWSER_APP_HINTS):
                return "productive", 0.9
            # Inside a browser only the allow-listed resources stay productive.
            if any(hint in combined_text for hint in self._BROWSER_PRODUCTIVE_HINTS):
                return "productive", 0.9
            # Otherwise it might be general browsing.
            return "browser", 0.4

        if self._matches_any_text(candidates, self.server_patterns):
            return "server", 0.8

        # A browser window that matched none of the specific patterns.
        if any(hint in combined_text for hint in self._FALLBACK_BROWSER_HINTS):
            return "browser", 0.5

        # A source-file extension anywhere in the text suggests productive work.
        if re.search(r'\.(py|js|php|java|cs|cpp|html|css|json|xml|sql)', combined_text):
            return "productive", 0.7

        # Common productivity apps (excluding mail which goes to browser).
        if any(term in combined_text for term in self._PRODUCTIVITY_TERMS):
            return "productive", 0.4

        # Default to browser for general activities.
        return "browser", 0.3

    def _matches_any_text(self, texts: Tuple[str, ...], patterns: List[str]) -> bool:
        """Return True if any pattern matches at least one of the given texts."""
        return any(self._matches_patterns(text, patterns) for text in texts)

    def _matches_patterns(self, text: str, patterns: List[str]) -> bool:
        """Return True if any regex in ``patterns`` is found in ``text`` (case-insensitive)."""
        return any(re.search(pattern, text, re.IGNORECASE) for pattern in patterns)

    def _calculate_score(self, text: str, patterns: List[str]) -> float:
        """Return a 0..1 score: number of matching patterns / 5, capped at 1.0.

        Not called by any other method of this class; kept for callers that
        may use it directly.
        """
        matches = sum(
            1 for pattern in patterns if re.search(pattern, text, re.IGNORECASE)
        )
        # Five or more matching patterns count as a full score.
        return min(matches / 5.0, 1.0)

    def get_detailed_category(self, window_title: str, app_name: str = "") -> Dict:
        """Categorize an activity and attach a subcategory.

        The subcategory is the label of the first rule (for the resolved
        category) whose regex is found in the lower-cased ``"<title> <app>"``
        text, or the category's default label when none matches.

        Returns:
            Dict with keys ``category``, ``subcategory``, ``confidence``,
            ``window_title`` and ``app_name`` (the last two are echoed back
            exactly as passed in).
        """
        category, confidence = self.categorize_activity(window_title, app_name)

        combined_text = f"{window_title or ''} {app_name or ''}".lower()
        subcategory = self._SUBCATEGORY_DEFAULTS.get(category, "general")
        for pattern, label in self._SUBCATEGORY_RULES.get(category, ()):
            if re.search(pattern, combined_text):
                subcategory = label
                break

        return {
            "category": category,
            "subcategory": subcategory,
            "confidence": confidence,
            "window_title": window_title,
            "app_name": app_name,
        }

    def categorize_batch(self, activities: List[Dict]) -> List[Dict]:
        """Categorize a batch of activities.

        Reads ``window_title`` and ``application_name`` from every dict
        (missing keys default to "") and writes ``category``, ``subcategory``
        and ``category_confidence`` back into that same dict.

        Returns:
            A new list holding the same (mutated) dict objects, in input order.
        """
        categorized = []
        for activity in activities:
            category_info = self.get_detailed_category(
                activity.get('window_title', ''),
                activity.get('application_name', ''),
            )

            # Annotate in place: the input dicts are updated, not copied.
            activity['category'] = category_info['category']
            activity['subcategory'] = category_info['subcategory']
            activity['category_confidence'] = category_info['confidence']

            categorized.append(activity)

        return categorized


# Example usage and tests
if __name__ == "__main__":
    demo_categorizer = ActivityCategorizer()

    # (window title, application name) pairs printed by the demo below.
    sample_windows = [
        ("YouTube - Google Chrome", "chrome.exe"),
        ("Gmail - Inbox - Google Chrome", "chrome.exe"),
        ("Inbox (11,928) - ankita@firsteconomy.com - First Economy Mail", "chrome.exe"),
        ("Untitled", "Unknown"),
        ("Untitled - Notepad", "notepad.exe"),
        ("claude.ai - Claude", "chrome.exe"),
        ("ChatGPT", "chrome.exe"),
        ("Stack Overflow - Python question", "chrome.exe"),
        ("cPanel - Web Hosting Control Panel", "chrome.exe"),
        ("FileZilla - FTP Client", "filezilla.exe"),
        ("Google Search: python tutorial", "chrome.exe"),
        ("Facebook", "chrome.exe"),
        ("main.py - Visual Studio Code", "code.exe"),
        ("localhost:3000 - React App", "chrome.exe"),
    ]

    divider = "-" * 60
    print("Testing categorization:")
    print(divider)
    for sample_title, sample_app in sample_windows:
        details = demo_categorizer.get_detailed_category(sample_title, sample_app)
        # One print call per sample; sep="\n" yields the same five output lines.
        print(
            f"Title: {sample_title}",
            f"App: {sample_app}",
            f"Category: {details['category']} ({details['subcategory']})",
            f"Confidence: {details['confidence']}",
            divider,
            sep="\n",
        )