iso-bot/engine/screen/ocr.py

346 lines
No EOL
11 KiB
Python

"""OCR (Optical Character Recognition) for extracting text from screenshots.
Provides text detection and extraction capabilities using pytesseract
with preprocessing for better accuracy in game environments.
"""
from typing import List, Dict, Optional, Tuple, NamedTuple
import logging
import re
import cv2
import numpy as np
import pytesseract
from PIL import Image
logger = logging.getLogger(__name__)
class TextMatch(NamedTuple):
    """Represents detected text with position and confidence."""
    # The recognized word/fragment (whitespace-stripped by producers in this module).
    text: str
    # Confidence as reported by tesseract and compared against
    # OCRConfig.min_confidence (tesseract uses a 0-100 scale; entries of -1
    # mark non-word rows and are filtered out upstream).
    confidence: float
    bbox: Tuple[int, int, int, int]  # (x, y, width, height) in pixels
class OCRConfig:
    """Tunable knobs for the OCR pipeline (engine, preprocessing, filtering)."""

    def __init__(self):
        """Populate every option with its default value."""
        # --- tesseract invocation ---
        # OEM 3 = default LSTM engine; PSM 6 = assume a uniform block of text.
        self.tesseract_config = "--oem 3 --psm 6"  # Default config
        self.language = "eng"
        # Words scored below this confidence are discarded.
        self.min_confidence = 30.0

        # --- image preprocessing toggles ---
        self.preprocess = True
        self.scale_factor = 2.0      # upscale before OCR for small UI fonts
        self.denoise = True
        self.contrast_enhance = True

        # --- post-OCR text filtering ---
        self.min_text_length = 1
        # A detected word must match at least one pattern to be kept.
        self.filter_patterns = [
            r'^[a-zA-Z0-9\s\-_:.,/]+$',  # Alphanumeric with common punctuation
        ]
class OCREngine:
    """OCR engine for text extraction from game screenshots.

    Wraps pytesseract with optional preprocessing (grayscale conversion,
    upscaling, denoising, CLAHE contrast enhancement). All bounding boxes
    returned by this class are in ORIGINAL image coordinates, even when
    the image was upscaled for recognition.
    """

    def __init__(self, config: Optional[OCRConfig] = None):
        """Initialize OCR engine.

        Args:
            config: OCR configuration, or None for defaults

        Raises:
            RuntimeError: if the tesseract binary is not available.
        """
        self.config = config or OCRConfig()
        self._verify_tesseract()

    def _verify_tesseract(self) -> None:
        """Verify tesseract installation."""
        try:
            pytesseract.get_tesseract_version()
            logger.info("Tesseract initialized successfully")
        except Exception as e:
            logger.error(f"Tesseract not found or not working: {e}")
            # Chain the cause so the underlying install problem stays visible.
            raise RuntimeError("Tesseract OCR is required but not available") from e

    def extract_text(self, image: np.ndarray,
                     region: Optional[Tuple[int, int, int, int]] = None) -> str:
        """Extract all text from image.

        Args:
            image: Input image as numpy array (BGR, or already grayscale 2-D)
            region: Optional (x, y, width, height) region to process

        Returns:
            Extracted text as a cleaned string, or "" if OCR fails.
        """
        processed_img = self._preprocess_image(image, region)
        try:
            text = pytesseract.image_to_string(
                processed_img,
                lang=self.config.language,
                config=self.config.tesseract_config
            )
            return self._clean_text(text)
        except Exception as e:
            logger.error(f"OCR extraction failed: {e}")
            return ""

    def _ocr_scale(self) -> float:
        """Return the upscale factor _preprocess_image applies (1.0 if none).

        Used to map tesseract box coordinates back onto the original image.
        """
        if self.config.preprocess and self.config.scale_factor > 1.0:
            return self.config.scale_factor
        return 1.0

    def _scan_boxes(self, image: np.ndarray):
        """Yield (text, confidence, bbox) for every confident OCR word.

        Shared scan loop for find_text/get_text_regions. bbox is
        (x, y, width, height) in ORIGINAL image coordinates: tesseract
        reports boxes in the preprocessed (possibly upscaled) image, so
        they are divided by the scale factor here. Words below
        config.min_confidence are skipped; text is whitespace-stripped.
        """
        processed_img = self._preprocess_image(image)
        scale = self._ocr_scale()
        data = pytesseract.image_to_data(
            processed_img,
            lang=self.config.language,
            config=self.config.tesseract_config,
            output_type=pytesseract.Output.DICT
        )
        for i in range(len(data['text'])):
            text = data['text'][i].strip()
            # conf may arrive as str/int; -1 marks non-word layout rows,
            # which min_confidence filtering also removes.
            confidence = float(data['conf'][i])
            if confidence < self.config.min_confidence:
                continue
            bbox = (
                int(data['left'][i] / scale),
                int(data['top'][i] / scale),
                int(data['width'][i] / scale),
                int(data['height'][i] / scale),
            )
            yield text, confidence, bbox

    def find_text(self, image: np.ndarray, search_text: str,
                  case_sensitive: bool = False) -> List[TextMatch]:
        """Find specific text in image with positions.

        Args:
            image: Input image as numpy array
            search_text: Text to search for (substring match per OCR word)
            case_sensitive: Whether search should be case sensitive

        Returns:
            List of TextMatch objects for found text, with bboxes in
            original-image coordinates; empty list on OCR failure.
        """
        try:
            needle = search_text if case_sensitive else search_text.lower()
            matches = []
            for text, confidence, bbox in self._scan_boxes(image):
                haystack = text if case_sensitive else text.lower()
                if needle in haystack:
                    matches.append(TextMatch(text, confidence, bbox))
            return matches
        except Exception as e:
            logger.error(f"Text search failed: {e}")
            return []

    def get_text_regions(self, image: np.ndarray) -> List[TextMatch]:
        """Get all text regions with positions and confidence.

        Args:
            image: Input image as numpy array

        Returns:
            List of TextMatch objects for all detected text that passes the
            length and pattern filters, with bboxes in original-image
            coordinates; empty list on OCR failure.
        """
        try:
            text_regions = []
            for text, confidence, bbox in self._scan_boxes(image):
                if len(text) < self.config.min_text_length:
                    continue
                if not self._passes_text_filters(text):
                    continue
                text_regions.append(TextMatch(text, confidence, bbox))
            return text_regions
        except Exception as e:
            logger.error(f"Text region detection failed: {e}")
            return []

    def _preprocess_image(self, image: np.ndarray,
                          region: Optional[Tuple[int, int, int, int]] = None) -> Image.Image:
        """Preprocess image for better OCR accuracy.

        Args:
            image: Input image as numpy array, BGR (H, W, 3) or grayscale (H, W)
            region: Optional (x, y, width, height) region to extract

        Returns:
            Preprocessed PIL Image (grayscale unless preprocessing disabled)
        """
        # Extract region if specified
        if region:
            x, y, w, h = region
            image = image[y:y + h, x:x + w]
        # Accept images that are already single-channel; cvtColor on a
        # 2-D array would raise otherwise.
        is_gray = image.ndim == 2
        if not self.config.preprocess:
            if is_gray:
                return Image.fromarray(image)
            return Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        gray = image if is_gray else cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        # Scale up for better OCR of small game fonts
        if self.config.scale_factor > 1.0:
            height, width = gray.shape
            new_size = (int(width * self.config.scale_factor),
                        int(height * self.config.scale_factor))
            # Cubic interpolation keeps glyph edges smooth when upscaling.
            gray = cv2.resize(gray, new_size, interpolation=cv2.INTER_CUBIC)
        if self.config.denoise:
            gray = cv2.fastNlMeansDenoising(gray)
        if self.config.contrast_enhance:
            # CLAHE (Contrast Limited Adaptive Histogram Equalization):
            # local equalization that avoids blowing out bright UI areas.
            clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
            gray = clahe.apply(gray)
        return Image.fromarray(gray)

    def _clean_text(self, text: str) -> str:
        """Clean extracted text.

        Args:
            text: Raw extracted text

        Returns:
            Cleaned text (collapsed whitespace, common artifacts removed)
        """
        # Collapse all whitespace runs into single spaces
        text = re.sub(r'\s+', ' ', text.strip())
        # Remove common OCR artifacts
        text = re.sub(r'[|¦]', 'I', text)  # Vertical bars to I
        text = re.sub(r'[{}]', '', text)   # Remove braces
        return text

    def _passes_text_filters(self, text: str) -> bool:
        """Check if text passes configured filters.

        Args:
            text: Text to check

        Returns:
            True if text matches at least one filter pattern (or no
            patterns are configured).
        """
        if not self.config.filter_patterns:
            return True
        for pattern in self.config.filter_patterns:
            if re.match(pattern, text):
                return True
        return False
class TextDetector:
    """High-level text detection interface built on OCREngine."""

    def __init__(self, ocr_config: Optional[OCRConfig] = None):
        """Initialize text detector.

        Args:
            ocr_config: OCR configuration, or None for defaults
        """
        self.ocr = OCREngine(ocr_config)
        # NOTE(review): this cache is never read or written by any method
        # here; kept for backward compatibility with external users.
        self.text_cache: Dict[str, List[TextMatch]] = {}

    def contains_text(self, image: np.ndarray, text: str,
                      case_sensitive: bool = False) -> bool:
        """Check if image contains specific text.

        Args:
            image: Input image
            text: Text to search for
            case_sensitive: Case sensitive search

        Returns:
            True if text found
        """
        matches = self.ocr.find_text(image, text, case_sensitive)
        return len(matches) > 0

    def wait_for_text(self, capture_func, text: str, timeout: float = 10.0,
                      check_interval: float = 0.5,
                      case_sensitive: bool = False) -> bool:
        """Wait for specific text to appear on screen.

        Args:
            capture_func: Function that returns a screenshot (np.ndarray)
            text: Text to wait for
            timeout: Maximum wait time in seconds
            check_interval: Time between checks in seconds
            case_sensitive: Case sensitive search (forwarded to contains_text)

        Returns:
            True if text appeared, False if timeout
        """
        import time
        # monotonic() is immune to wall-clock adjustments (NTP, DST),
        # which would skew a time.time()-based timeout.
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            image = capture_func()
            if self.contains_text(image, text, case_sensitive):
                return True
            time.sleep(check_interval)
        return False

    def get_ui_text(self, image: np.ndarray) -> Dict[str, str]:
        """Extract common UI text elements.

        Args:
            image: Input image

        Returns:
            Dictionary mapping UI element names to their detected text
        """
        # Placeholder for game-specific UI text extraction; in practice this
        # would define fixed regions for health, mana, inventory, etc.
        text_regions = self.ocr.get_text_regions(image)
        ui_text = {}
        for region in text_regions:
            lowered = region.text.lower()
            # Categorize by keyword; first keyword found wins per region.
            for key in ("health", "mana"):
                if key in lowered:
                    ui_text[key] = region.text
                    break
            # TODO: add more UI element detection
        return ui_text