"""OCR (Optical Character Recognition) for extracting text from screenshots. Provides text detection and extraction capabilities using pytesseract with preprocessing for better accuracy in game environments. """ from typing import List, Dict, Optional, Tuple, NamedTuple import logging import re import cv2 import numpy as np import pytesseract from PIL import Image logger = logging.getLogger(__name__) class TextMatch(NamedTuple): """Represents detected text with position and confidence.""" text: str confidence: float bbox: Tuple[int, int, int, int] # (x, y, width, height) class OCRConfig: """Configuration for OCR processing.""" def __init__(self): # Tesseract configuration self.tesseract_config = "--oem 3 --psm 6" # Default config self.language = "eng" self.min_confidence = 30.0 # Image preprocessing self.preprocess = True self.scale_factor = 2.0 self.denoise = True self.contrast_enhance = True # Text filtering self.min_text_length = 1 self.filter_patterns = [ r'^[a-zA-Z0-9\s\-_:.,/]+$', # Alphanumeric with common punctuation ] class OCREngine: """OCR engine for text extraction from game screenshots.""" def __init__(self, config: Optional[OCRConfig] = None): """Initialize OCR engine. Args: config: OCR configuration, or None for defaults """ self.config = config or OCRConfig() self._verify_tesseract() def _verify_tesseract(self) -> None: """Verify tesseract installation.""" try: pytesseract.get_tesseract_version() logger.info("Tesseract initialized successfully") except Exception as e: logger.error(f"Tesseract not found or not working: {e}") raise RuntimeError("Tesseract OCR is required but not available") def extract_text(self, image: np.ndarray, region: Optional[Tuple[int, int, int, int]] = None) -> str: """Extract all text from image. Args: image: Input image as numpy array region: Optional (x, y, width, height) region to process Returns: Extracted text as string """ processed_img = self._preprocess_image(image, region) try: text = pytesseract.image_to_string( processed_img, lang=self.config.language, config=self.config.tesseract_config ) return self._clean_text(text) except Exception as e: logger.error(f"OCR extraction failed: {e}") return "" def find_text(self, image: np.ndarray, search_text: str, case_sensitive: bool = False) -> List[TextMatch]: """Find specific text in image with positions. Args: image: Input image as numpy array search_text: Text to search for case_sensitive: Whether search should be case sensitive Returns: List of TextMatch objects for found text """ processed_img = self._preprocess_image(image) try: # Get detailed OCR data data = pytesseract.image_to_data( processed_img, lang=self.config.language, config=self.config.tesseract_config, output_type=pytesseract.Output.DICT ) matches = [] search_lower = search_text.lower() if not case_sensitive else search_text for i in range(len(data['text'])): text = data['text'][i].strip() confidence = float(data['conf'][i]) if confidence < self.config.min_confidence: continue text_to_match = text.lower() if not case_sensitive else text if search_lower in text_to_match: bbox = ( data['left'][i], data['top'][i], data['width'][i], data['height'][i] ) matches.append(TextMatch(text, confidence, bbox)) return matches except Exception as e: logger.error(f"Text search failed: {e}") return [] def get_text_regions(self, image: np.ndarray) -> List[TextMatch]: """Get all text regions with positions and confidence. Args: image: Input image as numpy array Returns: List of TextMatch objects for all detected text """ processed_img = self._preprocess_image(image) try: data = pytesseract.image_to_data( processed_img, lang=self.config.language, config=self.config.tesseract_config, output_type=pytesseract.Output.DICT ) text_regions = [] for i in range(len(data['text'])): text = data['text'][i].strip() confidence = float(data['conf'][i]) if (confidence < self.config.min_confidence or len(text) < self.config.min_text_length): continue if not self._passes_text_filters(text): continue bbox = ( data['left'][i], data['top'][i], data['width'][i], data['height'][i] ) text_regions.append(TextMatch(text, confidence, bbox)) return text_regions except Exception as e: logger.error(f"Text region detection failed: {e}") return [] def _preprocess_image(self, image: np.ndarray, region: Optional[Tuple[int, int, int, int]] = None) -> Image.Image: """Preprocess image for better OCR accuracy. Args: image: Input image as numpy array region: Optional region to extract Returns: Preprocessed PIL Image """ # Extract region if specified if region: x, y, w, h = region image = image[y:y+h, x:x+w] if not self.config.preprocess: return Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)) # Convert to grayscale gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) # Scale up for better OCR if self.config.scale_factor > 1.0: height, width = gray.shape new_width = int(width * self.config.scale_factor) new_height = int(height * self.config.scale_factor) gray = cv2.resize(gray, (new_width, new_height), interpolation=cv2.INTER_CUBIC) # Denoise if self.config.denoise: gray = cv2.fastNlMeansDenoising(gray) # Enhance contrast if self.config.contrast_enhance: # Use CLAHE (Contrast Limited Adaptive Histogram Equalization) clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8)) gray = clahe.apply(gray) # Convert back to PIL Image return Image.fromarray(gray) def _clean_text(self, text: str) -> str: """Clean extracted text. Args: text: Raw extracted text Returns: Cleaned text """ # Remove extra whitespace text = re.sub(r'\s+', ' ', text.strip()) # Remove common OCR artifacts text = re.sub(r'[|¦]', 'I', text) # Vertical bars to I text = re.sub(r'[{}]', '', text) # Remove braces return text def _passes_text_filters(self, text: str) -> bool: """Check if text passes configured filters. Args: text: Text to check Returns: True if text passes filters """ if not self.config.filter_patterns: return True for pattern in self.config.filter_patterns: if re.match(pattern, text): return True return False class TextDetector: """High-level text detection interface.""" def __init__(self, ocr_config: Optional[OCRConfig] = None): """Initialize text detector. Args: ocr_config: OCR configuration """ self.ocr = OCREngine(ocr_config) self.text_cache: Dict[str, List[TextMatch]] = {} def contains_text(self, image: np.ndarray, text: str, case_sensitive: bool = False) -> bool: """Check if image contains specific text. Args: image: Input image text: Text to search for case_sensitive: Case sensitive search Returns: True if text found """ matches = self.ocr.find_text(image, text, case_sensitive) return len(matches) > 0 def wait_for_text(self, capture_func, text: str, timeout: float = 10.0, check_interval: float = 0.5) -> bool: """Wait for specific text to appear on screen. Args: capture_func: Function that returns screenshot text: Text to wait for timeout: Maximum wait time in seconds check_interval: Time between checks in seconds Returns: True if text appeared, False if timeout """ import time start_time = time.time() while time.time() - start_time < timeout: image = capture_func() if self.contains_text(image, text): return True time.sleep(check_interval) return False def get_ui_text(self, image: np.ndarray) -> Dict[str, str]: """Extract common UI text elements. Args: image: Input image Returns: Dictionary mapping UI elements to text """ # This is a placeholder for game-specific UI text extraction # In practice, this would define regions for health, mana, inventory, etc. text_regions = self.ocr.get_text_regions(image) ui_text = {} for region in text_regions: # Categorize text based on position or pattern if "health" in region.text.lower(): ui_text["health"] = region.text elif "mana" in region.text.lower(): ui_text["mana"] = region.text # Add more UI element detection return ui_text