iso-bot/engine/screen/ocr.py

346 lines
No EOL
11 KiB
Python

"""OCR (Optical Character Recognition) for extracting text from screenshots.
Provides text detection and extraction capabilities using pytesseract
with preprocessing for better accuracy in game environments.
"""
from typing import List, Dict, Optional, Tuple, NamedTuple
import logging
import re
import cv2
import numpy as np
import pytesseract
from PIL import Image
logger = logging.getLogger(__name__)
class TextMatch(NamedTuple):
    """Represents detected text with position and confidence."""
    # The recognized word/fragment (whitespace-stripped by producers in this module).
    text: str
    # Confidence as reported by tesseract and compared against
    # OCRConfig.min_confidence (tesseract uses a 0-100 scale; entries of -1
    # mark non-word rows and are filtered out upstream).
    confidence: float
    bbox: Tuple[int, int, int, int]  # (x, y, width, height) in pixels
class OCRConfig:
    """Tunable knobs for the OCR pipeline (engine, preprocessing, filtering)."""

    def __init__(self):
        """Populate every option with its default value."""
        # --- tesseract invocation ---
        # OEM 3 = default LSTM engine; PSM 6 = assume a uniform block of text.
        self.tesseract_config = "--oem 3 --psm 6"  # Default config
        self.language = "eng"
        # Words scored below this confidence are discarded.
        self.min_confidence = 30.0

        # --- image preprocessing toggles ---
        self.preprocess = True
        self.scale_factor = 2.0      # upscale before OCR for small UI fonts
        self.denoise = True
        self.contrast_enhance = True

        # --- post-OCR text filtering ---
        self.min_text_length = 1
        # A detected word must match at least one pattern to be kept.
        self.filter_patterns = [
            r'^[a-zA-Z0-9\s\-_:.,/]+$',  # Alphanumeric with common punctuation
        ]
class OCREngine:
    """OCR engine for text extraction from game screenshots.

    Wraps pytesseract with optional preprocessing (grayscale conversion,
    upscaling, denoising, CLAHE contrast enhancement). All bounding boxes
    returned by this class are in ORIGINAL image coordinates, even when
    the image was upscaled for recognition.
    """

    def __init__(self, config: Optional[OCRConfig] = None):
        """Initialize OCR engine.

        Args:
            config: OCR configuration, or None for defaults

        Raises:
            RuntimeError: if the tesseract binary is not available.
        """
        self.config = config or OCRConfig()
        self._verify_tesseract()

    def _verify_tesseract(self) -> None:
        """Verify tesseract installation."""
        try:
            pytesseract.get_tesseract_version()
            logger.info("Tesseract initialized successfully")
        except Exception as e:
            logger.error(f"Tesseract not found or not working: {e}")
            # Chain the cause so the underlying install problem stays visible.
            raise RuntimeError("Tesseract OCR is required but not available") from e

    def extract_text(self, image: np.ndarray,
                     region: Optional[Tuple[int, int, int, int]] = None) -> str:
        """Extract all text from image.

        Args:
            image: Input image as numpy array (BGR, or already grayscale 2-D)
            region: Optional (x, y, width, height) region to process

        Returns:
            Extracted text as a cleaned string, or "" if OCR fails.
        """
        processed_img = self._preprocess_image(image, region)
        try:
            text = pytesseract.image_to_string(
                processed_img,
                lang=self.config.language,
                config=self.config.tesseract_config
            )
            return self._clean_text(text)
        except Exception as e:
            logger.error(f"OCR extraction failed: {e}")
            return ""

    def _ocr_scale(self) -> float:
        """Return the upscale factor _preprocess_image applies (1.0 if none).

        Used to map tesseract box coordinates back onto the original image.
        """
        if self.config.preprocess and self.config.scale_factor > 1.0:
            return self.config.scale_factor
        return 1.0

    def _scan_boxes(self, image: np.ndarray):
        """Yield (text, confidence, bbox) for every confident OCR word.

        Shared scan loop for find_text/get_text_regions. bbox is
        (x, y, width, height) in ORIGINAL image coordinates: tesseract
        reports boxes in the preprocessed (possibly upscaled) image, so
        they are divided by the scale factor here. Words below
        config.min_confidence are skipped; text is whitespace-stripped.
        """
        processed_img = self._preprocess_image(image)
        scale = self._ocr_scale()
        data = pytesseract.image_to_data(
            processed_img,
            lang=self.config.language,
            config=self.config.tesseract_config,
            output_type=pytesseract.Output.DICT
        )
        for i in range(len(data['text'])):
            text = data['text'][i].strip()
            # conf may arrive as str/int; -1 marks non-word layout rows,
            # which min_confidence filtering also removes.
            confidence = float(data['conf'][i])
            if confidence < self.config.min_confidence:
                continue
            bbox = (
                int(data['left'][i] / scale),
                int(data['top'][i] / scale),
                int(data['width'][i] / scale),
                int(data['height'][i] / scale),
            )
            yield text, confidence, bbox

    def find_text(self, image: np.ndarray, search_text: str,
                  case_sensitive: bool = False) -> List[TextMatch]:
        """Find specific text in image with positions.

        Args:
            image: Input image as numpy array
            search_text: Text to search for (substring match per OCR word)
            case_sensitive: Whether search should be case sensitive

        Returns:
            List of TextMatch objects for found text, with bboxes in
            original-image coordinates; empty list on OCR failure.
        """
        try:
            needle = search_text if case_sensitive else search_text.lower()
            matches = []
            for text, confidence, bbox in self._scan_boxes(image):
                haystack = text if case_sensitive else text.lower()
                if needle in haystack:
                    matches.append(TextMatch(text, confidence, bbox))
            return matches
        except Exception as e:
            logger.error(f"Text search failed: {e}")
            return []

    def get_text_regions(self, image: np.ndarray) -> List[TextMatch]:
        """Get all text regions with positions and confidence.

        Args:
            image: Input image as numpy array

        Returns:
            List of TextMatch objects for all detected text that passes the
            length and pattern filters, with bboxes in original-image
            coordinates; empty list on OCR failure.
        """
        try:
            text_regions = []
            for text, confidence, bbox in self._scan_boxes(image):
                if len(text) < self.config.min_text_length:
                    continue
                if not self._passes_text_filters(text):
                    continue
                text_regions.append(TextMatch(text, confidence, bbox))
            return text_regions
        except Exception as e:
            logger.error(f"Text region detection failed: {e}")
            return []

    def _preprocess_image(self, image: np.ndarray,
                          region: Optional[Tuple[int, int, int, int]] = None) -> Image.Image:
        """Preprocess image for better OCR accuracy.

        Args:
            image: Input image as numpy array, BGR (H, W, 3) or grayscale (H, W)
            region: Optional (x, y, width, height) region to extract

        Returns:
            Preprocessed PIL Image (grayscale unless preprocessing disabled)
        """
        # Extract region if specified
        if region:
            x, y, w, h = region
            image = image[y:y + h, x:x + w]
        # Accept images that are already single-channel; cvtColor on a
        # 2-D array would raise otherwise.
        is_gray = image.ndim == 2
        if not self.config.preprocess:
            if is_gray:
                return Image.fromarray(image)
            return Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        gray = image if is_gray else cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        # Scale up for better OCR of small game fonts
        if self.config.scale_factor > 1.0:
            height, width = gray.shape
            new_size = (int(width * self.config.scale_factor),
                        int(height * self.config.scale_factor))
            # Cubic interpolation keeps glyph edges smooth when upscaling.
            gray = cv2.resize(gray, new_size, interpolation=cv2.INTER_CUBIC)
        if self.config.denoise:
            gray = cv2.fastNlMeansDenoising(gray)
        if self.config.contrast_enhance:
            # CLAHE (Contrast Limited Adaptive Histogram Equalization):
            # local equalization that avoids blowing out bright UI areas.
            clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
            gray = clahe.apply(gray)
        return Image.fromarray(gray)

    def _clean_text(self, text: str) -> str:
        """Clean extracted text.

        Args:
            text: Raw extracted text

        Returns:
            Cleaned text (collapsed whitespace, common artifacts removed)
        """
        # Collapse all whitespace runs into single spaces
        text = re.sub(r'\s+', ' ', text.strip())
        # Remove common OCR artifacts
        text = re.sub(r'[|¦]', 'I', text)  # Vertical bars to I
        text = re.sub(r'[{}]', '', text)   # Remove braces
        return text

    def _passes_text_filters(self, text: str) -> bool:
        """Check if text passes configured filters.

        Args:
            text: Text to check

        Returns:
            True if text matches at least one filter pattern (or no
            patterns are configured).
        """
        if not self.config.filter_patterns:
            return True
        for pattern in self.config.filter_patterns:
            if re.match(pattern, text):
                return True
        return False
class TextDetector:
    """High-level text detection interface built on OCREngine."""

    def __init__(self, ocr_config: Optional[OCRConfig] = None):
        """Initialize text detector.

        Args:
            ocr_config: OCR configuration, or None for defaults
        """
        self.ocr = OCREngine(ocr_config)
        # NOTE(review): this cache is never read or written by any method
        # here; kept for backward compatibility with external users.
        self.text_cache: Dict[str, List[TextMatch]] = {}

    def contains_text(self, image: np.ndarray, text: str,
                      case_sensitive: bool = False) -> bool:
        """Check if image contains specific text.

        Args:
            image: Input image
            text: Text to search for
            case_sensitive: Case sensitive search

        Returns:
            True if text found
        """
        matches = self.ocr.find_text(image, text, case_sensitive)
        return len(matches) > 0

    def wait_for_text(self, capture_func, text: str, timeout: float = 10.0,
                      check_interval: float = 0.5,
                      case_sensitive: bool = False) -> bool:
        """Wait for specific text to appear on screen.

        Args:
            capture_func: Function that returns a screenshot (np.ndarray)
            text: Text to wait for
            timeout: Maximum wait time in seconds
            check_interval: Time between checks in seconds
            case_sensitive: Case sensitive search (forwarded to contains_text)

        Returns:
            True if text appeared, False if timeout
        """
        import time
        # monotonic() is immune to wall-clock adjustments (NTP, DST),
        # which would skew a time.time()-based timeout.
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            image = capture_func()
            if self.contains_text(image, text, case_sensitive):
                return True
            time.sleep(check_interval)
        return False

    def get_ui_text(self, image: np.ndarray) -> Dict[str, str]:
        """Extract common UI text elements.

        Args:
            image: Input image

        Returns:
            Dictionary mapping UI element names to their detected text
        """
        # Placeholder for game-specific UI text extraction; in practice this
        # would define fixed regions for health, mana, inventory, etc.
        text_regions = self.ocr.get_text_regions(image)
        ui_text = {}
        for region in text_regions:
            lowered = region.text.lower()
            # Categorize by keyword; first keyword found wins per region.
            for key in ("health", "mana"):
                if key in lowered:
                    ui_text[key] = region.text
                    break
            # TODO: add more UI element detection
        return ui_text