346 lines
No EOL
11 KiB
Python
346 lines
No EOL
11 KiB
Python
"""OCR (Optical Character Recognition) for extracting text from screenshots.
|
|
|
|
Provides text detection and extraction capabilities using pytesseract
|
|
with preprocessing for better accuracy in game environments.
|
|
"""
|
|
|
|
from typing import List, Dict, Optional, Tuple, NamedTuple
|
|
import logging
|
|
import re
|
|
|
|
import cv2
|
|
import numpy as np
|
|
import pytesseract
|
|
from PIL import Image
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class TextMatch(NamedTuple):
    """Represents detected text with position and confidence."""

    # Recognized text content (already whitespace-stripped by the OCR engine).
    text: str
    # Tesseract per-word confidence; detections below OCRConfig.min_confidence
    # are filtered out before a TextMatch is created.
    confidence: float
    # NOTE(review): boxes come straight from Tesseract's output on the
    # preprocessed image — when OCRConfig.scale_factor > 1 they are presumably
    # in the scaled image's pixel space, not the original screenshot's; verify.
    bbox: Tuple[int, int, int, int]  # (x, y, width, height)
|
|
|
|
|
|
class OCRConfig:
    """Configuration for OCR processing."""

    def __init__(self):
        # Tesseract configuration
        # "--oem 3 --psm 6": default engine mode, page-segmentation mode 6
        # (assume a single uniform block of text) — suits rectangular UI regions.
        self.tesseract_config = "--oem 3 --psm 6"  # Default config
        # Tesseract language pack to use.
        self.language = "eng"
        # Minimum per-word confidence (Tesseract reports 0-100) to keep a detection.
        self.min_confidence = 30.0

        # Image preprocessing
        # Master switch: when False, images are passed to Tesseract unmodified.
        self.preprocess = True
        # Upscale factor applied before OCR; values > 1.0 enlarge the image.
        self.scale_factor = 2.0
        # Apply non-local-means denoising before OCR.
        self.denoise = True
        # Apply CLAHE contrast enhancement before OCR.
        self.contrast_enhance = True

        # Text filtering
        # Minimum character count for a detection to be reported by get_text_regions.
        self.min_text_length = 1
        # Allow-list regexes: text matching ANY pattern passes; an empty list
        # passes everything (see OCREngine._passes_text_filters).
        self.filter_patterns = [
            r'^[a-zA-Z0-9\s\-_:.,/]+$',  # Alphanumeric with common punctuation
        ]
|
|
|
|
|
|
class OCREngine:
    """OCR engine for text extraction from game screenshots.

    Wraps pytesseract with an optional preprocessing pipeline (grayscale,
    upscaling, denoising, CLAHE contrast enhancement) tuned for game UIs.

    Bounding boxes returned by find_text / get_text_regions are mapped back
    to the coordinate space of the ORIGINAL input image, compensating for
    the upscaling applied during preprocessing.
    """

    def __init__(self, config: Optional[OCRConfig] = None):
        """Initialize OCR engine.

        Args:
            config: OCR configuration, or None for defaults

        Raises:
            RuntimeError: If the Tesseract binary is not available.
        """
        self.config = config or OCRConfig()
        self._verify_tesseract()

    def _verify_tesseract(self) -> None:
        """Verify tesseract installation.

        Raises:
            RuntimeError: If pytesseract cannot reach the Tesseract binary.
        """
        try:
            pytesseract.get_tesseract_version()
            logger.info("Tesseract initialized successfully")
        except Exception as e:
            logger.error(f"Tesseract not found or not working: {e}")
            # Chain the original cause so the install problem stays visible.
            raise RuntimeError("Tesseract OCR is required but not available") from e

    def extract_text(self, image: np.ndarray, region: Optional[Tuple[int, int, int, int]] = None) -> str:
        """Extract all text from image.

        Args:
            image: Input image as numpy array (BGR color or grayscale)
            region: Optional (x, y, width, height) region to process

        Returns:
            Extracted, whitespace-normalized text; "" if OCR fails.
        """
        processed_img = self._preprocess_image(image, region)

        try:
            text = pytesseract.image_to_string(
                processed_img,
                lang=self.config.language,
                config=self.config.tesseract_config,
            )
            return self._clean_text(text)
        except Exception as e:
            logger.error(f"OCR extraction failed: {e}")
            return ""

    def find_text(self, image: np.ndarray, search_text: str,
                  case_sensitive: bool = False) -> List[TextMatch]:
        """Find specific text in image with positions.

        Args:
            image: Input image as numpy array
            search_text: Text to search for (substring match per OCR word)
            case_sensitive: Whether search should be case sensitive

        Returns:
            List of TextMatch objects for found text; bounding boxes are in
            original-image pixel coordinates. Empty list on OCR failure.
        """
        needle = search_text if case_sensitive else search_text.lower()

        try:
            data = self._ocr_data(image)
            scale = self._applied_scale()
            matches = []

            for i in range(len(data['text'])):
                text = data['text'][i].strip()
                confidence = float(data['conf'][i])

                # Skip low-confidence and empty detections (Tesseract emits
                # structural rows with empty text and confidence -1).
                if not text or confidence < self.config.min_confidence:
                    continue

                haystack = text if case_sensitive else text.lower()
                if needle in haystack:
                    matches.append(TextMatch(text, confidence, self._bbox_at(data, i, scale)))

            return matches

        except Exception as e:
            logger.error(f"Text search failed: {e}")
            return []

    def get_text_regions(self, image: np.ndarray) -> List[TextMatch]:
        """Get all text regions with positions and confidence.

        Args:
            image: Input image as numpy array

        Returns:
            List of TextMatch objects for all detected text that passes the
            confidence, length, and pattern filters; bounding boxes are in
            original-image pixel coordinates. Empty list on OCR failure.
        """
        try:
            data = self._ocr_data(image)
            scale = self._applied_scale()
            text_regions = []

            for i in range(len(data['text'])):
                text = data['text'][i].strip()
                confidence = float(data['conf'][i])

                if (confidence < self.config.min_confidence or
                        len(text) < self.config.min_text_length):
                    continue

                if not self._passes_text_filters(text):
                    continue

                text_regions.append(TextMatch(text, confidence, self._bbox_at(data, i, scale)))

            return text_regions

        except Exception as e:
            logger.error(f"Text region detection failed: {e}")
            return []

    def _ocr_data(self, image: np.ndarray) -> Dict:
        """Preprocess *image* and return Tesseract's word-level data dict."""
        processed_img = self._preprocess_image(image)
        return pytesseract.image_to_data(
            processed_img,
            lang=self.config.language,
            config=self.config.tesseract_config,
            output_type=pytesseract.Output.DICT,
        )

    def _applied_scale(self) -> float:
        """Upscale factor applied by preprocessing (1.0 when no scaling happens)."""
        if self.config.preprocess and self.config.scale_factor > 1.0:
            return self.config.scale_factor
        return 1.0

    @staticmethod
    def _bbox_at(data: Dict, i: int, scale: float) -> Tuple[int, int, int, int]:
        """Bounding box of word *i*, converted back to original-image pixels.

        Tesseract reports coordinates in the (possibly upscaled) preprocessed
        image; dividing by the applied scale maps them back to the input image.
        """
        return (
            int(data['left'][i] / scale),
            int(data['top'][i] / scale),
            int(data['width'][i] / scale),
            int(data['height'][i] / scale),
        )

    def _preprocess_image(self, image: np.ndarray,
                          region: Optional[Tuple[int, int, int, int]] = None) -> Image.Image:
        """Preprocess image for better OCR accuracy.

        Args:
            image: Input image as numpy array (BGR color or grayscale)
            region: Optional (x, y, width, height) region to extract

        Returns:
            Preprocessed PIL Image
        """
        # Extract region if specified (explicit None check: a region at the
        # origin is a valid tuple and must not be skipped).
        if region is not None:
            x, y, w, h = region
            image = image[y:y + h, x:x + w]

        if not self.config.preprocess:
            # Pass through unchanged; only convert BGR -> RGB for PIL.
            if image.ndim == 2:
                return Image.fromarray(image)
            return Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

        # Convert to grayscale; accept input that is already single-channel.
        gray = image if image.ndim == 2 else cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # Scale up for better OCR (small UI fonts benefit from cubic upscaling).
        if self.config.scale_factor > 1.0:
            height, width = gray.shape
            new_size = (int(width * self.config.scale_factor),
                        int(height * self.config.scale_factor))
            gray = cv2.resize(gray, new_size, interpolation=cv2.INTER_CUBIC)

        # Denoise (non-local means; preserves edges better than blurring).
        if self.config.denoise:
            gray = cv2.fastNlMeansDenoising(gray)

        # Enhance contrast with CLAHE (Contrast Limited Adaptive Histogram
        # Equalization) — local equalization that avoids over-amplifying noise.
        if self.config.contrast_enhance:
            clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
            gray = clahe.apply(gray)

        # Convert back to PIL Image for pytesseract.
        return Image.fromarray(gray)

    def _clean_text(self, text: str) -> str:
        """Clean extracted text.

        Args:
            text: Raw extracted text

        Returns:
            Cleaned text: collapsed whitespace, common OCR artifacts removed.
        """
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text.strip())

        # Remove common OCR artifacts
        text = re.sub(r'[|¦]', 'I', text)  # Vertical bars to I
        text = re.sub(r'[{}]', '', text)   # Remove braces

        return text

    def _passes_text_filters(self, text: str) -> bool:
        """Check if text passes configured filters.

        Args:
            text: Text to check

        Returns:
            True if text matches ANY configured pattern, or if no patterns
            are configured at all.
        """
        if not self.config.filter_patterns:
            return True

        for pattern in self.config.filter_patterns:
            if re.match(pattern, text):
                return True

        return False
|
|
|
|
|
|
class TextDetector:
    """High-level text detection interface.

    Convenience wrapper around :class:`OCREngine` for common checks such as
    "is this text on screen" and "wait until this text appears".
    """

    def __init__(self, ocr_config: Optional[OCRConfig] = None):
        """Initialize text detector.

        Args:
            ocr_config: OCR configuration
        """
        self.ocr = OCREngine(ocr_config)
        self.text_cache: Dict[str, List[TextMatch]] = {}

    def contains_text(self, image: np.ndarray, text: str,
                      case_sensitive: bool = False) -> bool:
        """Check if image contains specific text.

        Args:
            image: Input image
            text: Text to search for
            case_sensitive: Case sensitive search

        Returns:
            True if text found
        """
        found = self.ocr.find_text(image, text, case_sensitive)
        return len(found) > 0

    def wait_for_text(self, capture_func, text: str, timeout: float = 10.0,
                      check_interval: float = 0.5) -> bool:
        """Wait for specific text to appear on screen.

        Args:
            capture_func: Function that returns screenshot
            text: Text to wait for
            timeout: Maximum wait time in seconds
            check_interval: Time between checks in seconds

        Returns:
            True if text appeared, False if timeout
        """
        import time

        deadline = time.time() + timeout

        while time.time() < deadline:
            if self.contains_text(capture_func(), text):
                return True
            time.sleep(check_interval)

        return False

    def get_ui_text(self, image: np.ndarray) -> Dict[str, str]:
        """Extract common UI text elements.

        Args:
            image: Input image

        Returns:
            Dictionary mapping UI elements to text
        """
        # This is a placeholder for game-specific UI text extraction.
        # In practice, this would define regions for health, mana, inventory, etc.
        ui_text: Dict[str, str] = {}

        for match in self.ocr.get_text_regions(image):
            # Categorize text based on position or pattern
            lowered = match.text.lower()
            if "health" in lowered:
                ui_text["health"] = match.text
            elif "mana" in lowered:
                ui_text["mana"] = match.text
            # Add more UI element detection

        return ui_text