<?php
/**
 * AWS Textract Service
 * Handles all AWS Textract API interactions for document analysis
 */

namespace Redact\Classes;

/**
 * Client for the synchronous AWS Textract AnalyzeDocument operation.
 *
 * Requests are signed with AWS Signature Version 4 and sent with cURL. Every
 * request method returns a result array instead of throwing:
 *  - on success: ['success' => true, 'data' => array]
 *  - on failure: ['success' => false, 'error' => string, 'curl_error' => string, 'http_code' => int]
 */
class TextractService
{
    /** Service name used in the SigV4 credential scope and signing key. */
    private const SERVICE = 'textract';

    /** Value of the x-amz-target header for AnalyzeDocument requests. */
    private const ANALYZE_DOCUMENT_TARGET = 'Textract.AnalyzeDocument';

    /** Maximum total time, in seconds, allowed for one HTTP request. */
    private const TIMEOUT_SECONDS = 60;

    private string $awsKey;
    private string $awsSecret;
    private string $region;

    /**
     * @param string $awsKey AWS access key ID
     * @param string $awsSecret AWS secret access key
     * @param string $region AWS region used for the endpoint host and the signature scope
     */
    public function __construct(string $awsKey, string $awsSecret, string $region = 'us-east-1')
    {
        $this->awsKey = $awsKey;
        $this->awsSecret = $awsSecret;
        $this->region = $region;
    }

    /**
     * Analyze document with Textract (LAYOUT feature)
     *
     * @param string $imageData Binary image data
     * @return array Result array with success status and data
     */
    public function analyzeDocument(string $imageData): array
    {
        return $this->sendAnalyzeDocument($imageData, ['LAYOUT']);
    }

    /**
     * Analyze document with Textract (TABLES and FORMS features)
     *
     * @param string $imageData Binary image data
     * @param array $featureTypes Feature types to analyze (default: ['TABLES', 'FORMS'])
     * @return array Result array with success status and data
     */
    public function analyzeDocumentTables(string $imageData, array $featureTypes = ['TABLES', 'FORMS']): array
    {
        return $this->sendAnalyzeDocument($imageData, $featureTypes);
    }

    /**
     * Analyze document with the LAYOUT, TABLES and FORMS features in a single API call
     *
     * @param string $imageData Binary image data
     * @return array Result array with success status and data
     */
    public function analyzeDocumentFull(string $imageData): array
    {
        return $this->sendAnalyzeDocument($imageData, ['LAYOUT', 'TABLES', 'FORMS']);
    }

    /**
     * Parse table data from Textract response
     *
     * Each returned table has the keys 'id', 'confidence', 'rows', 'rowCount' and
     * 'columnCount'. 'rows' is keyed by zero-based row index, then by zero-based
     * column index; each cell has 'text', 'confidence', 'rowSpan', 'columnSpan'
     * and 'isHeader'. Blocks that lack the fields needed to place them are skipped.
     *
     * @param array $textractResponse Textract API response
     * @return array Parsed tables with rows and cells
     */
    public function parseTables(array $textractResponse): array
    {
        $blocks = $textractResponse['Blocks'] ?? null;
        if (!is_array($blocks)) {
            return [];
        }

        // Index blocks by ID so relationship IDs can be resolved with a direct lookup.
        $blockMap = [];
        foreach ($blocks as $block) {
            if (is_array($block) && isset($block['Id'])) {
                $blockMap[$block['Id']] = $block;
            }
        }

        $tables = [];
        foreach ($blocks as $block) {
            if (is_array($block) && ($block['BlockType'] ?? null) === 'TABLE') {
                $tables[] = $this->buildTable($block, $blockMap);
            }
        }

        return $tables;
    }

    /**
     * Build, sign and send one AnalyzeDocument request.
     *
     * @param string $imageData Binary image data
     * @param array $featureTypes Feature type names to request
     * @return array Result array with success status and data
     */
    private function sendAnalyzeDocument(string $imageData, array $featureTypes): array
    {
        $endpoint = "https://textract.{$this->region}.amazonaws.com/";

        $payload = json_encode([
            'Document' => ['Bytes' => base64_encode($imageData)],
            // array_values() keeps this a JSON array even when the caller's keys are not 0..n-1.
            'FeatureTypes' => array_values($featureTypes),
        ]);

        if ($payload === false) {
            return $this->failure('Failed to encode request payload: ' . json_last_error_msg());
        }

        $headers = $this->createAWSSignature(
            'POST',
            $endpoint,
            self::SERVICE,
            $payload,
            self::ANALYZE_DOCUMENT_TARGET
        );

        return $this->executeCurlRequest($endpoint, $headers, $payload);
    }

    /**
     * Assemble one parsed table from a TABLE block.
     *
     * @param array $tableBlock The TABLE block
     * @param array $blockMap All blocks keyed by block ID
     * @return array Parsed table (id, confidence, rows, rowCount, columnCount)
     */
    private function buildTable(array $tableBlock, array $blockMap): array
    {
        $rows = [];
        $rowCount = 0;
        $columnCount = 0;

        foreach ($this->childIds($tableBlock) as $cellId) {
            $cell = $blockMap[$cellId] ?? null;
            if ($cell === null || ($cell['BlockType'] ?? null) !== 'CELL') {
                continue;
            }

            // A cell without a position cannot be placed in the grid.
            if (!isset($cell['RowIndex'], $cell['ColumnIndex'])) {
                continue;
            }

            // Convert the one-based indexes in the response to zero-based array keys.
            $rowIndex = (int) $cell['RowIndex'] - 1;
            $colIndex = (int) $cell['ColumnIndex'] - 1;
            $entityTypes = $cell['EntityTypes'] ?? [];

            $rows[$rowIndex][$colIndex] = [
                'text' => $this->extractCellText($cell, $blockMap),
                'confidence' => $cell['Confidence'] ?? 0,
                'rowSpan' => $cell['RowSpan'] ?? 1,
                'columnSpan' => $cell['ColumnSpan'] ?? 1,
                'isHeader' => is_array($entityTypes) && in_array('COLUMN_HEADER', $entityTypes, true),
            ];

            $rowCount = max($rowCount, $rowIndex + 1);
            $columnCount = max($columnCount, $colIndex + 1);
        }

        // Cells may be listed in any order; sort rows and columns by index.
        // Iterating over keys avoids leaving a dangling by-reference loop variable.
        ksort($rows);
        foreach (array_keys($rows) as $rowIndex) {
            ksort($rows[$rowIndex]);
        }

        return [
            'id' => $tableBlock['Id'] ?? null,
            'confidence' => $tableBlock['Confidence'] ?? 0,
            'rows' => $rows,
            'rowCount' => $rowCount,
            'columnCount' => $columnCount,
        ];
    }

    /**
     * Join the sanitized text of a cell's WORD children with single spaces.
     *
     * @param array $cell The CELL block
     * @param array $blockMap All blocks keyed by block ID
     * @return string Trimmed cell text ('' when the cell has no words)
     */
    private function extractCellText(array $cell, array $blockMap): string
    {
        $words = [];
        foreach ($this->childIds($cell) as $wordId) {
            $word = $blockMap[$wordId] ?? null;
            if ($word === null || ($word['BlockType'] ?? null) !== 'WORD') {
                continue;
            }

            // A WORD block without a string Text contributes an empty word instead of a TypeError.
            $rawText = $word['Text'] ?? '';
            $words[] = $this->sanitizeText(is_string($rawText) ? $rawText : '');
        }

        return trim(implode(' ', $words));
    }

    /**
     * Collect the IDs listed in a block's CHILD relationships, in order.
     *
     * @param array $block Any Textract block
     * @return array List of child block IDs (strings or ints only)
     */
    private function childIds(array $block): array
    {
        $relationships = $block['Relationships'] ?? [];
        if (!is_array($relationships)) {
            return [];
        }

        $ids = [];
        foreach ($relationships as $relationship) {
            if (!is_array($relationship) || ($relationship['Type'] ?? null) !== 'CHILD') {
                continue;
            }

            $relatedIds = $relationship['Ids'] ?? [];
            if (!is_array($relatedIds)) {
                continue;
            }

            foreach ($relatedIds as $id) {
                // Only values usable as array keys can be looked up in the block map.
                if (is_string($id) || is_int($id)) {
                    $ids[] = $id;
                }
            }
        }

        return $ids;
    }

    /**
     * Create AWS Signature Version 4
     *
     * Signs the host and x-amz-date headers together with the SHA-256 hash of
     * the payload. The canonical query string is always empty.
     *
     * @param string $method HTTP method
     * @param string $endpoint API endpoint
     * @param string $service AWS service name
     * @param string $payload Request payload
     * @param string|null $target API target action
     * @return array Headers array
     */
    private function createAWSSignature(
        string $method,
        string $endpoint,
        string $service,
        string $payload,
        ?string $target = null
    ): array {
        $algorithm = 'AWS4-HMAC-SHA256';

        // Derive both date strings from one timestamp so they cannot disagree
        // when the request is signed across a UTC midnight boundary.
        $now = time();
        $amzDate = gmdate('Ymd\THis\Z', $now);
        $dateStamp = gmdate('Ymd', $now);

        $urlParts = parse_url($endpoint);
        $host = $urlParts['host'];
        $uri = $urlParts['path'] ?? '/';

        $canonicalHeaders = "host:$host\n" . "x-amz-date:$amzDate\n";
        $signedHeaders = 'host;x-amz-date';

        $payloadHash = hash('sha256', $payload);
        // The empty line after the URI is the (empty) canonical query string.
        $canonicalRequest = "$method\n$uri\n\n$canonicalHeaders\n$signedHeaders\n$payloadHash";

        $credentialScope = "$dateStamp/{$this->region}/$service/aws4_request";
        $stringToSign = "$algorithm\n$amzDate\n$credentialScope\n" . hash('sha256', $canonicalRequest);

        // Derive the signing key by chaining raw-binary HMACs: date, region, service, terminator.
        $kDate = hash_hmac('sha256', $dateStamp, "AWS4{$this->awsSecret}", true);
        $kRegion = hash_hmac('sha256', $this->region, $kDate, true);
        $kService = hash_hmac('sha256', $service, $kRegion, true);
        $kSigning = hash_hmac('sha256', 'aws4_request', $kService, true);

        $signature = hash_hmac('sha256', $stringToSign, $kSigning);

        $authorizationHeader = "$algorithm Credential={$this->awsKey}/$credentialScope, "
            . "SignedHeaders=$signedHeaders, Signature=$signature";

        $headers = [
            'Authorization' => $authorizationHeader,
            'x-amz-date' => $amzDate,
            'Content-Type' => 'application/x-amz-json-1.1',
        ];

        if ($target !== null && $target !== '') {
            $headers['x-amz-target'] = $target;
        }

        return $headers;
    }

    /**
     * Execute cURL request
     *
     * Transport failures, non-200 responses and 200 responses whose body does
     * not decode to a JSON object/array are all reported as failures.
     *
     * @param string $endpoint API endpoint
     * @param array $headers Request headers
     * @param string $payload Request payload
     * @return array Result with success status and data
     */
    private function executeCurlRequest(string $endpoint, array $headers, string $payload): array
    {
        $ch = curl_init($endpoint);
        if ($ch === false) {
            return $this->failure('Unable to initialise cURL', 'curl_init failed');
        }

        $curlHeaders = [];
        foreach ($headers as $name => $value) {
            $curlHeaders[] = "$name: $value";
        }

        curl_setopt_array($ch, [
            CURLOPT_HTTPHEADER => $curlHeaders,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_POST => true,
            CURLOPT_POSTFIELDS => $payload,
            CURLOPT_SSL_VERIFYPEER => true,
            CURLOPT_TIMEOUT => self::TIMEOUT_SECONDS,
        ]);

        $response = curl_exec($ch);
        $httpCode = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $curlError = curl_error($ch);

        // curl_close() has no effect from PHP 8.0 onward; it is only needed for resource handles.
        if (PHP_VERSION_ID < 80000) {
            curl_close($ch);
        }

        // curl_exec() returns false when the transfer itself failed (DNS, TLS, timeout, ...).
        if ($response === false) {
            return $this->failure($curlError, $curlError, $httpCode);
        }

        if ($httpCode !== 200) {
            return $this->failure($response, $curlError, $httpCode);
        }

        $data = json_decode($response, true);
        if (!is_array($data)) {
            return $this->failure(
                'Response body is not a JSON object: ' . json_last_error_msg(),
                $curlError,
                $httpCode
            );
        }

        return [
            'success' => true,
            'data' => $data,
        ];
    }

    /**
     * Build the failure result array shared by all error paths.
     *
     * @param string $error Response body or description of the failure
     * @param string $curlError cURL error message, if any
     * @param int $httpCode HTTP status code (0 when no response was received)
     * @return array Failure result
     */
    private function failure(string $error, string $curlError = '', int $httpCode = 0): array
    {
        return [
            'success' => false,
            'error' => $error,
            'curl_error' => $curlError,
            'http_code' => $httpCode,
        ];
    }

    /**
     * Sanitize text to ensure valid UTF-8
     * Prevents JSON encoding errors from malformed UTF-8
     *
     * Every step keeps the previous value when it fails, so this method never
     * returns a non-string.
     *
     * @param string $text Text to sanitize
     * @return string Sanitized text
     */
    private function sanitizeText(string $text): string
    {
        // Re-encode UTF-8 to UTF-8 to replace invalid byte sequences.
        if (function_exists('mb_convert_encoding')) {
            $converted = mb_convert_encoding($text, 'UTF-8', 'UTF-8');
            if (is_string($converted)) {
                $text = $converted;
            }
        }

        // Additional cleanup using iconv; it returns false on failure.
        if (function_exists('iconv')) {
            $converted = iconv('UTF-8', 'UTF-8//IGNORE', $text);
            if ($converted !== false) {
                $text = $converted;
            }
        }

        // Remove null bytes and other control characters (except newlines/tabs).
        $cleaned = preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/u', '', $text);
        if ($cleaned !== null) {
            return $cleaned;
        }

        // The /u pattern fails on invalid UTF-8, so the text is still malformed:
        // fall back to a byte-wise filter that keeps only tab, newline, carriage
        // return and printable ASCII.
        return preg_replace('/[^\x09\x0A\x0D\x20-\x7E]/', '', $text) ?? '';
    }
}

