<?php
namespace RSSImporter\Included\Importer\CrawlTargets;

use \RSSImporter\Included\Importer\Crawler;
use \RSSImporter\Included\Importer\CrawlTargetInterface;

/**
 * Crawl target that extracts a single news article from an EghtesadGooya page.
 *
 * The page is fetched through Crawler::request(), parsed with DOMDocument and
 * queried with XPath. All selectors are tied to the site's current markup.
 */
class EghtesadGooya extends Crawler implements CrawlTargetInterface
{
    /**
     * Downloads the article at $url and extracts its fields.
     *
     * @param string $url Address of the article page.
     *
     * @return array Empty array when no title or no body could be extracted;
     *               otherwise an array with the keys 'main_image' (string|null),
     *               'title', 'introtext', 'body' (HTML string), 'categories'
     *               and 'tags' (both lists of strings).
     *
     * @throws \Exception When the request yields no page data.
     */
    public function loadNews(string $url):array
    {
        $pageData = Crawler::request($url);
        if (!$pageData) {
            throw new \Exception("Could not crawl data.", 1);
        }

        $dom = new \DOMDocument();
        $dom->validateOnParse = true; // must be set before the document is loaded

        // Real-world markup is rarely valid; collect libxml complaints internally
        // instead of silencing everything with "@", then restore the old mode.
        $previousErrorMode = libxml_use_internal_errors(true);
        // The XML declaration prefix makes libxml read the markup as UTF-8.
        $dom->loadHTML('<?xml encoding="utf-8" ?>' . $pageData);
        libxml_clear_errors();
        libxml_use_internal_errors($previousErrorMode);

        // Drop the "related news" box before any content is extracted.
        $relatedNews = $dom->getElementById('related-news');
        if ($relatedNews instanceof \DOMElement && $relatedNews->parentNode !== null) {
            $relatedNews->parentNode->removeChild($relatedNews);
        }

        $finder = new \DomXPath($dom);
        $postData = [];

        // Featured image: prefer @src, fall back to @data-src when @src is missing or empty.
        $postData['main_image'] = $this->queryFirstValue($finder, "//div[@class='single-featured']/a/img/@src");
        if (!$postData['main_image']) {
            $postData['main_image'] = $this->queryFirstValue($finder, "//div[@class='single-featured']/a/img/@data-src");
        }

        // Title
        $postData['title'] = trim((string) $this->queryFirstValue($finder, "//h1[@class='single-post-title']/span"));
        // Lead
        $postData['introtext'] = trim((string) $this->queryFirstValue($finder, "//div[contains(@class, 'single-post-excerpt') and contains(@class, 'post-excerpt-at')]"));

        // News body. saveHTML() must only be called with an actual node: passing
        // null would serialise the WHOLE document and defeat the emptiness check below.
        $bodyNode = $this->queryFirstNode($finder, "//div[contains(@class, 'continue-reading-content') and contains(@class, 'close')]");
        $bodyHtml = $bodyNode !== null ? trim((string) $dom->saveHTML($bodyNode)) : '';

        // EghtesadGooya has extra dirty stuff in their posts! remove it!
        $postData['body'] = $this->removeXmlIslands($bodyHtml);

        // Categories
        $postData['categories'] = $this->extractCategories($finder);
        // Tags
        $postData['tags'] = $this->extractTags($finder);

        if (empty($postData['title']) || empty($postData['body'])) {
            return [];
        }

        return $postData;
    }

    /**
     * Returns the first node matched by an XPath expression.
     *
     * @param \DomXPath $finder     XPath evaluator bound to the parsed page.
     * @param string    $expression XPath expression to evaluate.
     *
     * @return \DOMNode|null The first match, or null when nothing matched or
     *                       the expression could not be evaluated.
     */
    private function queryFirstNode(\DomXPath $finder, string $expression)
    {
        $matches = $finder->query($expression);
        if (!$matches instanceof \DOMNodeList) {
            return null;
        }

        return $matches->item(0);
    }

    /**
     * Returns the text content of the first node matched by an XPath expression
     * (for attribute nodes this is the attribute value).
     *
     * @param \DomXPath $finder     XPath evaluator bound to the parsed page.
     * @param string    $expression XPath expression to evaluate.
     *
     * @return string|null The untrimmed text, or null when nothing matched.
     */
    private function queryFirstValue(\DomXPath $finder, string $expression)
    {
        $node = $this->queryFirstNode($finder, $expression);

        return $node !== null ? $node->textContent : null;
    }

    /**
     * Removes embedded <xml ...>...</xml> islands from an HTML fragment.
     *
     * The "s" modifier lets "." cross line breaks so multi-line islands are
     * removed, and the lazy quantifier stops at the nearest closing tag so
     * content between two separate islands is preserved.
     *
     * @param string $html HTML fragment to clean.
     *
     * @return string The cleaned fragment; the input is returned unchanged if
     *                the regular expression engine reports an error.
     */
    private function removeXmlIslands(string $html):string
    {
        $cleaned = preg_replace('/<xml\b[^>]*>.*?<\/xml>/is', '', $html);

        // preg_replace() returns null on PCRE failure (e.g. backtrack limit).
        return $cleaned !== null ? $cleaned : $html;
    }

    /**
     * Reads the category names from the page's breadcrumb list.
     *
     * @param \DomXPath $finder XPath evaluator bound to the parsed page.
     *
     * @return array List of trimmed category names, in breadcrumb order.
     */
    private function extractCategories(\DomXPath $finder):array
    {
        $entries = $finder->query('(//ul[@class="bf-breadcrumb-items"])/li');

        $categories = [];
        if (!$entries instanceof \DOMNodeList) {
            return $categories;
        }

        foreach ($entries as $index => $item) {
            // First element is the home address, not a category!
            if ($index === 0 || !$item instanceof \DOMElement) {
                continue;
            }

            // Last element does not contain an <a> tag, and does not need one
            // since it's not a category but the title of the current news.
            $anchor = $item->getElementsByTagName('a')->item(0);
            if ($anchor === null) {
                continue;
            }

            $categories[] = trim($anchor->textContent);
        }

        return $categories;
    }

    /**
     * Returns the tags of the article.
     *
     * @param \DomXPath $finder XPath evaluator bound to the parsed page (unused).
     *
     * @return array Always an empty list.
     */
    private function extractTags(\DomXPath $finder):array
    {
        // No #Tags defined on EghtesadGooya.ir news.
        return [];
    }
}
