<?php
namespace RSSImporter\Included\Importer;
/**
 *  Crawler Object, manages flow of observing, downloading,
 *  rephrasing and re-posting news from several pre-programmed sources onto your WP.
 * 
 * @package RSSImporter
 * @author Ali Khaleghi <awli.khaleghi@gmail.com>
 */

include_once RSS_IMPORTER_DIR_PATH."Include/Importer/PHPWatermark/Watermark.php";
include_once RSS_IMPORTER_DIR_PATH."Include/WPTables/NewsTable.php";

// Target interface; every crawl source must implement it
include_once RSS_IMPORTER_DIR_PATH."Include/Importer/CrawlTargetInterface.php";

// All targets are included eagerly here; looping over glob(CrawlTargets/*.php) would be nicer.
// To be reworked once there is budget for it :)

// irna.ir target
include_once RSS_IMPORTER_DIR_PATH."Include/Importer/CrawlTargets/Irna.php";

// isna.ir target
include_once RSS_IMPORTER_DIR_PATH."Include/Importer/CrawlTargets/Isna.php";

// mehrnews.com target
include_once RSS_IMPORTER_DIR_PATH."Include/Importer/CrawlTargets/MehrNews.php";

// farsnews.ir target
include_once RSS_IMPORTER_DIR_PATH."Include/Importer/CrawlTargets/FarsNews.php";

// khabaronline.ir target
include_once RSS_IMPORTER_DIR_PATH."Include/Importer/CrawlTargets/KhabarOnline.php";

// mashreghnews.ir target
include_once RSS_IMPORTER_DIR_PATH."Include/Importer/CrawlTargets/MashreghNews.php";

// eghtesadotejarat.ir target
include_once RSS_IMPORTER_DIR_PATH."Include/Importer/CrawlTargets/EghtesadoTejarat.php";

// EghtesadGooya.ir target
include_once RSS_IMPORTER_DIR_PATH."Include/Importer/CrawlTargets/EghtesadGooya.php";

// eghtesadnews.com target
include_once RSS_IMPORTER_DIR_PATH."Include/Importer/CrawlTargets/EghtesadNews.php";

// honaronline.ir target
include_once RSS_IMPORTER_DIR_PATH."Include/Importer/CrawlTargets/HonarOnline.php";

include_once RSS_IMPORTER_DIR_PATH."Include/Importer/CrawlTargets/Yjc.php";

//
use \Ajaxray\PHPWatermark\Watermark;

//
class Crawler 
{
    /**
     * Build a crawler instance.
     *
     * Includes Include/Importer/Reader.php (at most once per request);
     * presumably that file declares the Reader class that crawl() instantiates.
     */
    public function __construct()
    {
        // RSS/Atom Reader
        $readerFile = RSS_IMPORTER_DIR_PATH."Include/Importer/Reader.php";

        include_once $readerFile;
    }

    // ----------------------------------------------------------

    /**
     * Fetch a URL with cURL and return the response body.
     *
     * The URL is normalised before the request: it is percent-decoded, split
     * with parse_url() and rebuilt from scheme, host, optional port and the
     * re-encoded path. Query string and fragment are intentionally not
     * re-attached (this matches the previous behaviour).
     *
     * When cURL does not end with HTTP 200, a plain file_get_contents() call is
     * tried as a fallback and its non-empty result is returned instead.
     *
     * @param string $url   URL Address
     *
     * @return mixed   response body as string, or false when the URL is not
     *                 usable or the transfer failed
     */
    public static function request(string $url)
    {
        // rawurldecode() keeps a literal "+" intact (urldecode() would turn it into a space).
        $parts = parse_url(rawurldecode($url));

        // Without scheme and host there is nothing to request; the old code built "://" here.
        if (!is_array($parts) || empty($parts['scheme']) || empty($parts['host'])) {
            return false;
        }

        // Encode every path segment on its own so the "/" separators survive.
        // rawurlencode() emits %20 for spaces, which is what a URL path requires.
        $segments = explode('/', isset($parts['path']) ? $parts['path'] : '');
        $path = implode('/', array_map('rawurlencode', $segments));

        $port = isset($parts['port']) ? ':' . $parts['port'] : '';

        $url = $parts['scheme'] . '://' . $parts['host'] . $port . $path;

        $agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:93.0) Gecko/20100101 Firefox/93.0';
        $headers = [
            'Accept: */*',
            'Accept-Language: fa-IR;q=0.8,en-US;q=0.5,en;q=0.3',
            'Accept-Encoding: ""',
            'Referer: https://www.google.com/search?q=%D8%A8%D8%A7%D8%B4%DA%AF%D8%A7%D9%87+%D8%AE%D8%A8%D8%B1%D9%86%DA%AF%D8%A7%D8%B1%D8%A7%D9%86+%D8%AC%D9%88%D8%A7%D9%86&sxsrf=AOaemvL2I4eaIzZfdPsBvb1H-Kc5RbDlkQ%3A1639887332154&source=hp&ei=5LG-Yb_7BNqNxc8Pjuu7CA&iflsig=ALs-wAMAAAAAYb6_9ExEG5yZIOF4kI3bydOYkApyMqBR&oq=%D8%A8%D8%A7%D8%B4%DA%AF%D8%A7%D9%87&gs_lcp=Cgdnd3Mtd2l6EAMYADIFCAAQywEyBQguEMsBMgUILhDLATIFCAAQywEyBQgAEMsBMgUIABDLATIFCAAQywEyBQgAEMsBMgUILhDLATIFCAAQywE6BwgjEOoCECc6BggjECcQEzoECCMQJzoFCAAQgAQ6BQguEIAEULdLWJtWYP5faAFwAHgAgAHmAYgB-wmSAQMyLTaYAQCgAQGwAQo&sclient=gws-wiz',
            'Connection: keep-alive'
        ];

        $ch = curl_init();

        curl_setopt($ch, CURLOPT_ENCODING, '');
        // NOTE(review): peer verification is disabled, so TLS certificates are not checked.
        // Kept as-is because the target sites' certificate setup is unknown — confirm before enabling.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_USERAGENT, $agent);
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);

        $result = curl_exec($ch);
        $httpcode = curl_getinfo($ch, CURLINFO_HTTP_CODE);

        if ($httpcode != 200) {
            // cURL did not get a clean 200: retry once with the stream wrapper.
            $basicRequest = file_get_contents($url);

            if (!empty($basicRequest)) {
                return $basicRequest;
            }
        }

        return $result;
    }

    // ----------------------------------------------------------

    /**
     * Load a single news page through the crawl target that handles its site.
     *
     * The URL's host is compared against the list of supported domains (exact
     * match or sub-domain such as "www."). If no host can be parsed from the
     * URL, the previous substring check on the whole URL is used instead.
     * The first matching target whose class exists is instantiated and its
     * loadNews() result is returned.
     *
     * @param string $url   URL of the news page
     *
     * @return array   data returned by the crawl target, or an empty array
     *                 when no supported target handles the URL
     */
    public function loadNews(string $url):array
    {
        $supportedSources = [
            'irna.ir'               => 'Irna',
            'isna.ir'               => 'Isna',
            'farsnews.ir'           => 'FarsNews',
            'khabaronline.ir'       => 'KhabarOnline',
            'mehrnews.com'          => 'MehrNews',
            'mashreghnews.ir'       => 'MashreghNews',
            'eghtesadotejarat.ir'   => 'EghtesadoTejarat',
            'eghtesadgooya.ir'      => 'EghtesadGooya',
            'eghtesadnews.com'      => 'EghtesadNews',
            'honaronline.ir'        => 'HonarOnline',
            'yjc.news'              => 'Yjc',
        ];

        // Host names are case-insensitive; normalise once for all comparisons.
        $host = parse_url($url, PHP_URL_HOST);
        $host = is_string($host) ? strtolower($host) : '';

        foreach ($supportedSources as $sourceURL => $objectName) {

            if ($host !== '') {
                // Match the domain itself or any of its sub-domains, but not a
                // URL that merely mentions the domain in its path or query.
                $suffix = '.' . $sourceURL;
                $matches = $host === $sourceURL
                    || substr($host, -strlen($suffix)) === $suffix;
            } else {
                // No parsable host (e.g. scheme-less URL): keep the old substring check.
                $matches = strpos($url, $sourceURL) !== false;
            }

            if (!$matches) {
                continue;
            }

            // fully-qualified class name of the crawl target
            $obj = '\RSSImporter\Included\Importer\CrawlTargets\\'.$objectName;

            // target class not loaded: try the remaining sources
            if (!class_exists($obj)) {
                continue;
            }

            $worker = new $obj;

            return $worker->loadNews($url);
        }

        return [];
    }

    /**
     * Crawl every new item of a feed and store it as pending news.
     *
     * The feed at $feed->feed_url is loaded through the Reader. For each item
     * that is not stored yet, the news page is fetched via loadNews() and
     * handed to savePending(). Items are looked up both by their raw URL and by
     * their url-decoded form, because savePending() stores the decoded URL.
     *
     * @param object $feed   A feed record; reads ->feed_url and ->id
     *
     * @return array|string   ['crawled' => saved news arrays, 'error' => error messages],
     *                        or an error message string when the feed could not be
     *                        loaded or a crawl target failed with an unexpected exception
     */
    public function crawl($feed)
    {
        // RSS/Atom Reader
        $reader = new \RSSImporter\Included\Importer\Reader;

        // predefine result structure
        $result = [
            'crawled'   => [],
            'error'     => [],
        ];

        try {
            // Warnings raised while loading the feed stay suppressed, as before;
            // failures are expected to surface as FeedException.
            $Feed = @$reader::loadRss(isset($feed->feed_url) ? $feed->feed_url : null);

            // nothing usable was loaded, or the feed carries no items
            if (!is_object($Feed) || !$Feed->item) {
                return $result;
            }

            foreach ($Feed->item as $item) {

                // Item exists in our db (stored under its raw URL), skip
                if (self::getNewsByURL($item->url)) {
                    continue;
                }

                $itemURL = (string)$item->url[0];

                // savePending() stores urldecode()d URLs, so a percent-encoded item
                // must also be looked up in decoded form; otherwise it would be
                // downloaded again on every run only to be rejected as a duplicate.
                $decodedURL = urldecode($itemURL);
                if ($decodedURL !== $itemURL && self::getNewsByURL($decodedURL)) {
                    continue;
                }

                // Throttle a little so we do not hammer the source from our own IP.
                // NOTE(review): usleep() takes microseconds, so this is only 0.5 ms — confirm
                // whether 500 ms was intended before raising it (it would slow long crawls).
                usleep(500);

                try {
                    // read the news from the source url
                    $loaded = $this->loadNews($item->url);
                    // remember where it came from
                    $loaded['url'] = $itemURL;

                    try {
                        // save the pending post
                        $this->savePending($feed->id, $loaded);
                    } catch (\Exception $th) {
                        // Best effort: incomplete or duplicate news is skipped silently.
                        continue;
                    }

                    // add to crawled result.
                    $result['crawled'][] = $loaded;
                } catch (\Exception $e) {
                    // A target that could not crawl the page is recorded and the loop goes on;
                    // any other failure aborts the crawl and is reported as a string.
                    if ($e->getMessage() == 'Could not crawl data.') {
                        $result['error'][] = $e->getMessage();
                    } else {
                        return $e->getMessage();
                    }
                }
            }
        } catch (\RSSImporter\Included\Importer\FeedException $th) {
            // feed error
            return $th->getMessage();
        }

        return $result;
    }

    // ----------------------------------------------------------
    /**
     * Methods below are mainly db or crawl-helper related.
     */
    // ----------------------------------------------------------

    /**
     * Get saved news by URL
     *
     * The URL comes from external feeds, so it is bound through
     * $wpdb->prepare() instead of being interpolated into the SQL text.
     *
     * @param string $newsURL   News URL
     * @return object|null   first matching row of the rssimporter_news table, or null when none matches
     */
    public static function getNewsByURL(string $newsURL)
    {
        global $wpdb;

        $table = $wpdb->prefix . 'rssimporter_news';

        $result = $wpdb->get_results(
            $wpdb->prepare("SELECT * FROM {$table} WHERE url = %s", $newsURL)
        );

        return isset($result[0]) ? $result[0] : null;
    }

    // ----------------------------------------------------------

    /**
     * Get saved news by ID
     *
     * @param int $newsID   News ID
     * @return object|null   matching row of the rssimporter_news table, or null when none matches
     */
    public static function getNewsByID(int $newsID)
    {
        global $wpdb;

        $table = $wpdb->prefix . 'rssimporter_news';

        // Prepared like the other lookups in this class; %d binds the id as an integer.
        $result = $wpdb->get_results(
            $wpdb->prepare("SELECT * FROM {$table} WHERE id = %d", $newsID)
        );

        return isset($result[0]) ? $result[0] : null;
    }

    // ----------------------------------------------------------

    /**
     * Check if the given news exists
     *
     * The item's guid is compared against the url column of the
     * rssimporter_news table. The guid comes from an external feed, so it is
     * bound through $wpdb->prepare() rather than interpolated into the query.
     *
     * @param object $item   News RSS Item; reads ->guid
     * @return bool   true when at least one stored row has that url
     */
    public static function newsExists($item)
    {
        global $wpdb;

        $table = $wpdb->prefix . 'rssimporter_news';

        $feed_url = (string)$item->guid;

        $result = $wpdb->get_results(
            $wpdb->prepare("SELECT * FROM {$table} WHERE url = %s", $feed_url)
        );

        return !empty($result);
    }

    // ----------------------------------------------------------

    /**
     * Look up one feed (source) record by its primary key.
     *
     * @param int $id   Feed ID
     * @return object|null   first row of the rssimporter_feeds table with that id,
     *                       or null when there is none
     */
    public static function getSourceByID(int $id)
    {
        global $wpdb;

        $feedsTable = $wpdb->prefix . 'rssimporter_feeds';

        $rows = $wpdb->get_results ( "SELECT * FROM  $feedsTable
            WHERE id = $id
        ");

        // no rows (or no result at all) means the feed does not exist
        if (!isset($rows[0])) {
            return null;
        }

        return $rows[0];
    }

    // ----------------------------------------------------------

    /**
     * Placeholder for keyword extraction.
     *
     * Not implemented: the argument is ignored and null is always returned.
     *
     * @param mixed $string   text to analyse (currently unused)
     * @return null
     */
    public static function extractCommonWords($string)
    {
        $keywords = null;

        return $keywords;
    }

    // ----------------------------------------------------------

    /**
     * Publish a stored news record as a WordPress post (status "pending").
     *
     * Steps, in order:
     *  - load the news row and the feed it belongs to;
     *  - resolve the post categories (either the feed's fixed category or a
     *    category chain created from the news' own comma separated categories);
     *  - resolve the post author (current user, or the plugin's own user);
     *  - build the content (intro + body, with the feed's replacements applied,
     *    optionally followed by a link to the source);
     *  - insert the post, set its tags, mark the news row as published and
     *    store the intro under the theme's lead meta key when one is configured;
     *  - download the thumbnail, optionally watermark it, and set it as the
     *    post's featured image.
     *
     * @param int $newsID   ID of the row in the rssimporter_news table
     *
     * @return int|\WP_Error   ID of the created post; the falsy/WP_Error value returned by
     *                         wp_insert_post() when the post could not be created
     *
     * @throws \InvalidArgumentException   when the news row or its feed does not exist
     */
    public static function publishNewsByID(int $newsID)
    {
        global $wpdb;

        // The function name previously carried a trailing space, so this check never matched.
        if ( ! function_exists( 'wp_generate_attachment_metadata' ) ) {
            include_once( ABSPATH . 'wp-admin/includes/image.php' );
        }

        $news = self::getNewsByID($newsID);
        if (!$news) {
            throw new \InvalidArgumentException("News #{$newsID} was not found.");
        }

        $source = self::getSourceByID((int)$news->feed_id);
        if (!$source) {
            throw new \InvalidArgumentException("Feed #{$news->feed_id} of news #{$newsID} was not found.");
        }

        // imported_categories == 0 means "use the categories that came with the news"
        if ($source->imported_categories == 0) {
            $catIDs = self::resolveCategoryIds((string)$news->categories);
        } else {
            $catIDs = [$source->imported_categories];
        }

        $authorID = self::resolvePublisherUserId();

        // A feed without (valid) replaceables simply has nothing to replace.
        $repl = json_decode((string)$source->replaceables, TRUE);
        if (!is_array($repl)) {
            $repl = [];
        }

        $content = '';

        $intro = self::crawledContent($news->intro ?: "", $repl);

        // Themes that have a dedicated "lead" meta field get the intro there (see below);
        // otherwise the intro is prepended to the post content.
        $newsLeadMetaKEY = get_option("RSSImporter_theme_lead_metakey");

        if (!$newsLeadMetaKEY) {
            $content .= $intro;
        }

        $content .= self::crawledContent($news->full_content ?: "", $repl);

        if ($source->add_source_link) {
            $content .= "<br/>";

            $_url = pathinfo($source->feed_url, PATHINFO_DIRNAME);

            // feed name and URL are stored free text: escape them for the HTML they end up in
            $content .= "<a href='" . esc_url($_url) . "'>" . esc_html($source->name) . "</a>";
        }

        $keywords = $news->tags;

        $date = _jdate(time());

        $now = $date->toCarbon()->format("Y-m-d H:i:s");

        $new_post = array(
            'post_title' => $news->title,
            'post_content' => $content,
            'post_status' => 'pending',
            'post_date' => $now,
            'post_author' => $authorID,
            'post_type' => 'post',
            'post_category' => $catIDs,
            'meta_input' => array(
                'news_url' => $news->url,
            )
        );
        $post_id = wp_insert_post($new_post);

        // No post was created: leave the news row untouched so it can be retried.
        if (!$post_id || is_wp_error($post_id)) {
            return $post_id;
        }

        if ($keywords) {
            // add tags
            wp_set_post_terms( $post_id, explode(",", $keywords));
        }

        $table = $wpdb->prefix . 'rssimporter_news';

        // update news draft status to published
        $wpdb->update($table, [
            'status' => 1,
            'updated_at'    => $now // important updated_at
        ], ['id'=>$news->id]);

        // THE NEWS LEAD, EACH TEMPLATE HAS ITS OWN META KEY FOR NEWS LEAD
        if ($newsLeadMetaKEY && $intro) {
            add_post_meta( $post_id, $newsLeadMetaKEY, $intro, true );
        }

        if (!empty($news->thumbnail)) {
            $contents = self::request($news->thumbnail);

            if ($contents) {
                self::attachThumbnail((int)$post_id, $news, $source, $contents);

                // update saved recount
                \NewsFeedTable::record_count();
                \NewsFeedTable::record_count_posted();
            }
        }

        return $post_id;
    }

    /**
     * Turn a comma separated category list into category term IDs, creating
     * missing categories; each newly created category becomes a child of the
     * category that precedes it in the list.
     *
     * @param string $categories   comma separated category names
     * @return array   term IDs, in list order
     */
    private static function resolveCategoryIds(string $categories): array
    {
        $catIDs = [];
        $parentID = 0;

        foreach (explode(",", $categories) as $cat) {
            // cant have empty category
            $cat = trim($cat);
            if (empty($cat)) {
                continue;
            }

            $category = get_term_by('name', $cat, 'category');

            if ($category) {
                $termID = (int)$category->term_id;
            } else {
                $args = ['slug' => $cat];
                if ($parentID) {
                    // link the new category to the one before it
                    $args['parent'] = $parentID;
                }

                $created = wp_insert_term($cat, 'category', $args);

                // creation failed (WP_Error): skip this category, keep the chain as it is
                if (is_wp_error($created) || empty($created['term_id'])) {
                    continue;
                }

                $termID = (int)$created['term_id'];
            }

            $catIDs[] = $termID;
            $parentID = $termID;
        }

        return $catIDs;
    }

    /**
     * Resolve the author for published posts: the current user when there is
     * one, otherwise the plugin's own "ri_publisher" user, which is created on
     * first use and remembered in the RSSImporter_publisher_user_id option.
     *
     * @return int   user ID, or 0 when the plugin user could not be created
     */
    private static function resolvePublisherUserId(): int
    {
        global $user_ID;

        if ($user_ID) {
            return (int)$user_ID;
        }

        // no current user (e.g. cron): use the plugin's custom user
        $stored = get_option("RSSImporter_publisher_user_id");
        if ($stored) {
            return (int)$stored;
        }

        // The user may exist from an earlier run that never stored its ID.
        $existing = get_user_by('login', 'ri_publisher');

        if ($existing) {
            $publisherID = (int)$existing->ID;
        } else {
            $publisherID = wp_insert_user( array(
                'user_login' => 'ri_publisher',
                'user_pass' => wp_generate_password(24, true, true), // nobody logs in with it
                'user_email' => '',
                'first_name' => 'مدیر اخبار',
                'last_name' => '',
                'display_name' => 'مدیر اخبار',
                'role' => 'editor'
            ));

            if (is_wp_error($publisherID)) {
                return 0;
            }
        }

        // remember the user so later runs do not try to create it again
        update_option("RSSImporter_publisher_user_id", $publisherID);

        return (int)$publisherID;
    }

    /**
     * Store downloaded thumbnail bytes in the uploads folder, watermark them
     * when a watermark image and position are configured, and set the file as
     * the post's featured image.
     *
     * @param int    $postID     post receiving the thumbnail
     * @param object $news       news row; reads ->thumbnail and ->title
     * @param object $source     feed row; reads ->watermark_position
     * @param string $contents   raw image bytes
     * @return void
     */
    private static function attachThumbnail(int $postID, $news, $source, string $contents)
    {
        // Name the file after the URL path only, so a "?query" does not end up in the
        // file name (which would also hide the extension from wp_check_filetype()).
        $urlPath = parse_url($news->thumbnail, PHP_URL_PATH);
        $fileName = basename(is_string($urlPath) && $urlPath !== '' ? $urlPath : $news->thumbnail);

        if ($fileName === '') {
            return;
        }

        $uploaddir = wp_upload_dir();
        $uploadfile = $uploaddir['path'] . '/' . $fileName;

        if (file_put_contents($uploadfile, $contents) === false) {
            return;
        }

        $watermarkImg = get_option("RSSImporter_watermark_path");

        if ($watermarkImg && file_exists($watermarkImg) && !empty($source->watermark_position)) {
            // stored position names mapped to the names passed to setPosition()
            $positions = [
                'POSITION_TOP_LEFT'     => 'NorthWest',
                'POSITION_TOP'          => 'North',
                'POSITION_TOP_RIGHT'    => 'NorthEast',
                'POSITION_LEFT'         => 'West',
                'POSITION_CENTER'       => 'Center',
                'POSITION_RIGHT'        => 'East',
                'POSITION_BOTTOM_LEFT'  => 'SouthWest',
                'POSITION_BOTTOM'       => 'South',
                'POSITION_BOTTOM_RIGHT' => 'SouthEast',
            ];

            $position = isset($positions[$source->watermark_position])
                ? $positions[$source->watermark_position]
                : 'SouthEast';

            // the watermarked image overwrites the uploaded file
            $watermark = new Watermark($uploadfile);
            $watermark->setPosition($position);
            $watermark->withImage($watermarkImg, $uploadfile);
        }

        $wp_filetype = wp_check_filetype($fileName, null);

        $attachment = array(
            'post_mime_type' => $wp_filetype['type'],
            'post_title' => $fileName,
            'post_content' => $news->title,
            'post_status' => 'inherit'
        );

        $attach_id = wp_insert_attachment($attachment, $uploadfile);

        if (!$attach_id || is_wp_error($attach_id)) {
            return;
        }

        $fullsizepath = get_attached_file($attach_id);
        $attach_data = wp_generate_attachment_metadata($attach_id, $fullsizepath);
        wp_update_attachment_metadata($attach_id, $attach_data);
        set_post_thumbnail($postID, $attach_id);
    }

    // ----------------------------------------------------------

    /**
     * Create a feed record from the submitted form.
     *
     * Reads from $_POST: feed_url, name, description, replace_from[] /
     * replace_to[] (paired by index into a replacement map), select_category +
     * cat, change_links_to + change_links_to_url and watermark_position.
     *
     * WordPress adds slashes to $_POST, so the values are unslashed before they
     * are stored, and the duplicate lookup binds the URL through
     * $wpdb->prepare() instead of interpolating it into the SQL text.
     *
     * NOTE(review): no nonce or capability check happens here — presumably the
     * caller does it; confirm.
     *
     * @return mixed   result of $wpdb->insert()
     *
     * @throws \Exception   when a feed with the same feed_url already exists
     */
    public function addFeed()
    {
        global $wpdb;

        $table = $wpdb->prefix . 'rssimporter_feeds';

        $post = wp_unslash($_POST);

        $replaceables = [];
        $from = isset($post['replace_from']) ? $post['replace_from'] : [];
        $to = isset($post['replace_to']) ? $post['replace_to'] : [];

        if (!empty($from) && !empty($to) && is_array($from) && is_array($to)) {
            foreach ($from as $key => $value) {
                // only scalar search terms can serve as array keys
                if (!is_scalar($value)) {
                    continue;
                }
                $replaceables[$value] = isset($to[$key]) ? $to[$key] : null;
            }
        }

        $feed_url = isset($post['feed_url']) ? $post['feed_url'] : null;
        $name = isset($post['name']) ? $post['name'] : null;
        $description = isset($post['description']) ? $post['description'] : null;

        $result = $wpdb->get_results(
            $wpdb->prepare("SELECT * FROM {$table} WHERE feed_url = %s", (string)$feed_url)
        );

        if (!empty($result)) {
            throw new \Exception("این فید در سیستم وجود دارد", 1);
        }

        $date = _jdate(time());

        $now = $date->toCarbon()->format("Y-m-d H:i:s");

        // a fixed category is used only when the checkbox and a category are both submitted
        $toCat = (!empty($post['select_category']) && !empty($post['cat']))
            ? (int)$post['cat']
            : 0;

        $change_links_to = (!empty($post['change_links_to']) && !empty($post['change_links_to_url']))
            ? $post['change_links_to_url']
            : '';

        return $wpdb->insert($table, array(
            'name'      => $name,
            'description' => $description,
            'replaceables'  => json_encode($replaceables),
            'feed_url' => $feed_url,
            'imported_categories' => $toCat,
            'change_links_to' => $change_links_to,
            'watermark_position' => !empty($post['watermark_position']) ? $post['watermark_position'] : '',
            'add_source_link' => '',
            'is_active' => '1',
            'crawled_at' => null,
            'created_at' => $now
        ));
    }

    // ----------------------------------------------------------

    /**
     * Write new column values to one feed row.
     *
     * @param int $id       ID of the feed row to change
     * @param array $data   column => value pairs to store
     *
     * @return mixed   whatever $wpdb->update() returns
     */
    public static function updateFeed(int $id, array $data)
    {
        global $wpdb;

        $feedsTable = $wpdb->prefix . 'rssimporter_feeds';
        $where = ['id' => $id];

        $outcome = $wpdb->update($feedsTable, $data, $where);

        return $outcome;
    }

    // ----------------------------------------------------------

    /**
     * Count the stored news rows that belong to a feed.
     *
     * @param int $feedID   feed id
     * @param bool $saved   accepted for compatibility; currently ignored, every
     *                      row of the feed is counted regardless of its status
     *
     * @return mixed   the COUNT(*) value as returned by $wpdb->get_var()
     */
    public static function countNewsForFeed(int $feedID, bool $saved = false)
    {
        global $wpdb;

        $newsTable = $wpdb->prefix . 'rssimporter_news';

        return $wpdb->get_var(
            "SELECT COUNT(*) FROM {$newsTable} as feeds WHERE feed_id = $feedID"
        );
    }

    // ----------------------------------------------------------

    /**
     * Get a list of feeds to crawl
     *
     * Returns every row of the rssimporter_feeds table whose is_active flag is 1.
     *
     * @return array feeds
     */
    public function getActiveFeeds()
    {
        global $wpdb;

        $table = $wpdb->prefix . 'rssimporter_feeds';

        // (the unused read of $_POST['feed_url'] that used to sit here was dropped)
        $result = $wpdb->get_results ( "SELECT * FROM  $table
            WHERE is_active = 1
        ");

        return $result;
    }

    // ----------------------------------------------------------

    /**
     * Apply a feed's search => replace map to a piece of crawled text.
     *
     * Replacements run one after another in map order, so a later pair also
     * sees text produced by an earlier pair. Line breaks are then converted
     * with nl2br() and surrounding whitespace is trimmed.
     *
     * @param string $content       text to run the replacements on
     * @param array  $data          map of search string => replacement
     *
     * @return string
     */
    protected static function crawledContent(string $content, array $data)
    {
        $replaced = $content;

        foreach (array_keys($data) as $needle) {
            $replaced = str_replace($needle, $data[$needle], $replaced);
        }

        $withBreaks = nl2br($replaced);

        return trim($withBreaks);
    }

    // ----------------------------------------------------------
    
    /**
     * Store a crawled news item as a pending (status 0) row.
     *
     * The item's URL is stored url-decoded. Intro and body get the feed's
     * replacements applied before they are saved; categories and tags given as
     * arrays are stored as comma separated strings.
     *
     * The URL originates from external feeds, so the duplicate lookup binds it
     * through $wpdb->prepare() instead of interpolating it into the SQL text.
     *
     * @param int   $feedId   ID of the feed the news belongs to
     * @param array $news     crawled data; reads title, body, url, introtext,
     *                        main_image, categories and tags
     *
     * @return mixed   result of $wpdb->insert()
     *
     * @throws \Exception   when title or body is missing, when the feed does not exist,
     *                      or when the feed already holds a row for this URL
     */
    public function savePending(int $feedId, array $news)
    {
        global $wpdb;

        if (empty($news['title']) || empty($news['body'])) {
            throw new \Exception("Incomplete news data.", 1);
        }

        $table = $wpdb->prefix . 'rssimporter_news';

        $newsURL = urldecode(isset($news['url']) ? (string)$news['url'] : '');

        $exists = $wpdb->get_results(
            $wpdb->prepare(
                "SELECT * FROM {$table} WHERE `feed_id` = %d AND `url` = %s",
                $feedId,
                $newsURL
            )
        );

        if (!empty($exists)) {
            throw new \Exception("Cannot save duplicate news.", 1);
        }

        $feed = self::getSourceByID($feedId);
        if (!$feed) {
            // previously this ended in a TypeError further down
            throw new \Exception("Feed not found.", 1);
        }

        // A feed without (valid) replaceables simply has nothing to replace.
        $repl = json_decode((string)$feed->replaceables, TRUE);
        if (!is_array($repl)) {
            $repl = [];
        }

        $cats = isset($news['categories']) ? $news['categories'] : null;
        if (is_array($cats)) {
            $cats = implode(",", $cats);
        }

        $tags = isset($news['tags']) ? $news['tags'] : '';
        if (is_array($tags)) {
            $tags = implode(",", $tags);
        }

        $intro = self::crawledContent(!empty($news['introtext']) ? $news['introtext'] : "", $repl);
        $content = self::crawledContent($news['body'], $repl);

        $date = _jdate(time());

        $now = $date->toCarbon()->format("Y-m-d H:i:s");

        $go = array(
            'url' => $newsURL,
            'thumbnail' => isset($news['main_image']) ? $news['main_image'] : null,
            'feed_id' => $feedId,
            'title' => $news['title'],
            'intro' => $intro,
            'full_content' => $content,
            'categories' => $cats,
            'tags' => $tags,
            'status' => '0',
            'created_at' => $now,
            'updated_at' => $now
        );

        return $wpdb->insert($table, $go);
    }
}

/**
 * Print the last database error of $wpdb, if there is one.
 *
 * Outputs an HTML block holding the escaped error message and the escaped
 * query that caused it; prints nothing when $wpdb->last_error is empty.
 *
 * @return void
 */
function my_print_error(){

    global $wpdb;

    if ($wpdb->last_error === '') {
        return;
    }

    // The message lives in last_error; last_result is the row array of the
    // previous query and cannot be passed to htmlspecialchars().
    $str   = htmlspecialchars( $wpdb->last_error, ENT_QUOTES );
    $query = htmlspecialchars( $wpdb->last_query, ENT_QUOTES );

    print "<div id='error'>
        <p class='wpdberror'><strong>WordPress database error:</strong> [$str]<br />
        <code>$query</code></p>
        </div>";

}