HEX
Server: Apache/2.4.6 (CentOS) OpenSSL/1.0.2k-fips PHP/7.4.30
System: Linux iZj6c1151k3ad370bosnmsZ 3.10.0-1160.76.1.el7.x86_64 #1 SMP Wed Aug 10 16:21:17 UTC 2022 x86_64
User: root (0)
PHP: 7.4.30
Disabled: NONE
Upload Files
File: //var/www/html/phpmyfaq/src/phpMyFAQ/LinkVerifier.php
<?php

/**
 * The LinkVerifier class provides methods and functions for verifying URLs.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public License,
 * v. 2.0. If a copy of the MPL was not distributed with this file, You can
 * obtain one at http://mozilla.org/MPL/2.0/.
 *
 * The Initial Developer of the Original Code is released for external use
 * with permission from NetJapan, Inc. IT Administration Group.
 *
 * @package   phpMyFAQ
 * @author    Minoru TODA <todam@netjapan.co.jp>
 * @author    Matteo Scaramuccia <matteo@scaramuccia.com>
 * @author    Thorsten Rinne <thorsten@phpmyfaq.de>
 * @copyright 2005-2022 NetJapan, Inc. and phpMyFAQ Team
 * @license   http://www.mozilla.org/MPL/2.0/ Mozilla Public License Version 2.0
 * @link      https://www.phpmyfaq.de
 * @since     2005-08-01
 */

namespace phpMyFAQ;

/**
 * Class LinkVerifier
 *
 * Extracts href/src URLs from FAQ content, probes them with HTTP HEAD
 * requests and records the resulting link state on the FAQ record.
 *
 * @package phpMyFAQ
 */
class LinkVerifier
{
    /**
     * Maximum number of redirects followed for one URL before the check is
     * reported as failed.
     */
    private const LINKVERIFIER_MAX_REDIRECT_COUNT = 10;

    /**
     * Number of seconds to wait while opening the connection to the remote server.
     */
    private const LINKVERIFIER_CONNECT_TIMEOUT = 5;

    /**
     * Number of seconds to wait for the remote server to send data once connected.
     */
    private const LINKVERIFIER_RESPONSE_TIMEOUT = 10;

    /**
     * URLs found by the last parseString() call, grouped by the attribute
     * name ("href" or "src") they were found in.
     *
     * @var array
     */
    private $urlPool = [];

    /**
     * Protocols that are never verified, as a map of lower-cased protocol
     * prefix (e.g. "ftp:") to the message explaining why it is skipped.
     *
     * @var array
     */
    private $invalidProtocols = [];

    /**
     * Results of the last verifyURLs() run, indexed by attribute type and raw URL.
     *
     * @var array
     */
    private $lastResult = [];

    /**
     * Hosts that failed DNS resolution or could not be connected to, as a
     * map of host name to true. Such hosts are not contacted again.
     *
     * @var array
     */
    private $slowHosts = [];

    /**
     * User, as passed to the constructor.
     *
     * @var string|null
     */
    private $user = null;

    /**
     * Configuration used for the reference URL, settings and database access.
     *
     * @var Configuration
     */
    private $config = null;

    /**
     * Creates a link verifier and registers the protocols it will not check.
     *
     * HTTPS is only skipped when the OpenSSL extension is not loaded; all
     * other listed schemes are always skipped.
     *
     * @param Configuration $config
     * @param string        $user   User
     */
    public function __construct(Configuration $config, string $user = null)
    {
        global $PMF_LANG;

        $this->config = $config;
        $this->user = $user;

        // "file" and "javascript" are not real network protocols, but they are
        // skipped in exactly the same way as the schemes listed before them.
        $unsupportedSchemes = ['ftp', 'gopher', 'mailto', 'telnet', 'feed', 'file', 'javascript'];

        if (!extension_loaded('openssl')) {
            array_unshift($unsupportedSchemes, 'https');
        }

        foreach ($unsupportedSchemes as $scheme) {
            $this->addIgnoreProtocol(
                $scheme . ':',
                sprintf($PMF_LANG['ad_linkcheck_protocol_unsupported'], $scheme)
            );
        }
    }

    /**
     * Registers a protocol prefix that must not be verified, together with
     * the message shown for it. The prefix is stored lower-cased.
     *
     * @param string $protocol Protocol prefix including the colon, e.g. "ftp:"
     * @param string $message  Explanation stored for the protocol
     *
     * @return bool true if the protocol was registered, false for an empty prefix
     */
    protected function addIgnoreProtocol(string $protocol = '', string $message = ''): bool
    {
        if ('' === $protocol) {
            return false;
        }

        $this->invalidProtocols[strtolower($protocol)] = $message;

        return true;
    }

    /**
     * Returns the URLs collected by the most recent parseString() call.
     *
     * @return array URLs grouped by the attribute they were found in ("href" or "src")
     */
    public function getUrlPool()
    {
        return $this->urlPool;
    }

    /**
     * Returns the HTML snippet shown for a FAQ entry in the entry listing.
     *
     * When the verifier is not ready, or no state can be read for the entry,
     * a "disabled" icon is returned. Otherwise a link is returned that lets
     * the admin trigger an on-demand verification via JavaScript.
     *
     * @param int    $id      FAQ record id
     * @param string $faqLang FAQ record language
     *
     * @return string
     */
    public function getEntryStateHTML(int $id = 0, string $faqLang = '')
    {
        global $PMF_LANG;

        $disabledMarkup = sprintf(
            '<span class="fa-stack" aria-hidden="true"><i class="fa fa-link fa-stack-1x"></i>' .
            '<i class="fa fa-ban fa-stack-2x text-danger" title="%s"></i></span>',
            $PMF_LANG['ad_linkcheck_feedback_url-disabled']
        );

        // Feature disabled.
        if ($this->isReady() === false) {
            return $disabledMarkup;
        }

        // getEntryState() yields false on a failed query and may yield null
        // when no row matches; both mean there is nothing to show.
        $state = $this->getEntryState($id, $faqLang, false);
        if ($state === false || $state === null) {
            return $disabledMarkup;
        }

        // true means "no stored state yet".
        if ($state === true) {
            $state = 'noscript';
        }

        // NOTE(review): language codes are expected to be plain identifiers
        // such as "en" or "pt-br"; escaping is a no-op for those values.
        $safeLang = \htmlspecialchars($faqLang, ENT_QUOTES, 'UTF-8');
        $safeState = \htmlspecialchars((string) $state, ENT_QUOTES, 'UTF-8');

        // Fall back to the raw state name if no translation exists for it,
        // instead of triggering an undefined index notice.
        $label = $PMF_LANG['ad_linkcheck_feedback_url-' . $state] ?? $safeState;

        return sprintf(
            '<div id="%s" class="url-%s"><span id="%s"><a href="javascript:onDemandVerifyURL(%d,\'%s\');">%s</a>' .
            '</span></div>',
            'divurl_' . $safeLang . '_' . $id,
            $safeState,
            'spanurl_' . $safeLang . '_' . $id,
            $id,
            $safeLang,
            $label
        );
    }

    /**
     * Returns whether link verifier is ready to verify URLs.
     *
     * The verifier needs a reference URL to resolve relative links against,
     * so it is ready exactly when the configured default URL is non-empty.
     * (The previous check was inverted and reported "not ready" for every
     * configured URL, which made verification unreachable.)
     *
     * @return bool true if ready to verify URLs, otherwise false
     */
    public function isReady(): bool
    {
        $defaultUrl = $this->config->getDefaultUrl();

        return null !== $defaultUrl && '' !== trim((string) $defaultUrl);
    }

    /**
     * Retrieves the stored link state of a FAQ record and validates its timestamp.
     *
     * Return values:
     *  - false:  the query failed or the record does not exist
     *  - true:   no state is stored yet, or ($checkDate only) the stored
     *            state is older than the validation interval
     *  - string: the stored link state
     *
     * @param int    $id        FAQ record id
     * @param string $faqLang   FAQ record language
     * @param bool   $checkDate Whether an outdated state should be reported as true
     *
     * @return bool|string
     */
    public function getEntryState($id = 0, $faqLang = '', $checkDate = false)
    {
        $db = $this->config->getDb();
        $oldestAcceptedCheck = $this->getURLValidateInterval();

        $query = sprintf(
            "
            SELECT
                links_state, links_check_date
            FROM
                %sfaqdata
            WHERE
                id = %d
            AND
                lang = '%s'",
            Database::getTablePrefix(),
            $id,
            $db->escape($faqLang)
        );

        $result = $db->query($query);
        if (!$result) {
            return false;
        }

        // Only the first matching row is relevant.
        $row = $db->fetchObject($result);
        if (!$row) {
            // Previously this case fell off the end of the method and returned null.
            return false;
        }

        $linkState = $row->links_state;
        if ('' === trim((string) $linkState)) {
            $linkState = true;
        }

        $isOutdated = !($row->links_check_date > $oldestAcceptedCheck);
        if ($checkDate && $isOutdated) {
            return true;
        }

        return $linkState;
    }

    /**
     * Returns the oldest timestamp at which a stored link validation result
     * is still considered current.
     *
     * The maximum age comes from the "main.urlValidateInterval" setting and
     * defaults to one day (86400 seconds) when that setting is empty.
     *
     * @return int
     */
    public function getURLValidateInterval(): int
    {
        $configuredAge = $this->config->get('main.urlValidateInterval');

        // Re-check links once a day unless an interval is configured.
        $maxAge = ($configuredAge != '') ? $configuredAge : 86400;

        return $_SERVER['REQUEST_TIME'] - $maxAge;
    }

    /**
     * Verifies all links in the given article content and updates the
     * links_state database entry of the FAQ record.
     *
     * The content is parsed for href/src URLs, each URL is checked against
     * the configured reference URL, and the summarised state is stored via
     * markEntry(). For interactive calls an HTML report is returned.
     *
     * @param string $contents Article content (HTML)
     * @param int    $id       FAQ record id
     * @param string $faqLang  FAQ record language
     * @param bool   $cron     true when run unattended; suppresses all output
     *
     * @return string HTML text, if $cron is false (default), otherwise ''
     */
    public function verifyArticleURL($contents = '', $id = 0, $faqLang = '', $cron = false)
    {
        global $PMF_LANG;

        $referenceUrl = $this->config->getDefaultUrl();

        // Without a reference URL relative links cannot be resolved.
        if ('' === trim((string) $referenceUrl)) {
            return $cron
                ? ''
                : sprintf('<p class="alert alert-warning">%s</p>', $PMF_LANG['ad_linkcheck_noReferenceURL']);
        }

        if ($this->isReady() === false) {
            return $cron
                ? ''
                : sprintf('<p class="alert alert-warning">%s</p>', $PMF_LANG['ad_linkcheck_noAllowUrlOpen']);
        }

        // Parse contents, verify URLs and persist the summarised state.
        $this->parseString($contents);
        $result = $this->verifyURLs($referenceUrl);
        $this->markEntry($id, $faqLang);

        // Unattended runs only need the database update done above.
        if ($cron) {
            return '';
        }

        // If no URLs found
        if (empty($result)) {
            return sprintf(
                '<h3>%s</h3><p class="alert alert-info">%s</p>',
                $PMF_LANG['ad_linkcheck_checkResult'],
                $PMF_LANG['ad_linkcheck_noLinksFound']
            );
        }

        $failReasons = [];
        $infoReasons = [];

        $output = '    <h3>' . $PMF_LANG['ad_linkcheck_checkResult'] . "</h3>\n";
        $output .= '    <table class="table">' . "\n";

        foreach ($result as $type => $entries) {
            $output .= '        <tr><td><strong>' . Strings::htmlspecialchars($type) . "</strong></td></tr>\n";

            foreach ($entries as $entry) {
                // Attribute context: escape quotes as well. Double encoding is
                // disabled because URLs taken from HTML may already contain "&amp;".
                $hrefUrl = \htmlspecialchars((string) $entry['absurl'], ENT_QUOTES, 'UTF-8', false);
                $textUrl = Strings::htmlspecialchars($entry['absurl']);

                if ($entry['valid'] === true) {
                    $rowClass = 'urlsuccess';
                    $redirects = (isset($entry['redirects']) && ($entry['redirects'] > 0))
                        ? '(' . $entry['redirects'] . ')'
                        : '';
                    $status = $PMF_LANG['ad_linkcheck_checkSuccess'] . $redirects;

                    if ($entry['reason'] != '') {
                        $infoReasons[] = sprintf($PMF_LANG['ad_linkcheck_openurl_infoprefix'], $textUrl) .
                            $entry['reason'];
                    }
                } else {
                    $rowClass = 'urlfail';
                    $status = $PMF_LANG['ad_linkcheck_checkFailed'];

                    if ($entry['reason'] != '') {
                        $failReasons[] = $entry['reason'];
                    }
                }

                // One row per URL: indent cell, link cell, status cell.
                $output .= '        <tr class="' . $rowClass . '">' . "\n";
                $output .= "            <td></td>\n";
                $output .= '            <td><a href="' . $hrefUrl . '" target="_blank">' . $textUrl . "</a></td>\n";
                $output .= '            <td class="' . $rowClass . '">' . $status . "</td>\n";
                $output .= "        </tr>\n";
            }
        }
        $output .= "    </table>\n";

        // Reasons are built by openURL() and already contain HTML markup.
        if (count($failReasons) > 0) {
            $output .= "    <br>\n    <strong>" . $PMF_LANG['ad_linkcheck_failReason'] . "</strong>\n    <ul>\n";
            foreach ($failReasons as $reason) {
                $output .= '        <li>' . $reason . "</li>\n";
            }
            $output .= "    </ul>\n";
        }

        if (count($infoReasons) > 0) {
            $output .= "    <br>\n    <strong>" . $PMF_LANG['ad_linkcheck_infoReason'] . "</strong>\n    <ul>\n";
            foreach ($infoReasons as $reason) {
                $output .= '        <li>' . $reason . "</li>\n";
            }
            $output .= "    </ul>\n";
        }

        return $output;
    }

    /**
     * Extracts the values of all href and src attributes from the given
     * HTML, stores them in the URL pool (replacing its previous content)
     * and returns how many were found.
     *
     * @param string $string HTML to scan
     *
     * @return int Number of URLs found
     */
    public function parseString($string = '')
    {
        // Start from an empty pool on every call.
        $this->urlPool = [];
        $found = 0;

        foreach (['href', 'src'] as $type) {
            $hits = [];
            preg_match_all("|[^?&]$type\=(\"?'?`?)([[:alnum:]\:\#%?=;&@/\ \.\_\-\{\}]+)\\1|i", $string, $hits);

            // Capture group 2 holds the attribute value without its quotes.
            foreach ($hits[2] as $url) {
                $this->urlPool[$type][] = $url;
                ++$found;
            }
        }

        return $found;
    }

    /**
     * Performs link validation for every URL in the URL pool.
     *
     * Each distinct URL per attribute type is checked only once. The result
     * for a URL is an array with the keys type, rawurl, reference, absurl,
     * valid, redirects and reason.
     *
     * @param string $referenceUri URL that relative links are resolved against
     *
     * @return array Results indexed by attribute type and raw URL
     */
    public function verifyURLs($referenceUri = '')
    {
        $this->lastResult = [];

        foreach ($this->urlPool as $type => $urls) {
            foreach ($urls as $url) {
                // Skip URLs already verified in this run. The original check
                // looked at an undefined variable and therefore never matched.
                if (isset($this->lastResult[$type][$url])) {
                    continue;
                }

                // Expand uri into absolute URL.
                $absoluteUrl = $this->makeAbsoluteURL($url, $referenceUri);
                [$valid, $redirects, $reason] = $this->openURL($absoluteUrl);

                $this->lastResult[$type][$url] = [
                    'type' => $type,
                    'rawurl' => $url,
                    'reference' => $referenceUri,
                    'absurl' => $absoluteUrl,
                    'valid' => $valid,
                    'redirects' => $redirects,
                    'reason' => $reason,
                ];
            }
        }

        return $this->lastResult;
    }

    /**
     * Converts a relative URI into an absolute URI using a reference URI.
     * For example:
     *   $relativeUri  = "test/foo.html"
     *   $referenceUri = "http://example.com:8000/sample/index.php"
     * will generate "http://example.com:8000/sample/test/foo.html".
     *
     * URIs using an ignored protocol and absolute http(s) URIs are returned
     * unchanged. Protocol-relative URIs ("//host/path") inherit the scheme
     * of the reference URI. Dot segments ("./", "../") are not normalised.
     *
     * @param string $relativeUri
     * @param string $referenceUri
     *
     * @return string $result
     */
    protected function makeAbsoluteURL($relativeUri = '', $referenceUri = '')
    {
        $relativeUri = (string) $relativeUri;

        // Leave ignored protocols and absolute http(s) URIs untouched.
        // URI schemes are case-insensitive.
        $untouchedPrefixes = array_merge(array_keys($this->invalidProtocols), ['http://', 'https://']);
        foreach ($untouchedPrefixes as $prefix) {
            $prefix = (string) $prefix;
            if ('' !== $prefix && 0 === strncasecmp($relativeUri, $prefix, strlen($prefix))) {
                return $relativeUri;
            }
        }

        // Split reference uri into parts; fall back to safe defaults if it is malformed.
        $parts = parse_url((string) $referenceUri);
        if (!is_array($parts)) {
            $parts = [];
        }
        $scheme = $parts['scheme'] ?? 'http';
        $host = $parts['host'] ?? '';
        $port = isset($parts['port']) ? ':' . $parts['port'] : '';

        // Protocol-relative URI: only the scheme comes from the reference.
        if (0 === strpos($relativeUri, '//')) {
            return $scheme . ':' . $relativeUri;
        }

        if (0 !== strpos($relativeUri, '/')) {
            // Resolve against the directory of the reference path, i.e. the
            // path up to (not including) its last slash:
            //   "/sample/index.php" => "/sample", "/faq/" => "/faq", "" => ""
            $referencePath = str_replace('\\', '/', $parts['path'] ?? '');
            $lastSlash = strrpos($referencePath, '/');
            $baseDir = (false === $lastSlash) ? '' : substr($referencePath, 0, $lastSlash);

            $relativeUri = $baseDir . '/' . $relativeUri;
        }

        return sprintf('%s://%s%s%s', $scheme, $host, $port, $relativeUri);
    }

    /**
     * Checks whether a URL can be opened by sending an HTTP HEAD request.
     * If $redirect is specified, it is resolved against $url and requested
     * instead (used to follow Location: redirects).
     *
     * Hosts that cannot be resolved, connected to, or that do not answer in
     * time are remembered and not contacted again by this instance.
     *
     * @param string $url           URL to check
     * @param string $redirect      Redirect target reported for $url, if any
     * @param int    $redirectCount Number of redirects followed so far
     *
     * @return array [bool $valid, int $redirectCount, string $reason (HTML)]
     */
    protected function openURL($url = '', $redirect = '', $redirectCount = 0)
    {
        global $PMF_LANG;

        // If prerequisites fail
        if (false === $this->isReady()) {
            return [false, $redirectCount, $PMF_LANG['ad_linkcheck_openurl_notready']];
        }

        // If destination is blank, fail. Checked before the redirect guard so
        // that a blank URL is not misreported as a redirect loop.
        if ('' === trim((string) $url)) {
            return [false, $redirectCount, $PMF_LANG['ad_linkcheck_openurl_urlisblank']];
        }

        // Too many redirects, or a redirect pointing at itself?
        if (($redirectCount >= self::LINKVERIFIER_MAX_REDIRECT_COUNT) || ($url == $redirect)) {
            return [
                false,
                $redirectCount,
                sprintf(
                    $PMF_LANG['ad_linkcheck_openurl_maxredirect'],
                    self::LINKVERIFIER_MAX_REDIRECT_COUNT
                ),
            ];
        }

        if ('' !== $redirect) {
            $url = $this->makeAbsoluteURL($redirect, $url);
        }

        // Parse URL; parse_url() returns false for seriously malformed input.
        $urlParts = parse_url($url);
        if (!is_array($urlParts)) {
            $urlParts = [];
        }
        $urlParts += [
            'scheme' => 'http',
            // HTTP_HOST is not set when running from the command line (cron).
            'host' => $_SERVER['HTTP_HOST'] ?? '',
            'user' => '',
            'pass' => '',
            'path' => '/',
            'query' => '',
            'fragment' => '',
        ];

        $scheme = strtolower($urlParts['scheme']);
        $host = $urlParts['host'];
        $port = $urlParts['port'] ?? (('https' === $scheme) ? 443 : 80);

        // Hack: fix unsafe space chars in the path to avoid HTTP 400 status during HEAD crawling.
        // The fragment is never sent: it is not part of an HTTP request target.
        $requestTarget = str_replace(' ', '%20', $urlParts['path']);
        if ('' !== $urlParts['query']) {
            $requestTarget .= '?' . $urlParts['query'];
        }

        $safeHost = Strings::htmlspecialchars($host);
        $tooSlow = [false, $redirectCount, sprintf($PMF_LANG['ad_linkcheck_openurl_tooslow'], $safeHost)];

        // Check whether we tried the host before
        if (isset($this->slowHosts[$host])) {
            return $tooSlow;
        }

        // Check whether the hostname exists
        if (gethostbynamel($host) === false) {
            $this->slowHosts[$host] = true;

            return [false, $redirectCount, sprintf($PMF_LANG['ad_linkcheck_openurl_nodns'], $safeHost)];
        }

        // Open socket for remote server with timeout (default: 5secs).
        $socketHost = (extension_loaded('openssl') && ('https' === $scheme)) ? 'ssl://' . $host : $host;
        // Connection failures are expected here and handled below, hence the suppression.
        $fp = @fsockopen($socketHost, $port, $errno, $errstr, self::LINKVERIFIER_CONNECT_TIMEOUT);

        if (!$fp) {
            $this->slowHosts[$host] = true;

            return $tooSlow;
        }

        // Wait for data with timeout (default: 10secs).
        stream_set_timeout($fp, self::LINKVERIFIER_RESPONSE_TIMEOUT, 0);
        fwrite(
            $fp,
            'HEAD ' . $requestTarget . " HTTP/1.0\r\n" .
            'Host: ' . $host . "\r\n" .
            // Be polite: let our probe declare itself
            "User-Agent: phpMyFAQ Link Checker\r\n" .
            "\r\n"
        );

        // feof() alone never becomes true when the read times out, so the
        // stream metadata has to be checked to avoid looping forever.
        $rawResponse = '';
        $timedOut = false;
        while (!feof($fp)) {
            $chunk = fread($fp, 4096);
            $meta = stream_get_meta_data($fp);
            if (false === $chunk || !empty($meta['timed_out'])) {
                $timedOut = true;
                break;
            }
            $rawResponse .= $chunk;
        }
        fclose($fp);

        if ($timedOut) {
            $this->slowHosts[$host] = true;

            return $tooSlow;
        }

        // Parse response
        $code = 0;
        $allowVerbs = 'n/a';
        $location = $url;
        $lines = explode("\r\n", $rawResponse);
        $httpStatusMsg = strip_tags($lines[count($lines) - 1]);

        // Header field names are case-insensitive.
        foreach ($lines as $line) {
            if (preg_match('/^HTTP\/[^ ]+ ([0-9]+) .*$/', $line, $matches)) {
                $code = (int) $matches[1];
            }
            if (preg_match('/^Location: (.*)$/i', $line, $matches)) {
                $location = trim($matches[1]);
            }
            if (preg_match('/^Allow: (.*)$/i', $line, $matches)) {
                $allowVerbs = $matches[1];
            }
        }

        // Process response code
        switch ($code) {
            case 200: // OK
                $reason = ($redirectCount > 0)
                    ? sprintf($PMF_LANG['ad_linkcheck_openurl_redirected'], Strings::htmlspecialchars($url))
                    : '';

                return [true, $redirectCount, $reason];
            case 301: // Moved Permanently
            case 302: // Found
            case 303: // See Other
            case 307: // Temporary Redirect
            case 308: // Permanent Redirect
                return $this->openURL($url, $location, $redirectCount + 1);
            case 400: // Bad Request
                // The server-provided text must not be part of the format string.
                return [
                    false,
                    $redirectCount,
                    sprintf($PMF_LANG['ad_linkcheck_openurl_ambiguous'], $code) . '<br>' . $httpStatusMsg,
                ];
            case 404: // Not found
                return [false, $redirectCount, sprintf($PMF_LANG['ad_linkcheck_openurl_not_found'], $safeHost)];
            case 300: // Multiple choices
            case 401: // Unauthorized (but it's there. right ?)
                return [true, $redirectCount, sprintf($PMF_LANG['ad_linkcheck_openurl_ambiguous'], $code)];
            case 405: // Method Not Allowed
                // TODO: Add a fallback to use GET method, otherwise this link should be marked as bad
                // The Allow header is remote input that ends up in HTML output, so it is escaped.
                return [
                    true,
                    $redirectCount,
                    sprintf(
                        $PMF_LANG['ad_linkcheck_openurl_not_allowed'],
                        $safeHost,
                        Strings::htmlspecialchars($allowVerbs)
                    ),
                ];
            default: // All other statuses
                return [false, $redirectCount, sprintf($PMF_LANG['ad_linkcheck_openurl_ambiguous'], $code)];
        }
    }

    /**
     * Stores the current link state and check time for the specified entry.
     *
     * When no state is given, the state is derived from the last
     * verification result via getLinkStateString().
     *
     * @param int    $id      FAQ record id, must be >= 1
     * @param string $faqLang FAQ record language, must not be blank
     * @param string $state   (optional) state to store
     *
     * @return bool true if operation successful, otherwise false
     */
    public function markEntry($id = 0, $faqLang = '', $state = '')
    {
        if (($id < 1) || (trim($faqLang) == '')) {
            return false;
        }

        if ($state == '') {
            $state = $this->getLinkStateString();
        }

        $db = $this->config->getDb();

        // Both string values are escaped before being placed into the query.
        $query = sprintf(
            "
            UPDATE
                %sfaqdata
            SET
                links_state = '%s', links_check_date = %d
            WHERE
                id = %d
            AND
                lang = '%s'",
            Database::getTablePrefix(),
            $db->escape($state),
            $_SERVER['REQUEST_TIME'],
            $id,
            $db->escape($faqLang)
        );

        return (bool) $db->query($query);
    }

    /**
     * Summarises the last verification result as a status string.
     *
     * "nolinks" - no links were found in contents
     * "linkok"  - link(s) were found and were all ok
     * "linkbad" - link(s) were found and at least one link was broken
     *
     * @return string
     */
    public function getLinkStateString()
    {
        $total = 0;
        $broken = 0;

        foreach ($this->lastResult as $entries) {
            $total += count($entries);

            foreach ($entries as $entry) {
                if (!$entry['valid']) {
                    ++$broken;
                }
            }
        }

        if (0 === $total) {
            return 'nolinks';
        }

        return (0 === $broken) ? 'linkok' : 'linkbad';
    }
}