Fix error when creating short URL for page with unsupported encoding

This commit is contained in:
Alejandro Celaya 2025-01-28 10:04:30 +01:00
parent 88e97f18ad
commit e9fe1ac5d4
4 changed files with 112 additions and 13 deletions

View File

@ -19,6 +19,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com), and this
### Fixed
* [#2341](https://github.com/shlinkio/shlink/issues/2341) Ensure all asynchronous jobs that interact with the database do not leave idle connections open.
* [#2334](https://github.com/shlinkio/shlink/issues/2334) Improve how page titles are encoded to UTF-8, falling back from mbstring to iconv if available, and ultimately using the original title in case of error, but never causing the short URL creation to fail.
# [4.4.0] - 2024-12-27

View File

@ -227,6 +227,7 @@ return [
ShortUrl\Helper\ShortUrlTitleResolutionHelper::class => [
'httpClient',
Config\Options\UrlShortenerOptions::class,
'Logger_Shlink',
],
ShortUrl\Helper\ShortUrlRedirectionBuilder::class => [
Config\Options\TrackingOptions::class,

View File

@ -4,14 +4,19 @@ declare(strict_types=1);
namespace Shlinkio\Shlink\Core\ShortUrl\Helper;
use Closure;
use Fig\Http\Message\RequestMethodInterface;
use GuzzleHttp\ClientInterface;
use GuzzleHttp\RequestOptions;
use Laminas\Stdlib\ErrorHandler;
use Psr\Http\Message\ResponseInterface;
use Psr\Log\LoggerInterface;
use Shlinkio\Shlink\Core\Config\Options\UrlShortenerOptions;
use Throwable;
use function function_exists;
use function html_entity_decode;
use function iconv;
use function mb_convert_encoding;
use function preg_match;
use function str_contains;
@ -30,9 +35,14 @@ readonly class ShortUrlTitleResolutionHelper implements ShortUrlTitleResolutionH
// Matches the charset inside a Content-Type header
private const string CHARSET_VALUE = '/charset=([^;]+)/i';
/**
* @param (Closure(): bool)|null $isIconvInstalled
*/
public function __construct(
private ClientInterface $httpClient,
private UrlShortenerOptions $options,
private LoggerInterface $logger,
private Closure|null $isIconvInstalled = null,
) {
}
@ -58,7 +68,7 @@ readonly class ShortUrlTitleResolutionHelper implements ShortUrlTitleResolutionH
}
$title = $this->tryToResolveTitle($response, $contentType);
return $title !== null ? $data->withResolvedTitle($title) : $data;
return $title !== null ? $data->withResolvedTitle(html_entity_decode(trim($title))) : $data;
}
private function fetchUrl(string $url): ResponseInterface|null
@ -84,6 +94,7 @@ readonly class ShortUrlTitleResolutionHelper implements ShortUrlTitleResolutionH
{
$collectedBody = '';
$body = $response->getBody();
// With streaming enabled, we can walk the body until the </title> tag is found, and then stop
while (! str_contains($collectedBody, '</title>') && ! $body->eof()) {
$collectedBody .= $body->read(1024);
@ -95,12 +106,48 @@ readonly class ShortUrlTitleResolutionHelper implements ShortUrlTitleResolutionH
return null;
}
// Get the page's charset from Content-Type header
preg_match(self::CHARSET_VALUE, $contentType, $charsetMatches);
$titleInOriginalEncoding = $titleMatches[1];
$title = isset($charsetMatches[1])
? mb_convert_encoding($titleMatches[1], 'utf8', $charsetMatches[1])
: $titleMatches[1];
return html_entity_decode(trim($title));
// Get the page's charset from Content-Type header, or return title as is if not found
preg_match(self::CHARSET_VALUE, $contentType, $charsetMatches);
if (! isset($charsetMatches[1])) {
return $titleInOriginalEncoding;
}
$pageCharset = $charsetMatches[1];
return $this->encodeToUtf8WithMbString($titleInOriginalEncoding, $pageCharset)
?? $this->encodeToUtf8WithIconv($titleInOriginalEncoding, $pageCharset)
?? $titleInOriginalEncoding;
}
private function encodeToUtf8WithMbString(string $titleInOriginalEncoding, string $pageCharset): string|null
{
try {
return mb_convert_encoding($titleInOriginalEncoding, 'utf-8', $pageCharset);
} catch (Throwable $e) {
$this->logger->warning('It was impossible to encode page title in UTF-8 with mb_convert_encoding. {e}', [
'e' => $e,
]);
return null;
}
}
private function encodeToUtf8WithIconv(string $titleInOriginalEncoding, string $pageCharset): string|null
{
$isIconvInstalled = ($this->isIconvInstalled ?? fn () => function_exists('iconv'))();
if (! $isIconvInstalled) {
$this->logger->warning('Missing iconv extension. Skipping title encoding');
return null;
}
try {
ErrorHandler::start();
$title = iconv($pageCharset, 'utf-8', $titleInOriginalEncoding);
ErrorHandler::stop(throw: true);
return $title ?: null;
} catch (Throwable $e) {
$this->logger->warning('It was impossible to encode page title in UTF-8 with iconv. {e}', ['e' => $e]);
return null;
}
}
}

View File

@ -16,6 +16,7 @@ use PHPUnit\Framework\Attributes\TestWith;
use PHPUnit\Framework\MockObject\Builder\InvocationMocker;
use PHPUnit\Framework\MockObject\MockObject;
use PHPUnit\Framework\TestCase;
use Psr\Log\LoggerInterface;
use Shlinkio\Shlink\Core\Config\Options\UrlShortenerOptions;
use Shlinkio\Shlink\Core\ShortUrl\Helper\ShortUrlTitleResolutionHelper;
use Shlinkio\Shlink\Core\ShortUrl\Model\ShortUrlCreation;
@ -25,10 +26,12 @@ class ShortUrlTitleResolutionHelperTest extends TestCase
private const string LONG_URL = 'http://foobar.com/12345/hello?foo=bar';
private MockObject & ClientInterface $httpClient;
private MockObject & LoggerInterface $logger;
protected function setUp(): void
{
$this->httpClient = $this->createMock(ClientInterface::class);
$this->logger = $this->createMock(LoggerInterface::class);
}
#[Test]
@ -90,14 +93,59 @@ class ShortUrlTitleResolutionHelperTest extends TestCase
}
#[Test]
#[TestWith(['TEXT/html; charset=utf-8'], 'charset')]
#[TestWith(['TEXT/html'], 'no charset')]
public function titleIsUpdatedWhenItCanBeResolvedFromResponse(string $contentType): void
#[TestWith(['TEXT/html', false], 'no charset')]
#[TestWith(['TEXT/html; charset=utf-8', false], 'mbstring-supported charset')]
#[TestWith(['TEXT/html; charset=Windows-1255', true], 'mbstring-unsupported charset')]
public function titleIsUpdatedWhenItCanBeResolvedFromResponse(string $contentType, bool $expectsWarning): void
{
$data = ShortUrlCreation::fromRawData(['longUrl' => self::LONG_URL]);
$this->expectRequestToBeCalled()->willReturn($this->respWithTitle($contentType));
if ($expectsWarning) {
$this->logger->expects($this->once())->method('warning')->with(
'It was impossible to encode page title in UTF-8 with mb_convert_encoding. {e}',
$this->isArray(),
);
} else {
$this->logger->expects($this->never())->method('warning');
}
$result = $this->helper(autoResolveTitles: true)->processTitle($data);
$data = ShortUrlCreation::fromRawData(['longUrl' => self::LONG_URL]);
$result = $this->helper(autoResolveTitles: true, iconvEnabled: true)->processTitle($data);
self::assertNotSame($data, $result);
self::assertEquals('Resolved "title"', $result->title);
}
#[Test]
#[TestWith([
'contentType' => 'text/html; charset=Windows-1255',
'iconvEnabled' => false,
'expectedSecondMessage' => 'Missing iconv extension. Skipping title encoding',
])]
#[TestWith([
'contentType' => 'text/html; charset=foo',
'iconvEnabled' => true,
'expectedSecondMessage' => 'It was impossible to encode page title in UTF-8 with iconv. {e}',
])]
public function warningsLoggedWhenTitleCannotBeEncodedToUtf8(
string $contentType,
bool $iconvEnabled,
string $expectedSecondMessage,
): void {
$this->expectRequestToBeCalled()->willReturn($this->respWithTitle($contentType));
$callCount = 0;
$this->logger->expects($this->exactly(2))->method('warning')->with($this->callback(
function (string $message) use (&$callCount, $expectedSecondMessage): bool {
$callCount++;
if ($callCount === 1) {
return $message === 'It was impossible to encode page title in UTF-8 with mb_convert_encoding. {e}';
}
return $message === $expectedSecondMessage;
},
));
$data = ShortUrlCreation::fromRawData(['longUrl' => self::LONG_URL]);
$result = $this->helper(autoResolveTitles: true, iconvEnabled: $iconvEnabled)->processTitle($data);
self::assertNotSame($data, $result);
self::assertEquals('Resolved "title"', $result->title);
@ -143,11 +191,13 @@ class ShortUrlTitleResolutionHelperTest extends TestCase
return $body;
}
private function helper(bool $autoResolveTitles = false): ShortUrlTitleResolutionHelper
private function helper(bool $autoResolveTitles = false, bool $iconvEnabled = false): ShortUrlTitleResolutionHelper
{
return new ShortUrlTitleResolutionHelper(
$this->httpClient,
new UrlShortenerOptions(autoResolveTitles: $autoResolveTitles),
$this->logger,
fn () => $iconvEnabled,
);
}
}