Files
23cm/vendor/behat/gherkin/src/Lexer.php
2026-01-25 18:18:09 +08:00

991 lines
28 KiB
PHP

<?php
/*
* This file is part of the Behat Gherkin Parser.
* (c) Konstantin Kudryashov <ever.zet@gmail.com>
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
namespace Behat\Gherkin;
use Behat\Gherkin\Dialect\DialectProviderInterface;
use Behat\Gherkin\Dialect\GherkinDialect;
use Behat\Gherkin\Dialect\KeywordsDialectProvider;
use Behat\Gherkin\Exception\LexerException;
use Behat\Gherkin\Exception\NoSuchLanguageException;
use Behat\Gherkin\Keywords\KeywordsInterface;
use LogicException;
use function assert;
/**
* Gherkin lexer.
*
* @author Konstantin Kudryashov <ever.zet@gmail.com>
*
* @final since 4.15.0
*
* @phpstan-type TStepKeyword 'Given'|'When'|'Then'|'And'|'But'
* @phpstan-type TTitleKeyword 'Feature'|'Background'|'Scenario'|'Outline'|'Examples'
* @phpstan-type TTokenType 'Text'|'Comment'|'EOS'|'Newline'|'PyStringOp'|'TableRow'|'Tag'|'Language'|'Step'|TTitleKeyword
* @phpstan-type TToken TStringValueToken|TNullValueToken|TTitleToken|TStepToken|TTagToken|TTableRowToken
* @phpstan-type TStringValueToken array{type: TTokenType, value: string, line: int, deferred: bool}
* @phpstan-type TNullValueToken array{type: TTokenType, value: null, line: int, deferred: bool}
* @phpstan-type TTitleToken array{type: TTitleKeyword, value: null|non-empty-string, line: int, deferred: bool, keyword: string, indent: int}
* @phpstan-type TStepToken array{type: 'Step', value: string, line: int, deferred: bool, keyword_type: string, text: string}
* @phpstan-type TTagToken array{type: 'Tag', value: null, line: int, deferred: bool, tags: list<string>}
* @phpstan-type TTableRowToken array{type: 'TableRow', value: null, line: int, deferred: bool, columns: list<string>}
* @phpstan-type TDocStringSeparator '"""'|'```'
*/
class Lexer
{
/**
* Splits a string around | char, only if it's not preceded by an odd number of \.
*
* The pattern matches any run of backslash pairs in front of the pipe and then uses \K to drop
* that run from the match, so only the pipe itself is treated as the separator.
*
* @see https://github.com/cucumber/gherkin/blob/679a87e21263699c15ea635159c6cda60f64af3b/php/src/StringGherkinLine.php#L14
*/
private const CELL_PATTERN = '/(?<!\\\\)(?:\\\\{2})*\K\\|/u';
/** Source of dialects; assigned once in the constructor. */
private readonly DialectProviderInterface $dialectProvider;
/** Dialect whose keywords are used for scanning; assigned by analyse() and setLanguage(). */
private GherkinDialect $currentDialect;
/** Selects between legacy and newer lexing behaviours; LEGACY unless setCompatibilityMode() is called. */
private GherkinCompatibilityMode $compatibilityMode = GherkinCompatibilityMode::LEGACY;
/**
* The input split on "\n" by analyse().
*
* @var list<string>
*/
private array $lines;
/** Number of entries in $lines. */
private int $linesCount;
/** The line currently being scanned (or what is left of it after consumeLineUntil()). */
private string $line;
/** Cached trim() of $line; null means "not computed yet" (see getTrimmedLine()). */
private ?string $trimmedLine = null;
/** Number of the current line; analyse() starts it at 1. */
private int $lineNumber;
/** Set to true by consumeLine() once it has moved past the last line. */
private bool $eos;
/**
* A cache of keyword types associated with each keyword.
*
* Null means the cache has to be rebuilt for the current dialect.
*
* @phpstan-var array<string, non-empty-list<TStepKeyword>>|null
*/
private ?array $stepKeywordTypesCache = null;
/**
* Tokens queued by deferToken(); getDeferredToken() hands them back oldest first.
*
* @phpstan-var list<TToken>
*/
private array $deferredObjects = [];
/** Number of tokens currently held in $deferredObjects. */
private int $deferredObjectsCount = 0;
/**
* Token read ahead by predictToken() that has not been consumed yet.
*
* @phpstan-var TToken|null
*/
private ?array $stashedToken = null;
/** True between an opening and the matching closing docstring delimiter. */
private bool $inPyString = false;
/** Offset of the most recent docstring delimiter; upper bound on leading whitespace stripped from content lines. */
private int $pyStringSwallow = 0;
/** Whether a `# language:` comment may still be recognised; cleared after a language tag or a Feature line. */
private bool $allowLanguageTag = true;
/** Whether a Feature title line may still be recognised; cleared once one was found. */
private bool $allowFeature = true;
/** Whether docstring delimiters and table rows are recognised; enabled after a step or an Examples line. */
private bool $allowMultilineArguments = false;
/** Whether an Examples title line is recognised; enabled after a Scenario or Outline line. */
private bool $allowExamples = false;
/** Whether step keywords are recognised; enabled after a Background, Scenario or Outline line. */
private bool $allowSteps = false;
/**
* Delimiter that opened the docstring currently being read; null outside of a docstring.
*
* @phpstan-var TDocStringSeparator|null
*/
private ?string $pyStringDelimiter = null;
/**
 * Creates a lexer that looks up its keywords through the given dialect provider.
 *
 * A KeywordsInterface instance is still accepted; it is wrapped in a
 * KeywordsDialectProvider so the rest of the class only deals with dialects.
 */
public function __construct(
    DialectProviderInterface|KeywordsInterface $dialectProvider,
) {
    // TODO trigger deprecation (when a KeywordsInterface is passed)
    $this->dialectProvider = $dialectProvider instanceof KeywordsInterface
        ? new KeywordsDialectProvider($dialectProvider)
        : $dialectProvider;
}
/**
 * Replaces the compatibility mode consulted by the scanners.
 *
 * The mode is only stored here; it is read each time a scanner needs to
 * decide between two behaviours.
 *
 * @internal
 */
public function setCompatibilityMode(GherkinCompatibilityMode $compatibilityMode): void
{
    $this->compatibilityMode = $compatibilityMode;
}
/**
 * Sets lexer input and resets every piece of scanning state.
 *
 * Line endings are normalised to "\n" before the input is split into lines.
 * The default dialect of the provider is always installed first; when a
 * language is passed explicitly it is then applied through setLanguage().
 * This way the lexer never keeps the dialect of a previously analysed input
 * (or no dialect at all) when an unknown language is ignored by the
 * compatibility mode.
 *
 * @param string $input Input string
 * @param string $language Language name (passing it explicitly is a legacy code path)
 *
 * @return void
 *
 * @throws LexerException when the input is not valid UTF-8
 */
public function analyse(string $input, string $language = 'en')
{
    // try to detect unsupported encoding
    if (mb_detect_encoding($input, 'UTF-8', true) !== 'UTF-8') {
        throw new LexerException('Feature file is not in UTF8 encoding');
    }

    $input = strtr($input, ["\r\n" => "\n", "\r" => "\n"]);

    // explode() always yields at least one element, so index 0 is safe even for empty input.
    $this->lines = explode("\n", $input);
    $this->linesCount = count($this->lines);
    $this->line = $this->lines[0];
    $this->lineNumber = 1;
    $this->trimmedLine = null;
    $this->eos = false;

    $this->deferredObjects = [];
    $this->deferredObjectsCount = 0;
    $this->stashedToken = null;

    $this->inPyString = false;
    $this->pyStringSwallow = 0;
    // Reset together with the rest of the docstring state so nothing leaks from a previous input.
    $this->pyStringDelimiter = null;

    $this->allowLanguageTag = true;
    $this->allowFeature = true;
    $this->allowMultilineArguments = false;
    $this->allowSteps = false;
    $this->allowExamples = false;

    // Start from the default dialect on every run. setLanguage() may silently keep the current
    // dialect when the requested language is unknown, so one must be in place beforehand.
    $this->currentDialect = $this->dialectProvider->getDefaultDialect();
    $this->stepKeywordTypesCache = null;

    if (\func_num_args() > 1) {
        // @codeCoverageIgnoreStart
        \assert($language !== '');
        // TODO trigger deprecation (the Parser does not use this code path)
        $this->setLanguage($language);
        // @codeCoverageIgnoreEnd
    }
}
/**
 * Switches the lexer to the dialect of the given language.
 *
 * Refuses to do so while a stashed or deferred token exists. An unknown
 * language is either rethrown or ignored (keeping the current dialect),
 * depending on the compatibility mode. The step keyword type cache is
 * cleared whenever the method completes without throwing.
 *
 * @param non-empty-string $language
 */
private function setLanguage(string $language): void
{
    $hasPendingTokens = $this->stashedToken !== null || $this->deferredObjects !== [];

    if ($hasPendingTokens) {
        // @codeCoverageIgnoreStart
        // It is not possible to trigger this condition using the public interface of this class.
        // It may be possible if the end-user has extended the Lexer with custom functionality.
        throw new LogicException(
            <<<'STRING'
            Cannot set gherkin language due to unexpected Lexer state.
            Please open an issue at https://github.com/Behat/Gherkin with a copy of the current
            feature file. If you are using a Lexer or Parser class that extends the ones provided
            in behat/gherkin, please also provide details of these.
            STRING,
        );
        // @codeCoverageIgnoreEnd
    }

    try {
        $this->currentDialect = $this->dialectProvider->getDialect($language);
    } catch (NoSuchLanguageException $unknownLanguage) {
        if ($this->compatibilityMode->shouldIgnoreInvalidLanguage()) {
            // Deliberately keep scanning with the dialect that is already active.
        } else {
            throw $unknownLanguage;
        }
    }

    $this->stepKeywordTypesCache = null;
}
/**
 * Reports the language of the dialect that is currently active.
 *
 * @return string
 */
public function getLanguage()
{
    $activeDialect = $this->currentDialect;

    return $activeDialect->getLanguage();
}
/**
 * Hands out the stashed token when there is one, otherwise reads the next token.
 *
 * Taking the stashed token also clears the stash.
 *
 * @return array
 *
 * @phpstan-return TToken
 */
public function getAdvancedToken()
{
    $stashed = $this->getStashedToken();

    if ($stashed !== null) {
        return $stashed;
    }

    return $this->getNextToken();
}
/**
 * Queues a token so that it is returned again before any further input is scanned.
 *
 * The queued copy is flagged as deferred; the caller's array is not modified.
 *
 * @phpstan-param TToken $token Token to defer
 *
 * @return void
 */
public function deferToken(array $token)
{
    $token['deferred'] = true;

    array_push($this->deferredObjects, $token);
    $this->deferredObjectsCount += 1;
}
/**
 * Peeks at the upcoming token without consuming it.
 *
 * The token is read once and kept in the stash, so repeated calls return
 * the same token until it is taken or skipped.
 *
 * @return array
 *
 * @phpstan-return TToken
 */
public function predictToken()
{
    if ($this->stashedToken === null) {
        $this->stashedToken = $this->getNextToken();
    }

    return $this->stashedToken;
}
/**
 * Discards the token held in the stash, if there is one.
 *
 * Calling this with an empty stash has no effect.
 *
 * @return void
 */
public function skipPredictedToken()
{
    $this->stashedToken = null;
}
/**
 * Constructs a token with specified parameters.
 *
 * The token is stamped with the current line number and is not deferred.
 * An empty string value is normalised to null; every other string is kept
 * as-is. In particular the string "0" is preserved: it is a non-empty string
 * and must not be dropped just because PHP treats it as falsy.
 *
 * @template T of TTokenType
 *
 * @param string|null $value Token value
 *
 * @phpstan-param T $type Token type
 *
 * @return array
 *
 * @phpstan-return ($value is non-empty-string ? array{type: T, value: non-empty-string, line: int, deferred: bool} : array{type: T, value: null, line: int, deferred: bool})
 */
public function takeToken(string $type, ?string $value = null)
{
    return [
        'type' => $type,
        'line' => $this->lineNumber,
        // Strict comparison instead of `?:` so that "0" survives as a value.
        'value' => $value === '' ? null : $value,
        'deferred' => false,
    ];
}
/**
 * Moves on to the next input line and bumps the line counter.
 *
 * When there is no further line the end-of-stream flag is raised instead and
 * the current line is left untouched.
 *
 * @return void
 */
protected function consumeLine()
{
    // Line numbers are 1-based, so the current number is the 0-based index of the next line.
    $nextIndex = $this->lineNumber;
    ++$this->lineNumber;

    if ($nextIndex === $this->linesCount) {
        $this->eos = true;

        return;
    }

    $this->line = $this->lines[$nextIndex];
    $this->trimmedLine = null;
}
/**
 * Drops the beginning of the current line while staying on the same line number.
 *
 * The offset is counted in characters from the start of the left-trimmed
 * line; whatever follows it becomes the new current line.
 *
 * @return void
 */
protected function consumeLineUntil(int $trimmedOffset)
{
    $withoutIndent = ltrim($this->line);

    $this->line = mb_substr($withoutIndent, $trimmedOffset, encoding: 'utf-8');
    $this->trimmedLine = null;
}
/**
 * Gives the current line with surrounding whitespace removed.
 *
 * The result is computed on first use and cached until the current line changes.
 *
 * @return string
 */
protected function getTrimmedLine()
{
    if ($this->trimmedLine === null) {
        $this->trimmedLine = trim($this->line);
    }

    return $this->trimmedLine;
}
/**
 * Takes the token out of the stash, leaving the stash empty.
 *
 * @return array|null The stashed token, or null when nothing was stashed
 *
 * @phpstan-return TToken|null
 */
protected function getStashedToken()
{
    $token = $this->stashedToken;

    if ($token !== null) {
        $this->stashedToken = null;
    }

    return $token;
}
/**
 * Takes the oldest deferred token off the queue.
 *
 * @return array|null The deferred token, or null when the queue is empty
 *
 * @phpstan-return TToken|null
 */
protected function getDeferredToken()
{
    $hasDeferredTokens = $this->deferredObjectsCount !== 0;

    if ($hasDeferredTokens) {
        $this->deferredObjectsCount -= 1;

        return array_shift($this->deferredObjects);
    }

    return null;
}
/**
 * Produces the next token.
 *
 * Deferred tokens win over fresh input. Otherwise the scanners are tried in
 * a fixed order and the first one that recognises the current line provides
 * the token; each later scanner only runs while no token has been found.
 * scanText() comes last and always yields a token.
 *
 * @return array
 *
 * @phpstan-return TToken
 */
protected function getNextToken()
{
    $token = $this->getDeferredToken();

    // `??=` only evaluates its right-hand side while $token is still null.
    $token ??= $this->scanEOS();
    $token ??= $this->scanLanguage();
    $token ??= $this->scanComment();
    $token ??= $this->scanPyStringOp();
    $token ??= $this->scanPyStringContent();
    $token ??= $this->scanStep();
    $token ??= $this->scanScenario();
    $token ??= $this->scanBackground();
    $token ??= $this->scanOutline();
    $token ??= $this->scanExamples();
    $token ??= $this->scanFeature();
    $token ??= $this->scanTags();
    $token ??= $this->scanTableRow();
    $token ??= $this->scanNewline();
    $token ??= $this->scanText();

    return $token;
}
/**
 * Matches the current line against a regex and turns the first capture group into a token.
 *
 * On a match the line is consumed; otherwise nothing changes.
 *
 * @param string $regex Regular expression (must define a first capture group)
 *
 * @phpstan-param TTokenType $type Expected token type
 *
 * @return array|null
 *
 * @phpstan-return TStringValueToken|null
 */
protected function scanInput(string $regex, string $type)
{
    $matches = [];

    if (preg_match($regex, $this->line, $matches) !== 1) {
        return null;
    }

    $captured = $matches[1];
    assert($captured !== '');

    $token = $this->takeToken($type, $captured);
    $this->consumeLine();

    return $token;
}
/**
 * Looks for a "<keyword>: <title>" line using a keyword alternation regex.
 *
 * On a match the line is consumed, the token receives the matched keyword and
 * the indentation width, and the scanning flags are adjusted for the token type.
 *
 * @param string $keywords Keywords (separated by "|")
 *
 * @phpstan-param TTitleKeyword $type Expected token type
 *
 * @return array|null
 *
 * @phpstan-return TTitleToken|null
 *
 * @deprecated
 */
protected function scanInputForKeywords(string $keywords, string $type)
{
    // @codeCoverageIgnoreStart
    $pattern = '/^(\s*)(' . $keywords . '):\s*(.*)/u';

    if (preg_match($pattern, $this->line, $matches) !== 1) {
        return null;
    }

    [, $indentation, $keyword, $title] = $matches;

    $token = $this->takeToken($type, $title);
    $token['keyword'] = $keyword;
    $token['indent'] = mb_strlen($indentation, 'utf8');

    $this->consumeLine();

    switch ($type) {
        case 'Feature':
            // turn off language searching, feature detection, PyString and Table searching
            $this->allowFeature = false;
            $this->allowLanguageTag = false;
            $this->allowMultilineArguments = false;
            break;

        case 'Scenario':
        case 'Outline':
            // turn off PyString and Table searching, turn on steps searching
            $this->allowMultilineArguments = false;
            $this->allowSteps = true;
            break;

        case 'Background':
            // turn on steps searching
            $this->allowSteps = true;
            break;

        case 'Examples':
            // an examples table may follow
            $this->allowMultilineArguments = true;
            break;
    }

    return $token;
    // @codeCoverageIgnoreEnd
}
/**
 * Scans the current line for "<keyword>:" followed by an optional title.
 *
 * Keywords are tried in the given order and the first one the trimmed line
 * starts with wins. On a match the line is consumed and the token carries the
 * matched keyword plus the width of the line's leading whitespace.
 *
 * All multibyte string calls pass the encoding explicitly, like the rest of
 * this class, so the title offset does not depend on mb_internal_encoding().
 *
 * @param list<string> $keywords
 *
 * @phpstan-param TTitleKeyword $type
 *
 * @phpstan-return TTitleToken|null
 */
private function scanTitleLine(array $keywords, string $type): ?array
{
    $trimmedLine = $this->getTrimmedLine();

    foreach ($keywords as $keyword) {
        if (!str_starts_with($trimmedLine, $keyword . ':')) {
            continue;
        }

        // +1 skips the colon that follows the keyword.
        $titleOffset = mb_strlen($keyword, 'utf8') + 1;
        $title = trim(mb_substr($trimmedLine, $titleOffset, null, 'utf8'));

        $token = $this->takeToken($type, $title);
        $token['keyword'] = $keyword;
        $token['indent'] = mb_strlen($this->line, 'utf8') - mb_strlen(ltrim($this->line), 'utf8');

        $this->consumeLine();

        return $token;
    }

    return null;
}
/**
 * Emits an EOS token once the input has been exhausted.
 *
 * Nothing is consumed, so every later call keeps returning an EOS token.
 *
 * @return array|null
 *
 * @phpstan-return TNullValueToken|null
 */
protected function scanEOS()
{
    return $this->eos ? $this->takeToken('EOS') : null;
}
/**
 * Builds a regex alternation of the current dialect's keywords for one keyword type.
 *
 * Each keyword is quoted for use inside a "/"-delimited pattern. For the
 * 'Step' type the alternation is wrapped in a non-capturing group followed
 * by optional whitespace.
 *
 * @phpstan-param 'Step'|TTitleKeyword|TStepKeyword $type Keyword type
 *
 * @return string
 *
 * @throws \InvalidArgumentException when the type is not one of the known keyword types
 *
 * @deprecated
 */
protected function getKeywords(string $type)
{
    // @codeCoverageIgnoreStart
    $dialect = $this->currentDialect;

    $keywords = match ($type) {
        'Feature' => $dialect->getFeatureKeywords(),
        'Background' => $dialect->getBackgroundKeywords(),
        'Scenario' => $dialect->getScenarioKeywords(),
        'Outline' => $dialect->getScenarioOutlineKeywords(),
        'Examples' => $dialect->getExamplesKeywords(),
        'Step' => $dialect->getStepKeywords(),
        'Given' => $dialect->getGivenKeywords(),
        'When' => $dialect->getWhenKeywords(),
        'Then' => $dialect->getThenKeywords(),
        'And' => $dialect->getAndKeywords(),
        'But' => $dialect->getButKeywords(),
        default => throw new \InvalidArgumentException(sprintf('Unknown keyword type "%s"', $type)),
    };

    $quotedKeywords = [];
    foreach ($keywords as $keyword) {
        $quotedKeywords[] = preg_quote($keyword, '/');
    }

    $alternation = implode('|', $quotedKeywords);

    return $type === 'Step'
        ? '(?:' . $alternation . ')\s*'
        : $alternation;
    // @codeCoverageIgnoreEnd
}
/**
 * Recognises the Feature title line.
 *
 * Only the first such line counts: afterwards feature detection, language
 * tag detection and multiline arguments are all switched off, and a later
 * "Feature:" line is left for the other scanners.
 *
 * @return array|null
 *
 * @phpstan-return TTitleToken|null
 */
protected function scanFeature()
{
    if ($this->allowFeature) {
        $token = $this->scanTitleLine($this->currentDialect->getFeatureKeywords(), 'Feature');

        if ($token !== null) {
            $this->allowFeature = false;
            $this->allowLanguageTag = false;
            $this->allowMultilineArguments = false;

            return $token;
        }
    }

    return null;
}
/**
 * Recognises a Background title line and enables step scanning after it.
 *
 * @return array|null
 *
 * @phpstan-return TTitleToken|null
 */
protected function scanBackground()
{
    $backgroundKeywords = $this->currentDialect->getBackgroundKeywords();
    $token = $this->scanTitleLine($backgroundKeywords, 'Background');

    if ($token !== null) {
        $this->allowSteps = true;
    }

    return $token;
}
/**
 * Recognises a Scenario title line.
 *
 * After a match, steps and Examples lines may follow, while multiline
 * arguments are disabled until the next step.
 *
 * @return array|null
 *
 * @phpstan-return TTitleToken|null
 */
protected function scanScenario()
{
    $scenarioKeywords = $this->currentDialect->getScenarioKeywords();
    $token = $this->scanTitleLine($scenarioKeywords, 'Scenario');

    if ($token !== null) {
        $this->allowMultilineArguments = false;
        $this->allowSteps = true;
        $this->allowExamples = true;
    }

    return $token;
}
/**
 * Recognises a Scenario Outline title line.
 *
 * After a match, steps and Examples lines may follow, while multiline
 * arguments are disabled until the next step.
 *
 * @return array|null
 *
 * @phpstan-return TTitleToken|null
 */
protected function scanOutline()
{
    $outlineKeywords = $this->currentDialect->getScenarioOutlineKeywords();
    $token = $this->scanTitleLine($outlineKeywords, 'Outline');

    if ($token !== null) {
        $this->allowMultilineArguments = false;
        $this->allowSteps = true;
        $this->allowExamples = true;
    }

    return $token;
}
/**
 * Recognises an Examples title line, but only where Examples are allowed.
 *
 * After a match multiline arguments are enabled so the examples table can be read.
 *
 * @return array|null
 *
 * @phpstan-return TTitleToken|null
 */
protected function scanExamples()
{
    if ($this->allowExamples) {
        $token = $this->scanTitleLine($this->currentDialect->getExamplesKeywords(), 'Examples');

        if ($token !== null) {
            $this->allowMultilineArguments = true;

            return $token;
        }
    }

    return null;
}
/**
 * Scans Step from input & returns it if found.
 *
 * Steps are only recognised where they are allowed. The first step keyword of
 * the current dialect that the trimmed line starts with is used. The token
 * value is the keyword (with surrounding whitespace removed when the
 * compatibility mode asks for it), and the token additionally carries the
 * keyword type and the step text. After a step, multiline arguments are enabled.
 *
 * All multibyte string calls pass the encoding explicitly, like the rest of
 * this class, so the text offset does not depend on mb_internal_encoding().
 *
 * @return array|null
 *
 * @phpstan-return TStepToken|null
 */
protected function scanStep()
{
    if (!$this->allowSteps) {
        return null;
    }

    $trimmedLine = $this->getTrimmedLine();
    $matchedKeyword = null;

    foreach ($this->currentDialect->getStepKeywords() as $keyword) {
        if (str_starts_with($trimmedLine, $keyword)) {
            $matchedKeyword = $keyword;
            break;
        }
    }

    if ($matchedKeyword === null) {
        return null;
    }

    $textOffset = mb_strlen($matchedKeyword, 'utf8');
    $text = ltrim(mb_substr($trimmedLine, $textOffset, null, 'utf8'));

    $nodeKeyword = $this->compatibilityMode->shouldRemoveStepKeywordSpace()
        ? trim($matchedKeyword)
        : $matchedKeyword;
    assert($nodeKeyword !== '');

    $token = $this->takeToken('Step', $nodeKeyword);
    // The type lookup uses the keyword exactly as the dialect spells it.
    $token['keyword_type'] = $this->getStepKeywordType($matchedKeyword);
    $token['text'] = $text;

    $this->consumeLine();
    $this->allowMultilineArguments = true;

    return $token;
}
/**
 * Scans PyString from input & returns it if found.
 *
 * Recognises a line that starts (after optional whitespace) with a docstring
 * delimiter. Outside a docstring the delimiter opens one; inside a docstring
 * only the same delimiter that opened it closes it, any other delimiter is
 * left to be read as content.
 *
 * The indentation of the delimiter is remembered as a number of characters,
 * because it is later used as a character count when stripping indentation
 * from content lines. preg_match() reports byte offsets, so the offset is
 * converted to avoid over-counting multi-byte whitespace.
 *
 * @return array|null
 *
 * @phpstan-return TNullValueToken|null
 */
protected function scanPyStringOp()
{
    if (!$this->allowMultilineArguments) {
        return null;
    }

    if (!preg_match('/^\s*(?<delimiter>"""|```)/u', $this->line, $matches, PREG_OFFSET_CAPTURE)) {
        return null;
    }

    ['delimiter' => [0 => $delimiter, 1 => $byteOffset]] = $matches;

    if ($this->inPyString) {
        if ($this->pyStringDelimiter !== $delimiter) {
            return null;
        }
        $this->pyStringDelimiter = null;
    } else {
        $this->pyStringDelimiter = $delimiter;
    }

    $this->inPyString = !$this->inPyString;
    $token = $this->takeToken('PyStringOp');

    // Everything before the delimiter is indentation; count it in characters, not bytes.
    $this->pyStringSwallow = mb_strlen(substr($this->line, 0, $byteOffset), 'utf8');

    $this->consumeLine();

    return $token;
}
/**
 * Reads one content line of the docstring that is currently open.
 *
 * The whole line is taken as text, then leading whitespace is removed up to
 * the indentation recorded for the docstring delimiter. Depending on the
 * compatibility mode, backslash-escaped delimiters are turned back into the
 * plain delimiter.
 *
 * @return array|null
 *
 * @phpstan-return TStringValueToken|null
 */
protected function scanPyStringContent()
{
    if (!$this->inPyString) {
        return null;
    }

    $token = $this->scanText();
    $rawContent = $token['value'] ?? '';

    // strip leading indentation, but never more than the delimiter had
    $indentPattern = '/^\s{0,' . $this->pyStringSwallow . '}/u';
    $content = (string) preg_replace($indentPattern, '', $rawContent);

    if ($this->compatibilityMode->shouldUnespaceDocStringDelimiters()) {
        $delimiter = $this->pyStringDelimiter;
        \assert($delimiter !== null);

        $escapedDelimiter = match ($delimiter) {
            '"""' => '\\"\\"\\"',
            '```' => '\\`\\`\\`',
        };

        $content = str_replace($escapedDelimiter, $delimiter, $content);
    }

    $token['value'] = $content;

    return $token;
}
/**
 * Recognises a table row and splits it into cells.
 *
 * A row is any line whose trimmed form starts with a pipe, provided
 * multiline arguments are currently allowed. Content before the first and
 * after the last unescaped pipe is discarded. Cells are unescaped either with
 * the newer cell parser or with the legacy replacement, depending on the
 * compatibility mode.
 *
 * @return array|null
 *
 * @phpstan-return TTableRowToken|null
 */
protected function scanTableRow()
{
    if (!$this->allowMultilineArguments) {
        return null;
    }

    $row = $this->getTrimmedLine();

    if (!str_starts_with($row, '|')) {
        // Strictly speaking, a table row only has to begin with a pipe - content to the right
        // of the final pipe will be ignored after we split the cells.
        return null;
    }

    $pieces = preg_split(self::CELL_PATTERN, $row);
    assert($pieces !== false);

    // Keep only what lies between the first and the last separator.
    $rawColumns = array_slice($pieces, 1, -1);

    $token = $this->takeToken('TableRow');

    $cellParser = $this->compatibilityMode->shouldUseNewTableCellParsing()
        ? $this->parseTableCell(...)
        : static fn (string $column): string => trim(str_replace(['\\|', '\\\\'], ['|', '\\'], $column));

    $token['columns'] = array_map($cellParser, $rawColumns);

    $this->consumeLine();

    return $token;
}
/**
 * Turns the raw text of one table cell into its value.
 *
 * Whitespace listed in the trimming pattern is removed from both ends, then
 * the escape sequences backslash-n, backslash-backslash and backslash-pipe
 * are replaced by a newline, a backslash and a pipe. Any other
 * backslash sequence is kept unchanged.
 */
private function parseTableCell(string $cell): string
{
    $trimmedCell = preg_replace('/^[ \\t\\n\\x0B\\f\\r\\x85\\xA0]++|[ \\t\\n\\x0B\\f\\r\\x85\\xA0]++$/u', '', $cell);
    \assert($trimmedCell !== null);

    $knownEscapes = [
        '\\n' => "\n",
        '\\\\' => '\\',
        '\\|' => '|',
    ];

    $unescaped = preg_replace_callback(
        '/\\\\./',
        static fn (array $matches): string => $knownEscapes[$matches[0]] ?? $matches[0],
        $trimmedCell,
    );
    assert($unescaped !== null);

    return $unescaped;
}
/**
 * Scans Tags from input & returns it if found.
 *
 * A tag line is a line whose trimmed form starts with "@". If the line also
 * contains whitespace followed by "#", the part from there on is left in
 * place as the current line so it can be scanned as a comment; because the
 * leading `.*` is greedy the split happens at the last such position.
 * Otherwise the whole line is consumed.
 *
 * The token is created before the line is consumed, so it always reports the
 * line the tags are written on.
 *
 * In legacy mode the "@" prefixes are removed from the tag names; otherwise
 * each tag keeps its prefix.
 *
 * @return array|null
 *
 * @phpstan-return TTagToken|null
 */
protected function scanTags()
{
    $line = $this->getTrimmedLine();

    if ($line === '' || !str_starts_with($line, '@')) {
        return null;
    }

    // Must happen before consuming: takeToken() stamps the token with the current line number.
    $token = $this->takeToken('Tag');

    if (preg_match('/^(?<line>.*)\s+#.*$/', $line, $matches)) {
        ['line' => $line] = $matches;
        $this->consumeLineUntil(mb_strlen($line, 'utf-8'));
    } else {
        $this->consumeLine();
    }

    if ($this->compatibilityMode->shouldRemoveTagPrefixChar()) {
        // Legacy behaviour
        $tags = explode('@', mb_substr($line, 1, mb_strlen($line, 'utf8') - 1, 'utf8'));
        $tags = array_map(trim(...), $tags);
        $token['tags'] = $tags;

        return $token;
    }

    $tags = preg_split('/(?=@)/u', $line);
    assert($tags !== false);

    // Remove the empty content before the first tag prefix
    array_shift($tags);

    // Note: checking for whitespace in tags is done in the Parser to fit with existing logic
    $token['tags'] = array_map(trim(...), $tags);

    return $token;
}
/**
 * Recognises a "# language: xx" comment and switches the dialect accordingly.
 *
 * The comment is only honoured while language tags are still allowed and
 * outside of docstrings. The compatibility mode decides whether whitespace
 * is accepted between "language" and the colon. After a match no further
 * language tag is accepted.
 *
 * @return array|null
 *
 * @phpstan-return TStringValueToken|null
 */
protected function scanLanguage()
{
    if (!$this->allowLanguageTag || $this->inPyString) {
        return null;
    }

    // Cheap pre-check so the regex only runs on comment lines.
    if (!str_starts_with(ltrim($this->line), '#')) {
        return null;
    }

    if ($this->compatibilityMode->allowWhitespaceInLanguageTag()) {
        $pattern = '/^\s*#\s*language\s*:\s*([\w_\-]+)\s*$/u';
    } else {
        $pattern = '/^\s*#\s*language:\s*([\w_\-]+)\s*$/';
    }

    $token = $this->scanInput($pattern, 'Language');

    if ($token === null) {
        return null;
    }

    $language = $token['value'];
    \assert(\is_string($language));
    \assert($language !== ''); // the regex can only match a non-empty value.

    $this->allowLanguageTag = false;
    $this->setLanguage($language);

    return $token;
}
/**
 * Recognises a comment line, i.e. a line whose trimmed form starts with "#".
 *
 * Lines inside a docstring are never treated as comments. The token value is
 * the trimmed line, including the "#".
 *
 * @return array|null
 *
 * @phpstan-return TStringValueToken|null
 */
protected function scanComment()
{
    if ($this->inPyString || !str_starts_with($this->getTrimmedLine(), '#')) {
        return null;
    }

    $comment = $this->takeToken('Comment', $this->getTrimmedLine());
    $this->consumeLine();

    return $comment;
}
/**
 * Recognises a line that is empty once trimmed and emits a Newline token for it.
 *
 * @return array|null
 *
 * @phpstan-return TNullValueToken|null
 */
protected function scanNewline()
{
    $isBlank = $this->getTrimmedLine() === '';

    if (!$isBlank) {
        return null;
    }

    $newline = $this->takeToken('Newline');
    $this->consumeLine();

    return $newline;
}
/**
 * Takes the current line, untrimmed, as a Text token and consumes it.
 *
 * This scanner accepts any line, which makes it the fallback of the scanner chain.
 *
 * @return array
 *
 * @phpstan-return TStringValueToken|TNullValueToken
 */
protected function scanText()
{
    $text = $this->takeToken('Text', $this->line);

    $this->consumeLine();

    return $text;
}
/**
 * Maps a step keyword of the current dialect to its type (Given, When, Then, And, But).
 *
 * The keyword-to-types map is built on first use and kept until the cache is
 * cleared. A keyword that belongs to exactly one type gets that type, an
 * ambiguous keyword is reported as 'And', and an unknown keyword as 'Given'.
 *
 * @param string $native Step keyword in provided language
 *
 * @phpstan-return TStepKeyword
 */
private function getStepKeywordType(string $native): string
{
    if ($this->stepKeywordTypesCache === null) {
        $this->stepKeywordTypesCache = [];

        $keywordsByType = [
            'Given' => $this->currentDialect->getGivenKeywords(),
            'When' => $this->currentDialect->getWhenKeywords(),
            'Then' => $this->currentDialect->getThenKeywords(),
            'And' => $this->currentDialect->getAndKeywords(),
            'But' => $this->currentDialect->getButKeywords(),
        ];

        foreach ($keywordsByType as $type => $keywords) {
            $this->addStepKeywordTypes($keywords, $type);
        }
    }

    $types = $this->stepKeywordTypesCache[$native] ?? null;

    if ($types === null) { // should not happen when the native keyword belongs to the dialect
        return 'Given'; // cucumber/gherkin has an UNKNOWN type, but we don't have it.
    }

    // Consider ambiguous keywords as AND keywords so that they are normalized to the previous step type.
    // This happens in English for the `* ` keyword for instance.
    // cucumber/gherkin returns that as an UNKNOWN type, but we don't have it.
    return \count($types) === 1 ? $types[0] : 'And';
}
/**
 * Records, for every given keyword, that it can stand for the given step type.
 *
 * A keyword registered for several types ends up with several entries.
 *
 * @param list<string> $keywords
 *
 * @phpstan-param TStepKeyword $type
 */
private function addStepKeywordTypes(array $keywords, string $type): void
{
    foreach ($keywords as $keyword) {
        $knownTypes = $this->stepKeywordTypesCache[$keyword] ?? [];
        $knownTypes[] = $type;

        $this->stepKeywordTypesCache[$keyword] = $knownTypes;
    }
}
}