117 lines
		
	
	
		
			4.3 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
		
		
			
		
	
	
			117 lines
		
	
	
		
			4.3 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
| 
								 | 
							
								<?php declare(strict_types=1);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								namespace PhpParser;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								require __DIR__ . '/compatibility_tokens.php';
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								class Lexer {
								    /**
								     * Tokenize the provided source code.
								     *
								     * The token array is in the same format as provided by the PhpToken::tokenize() method in
								     * PHP 8.0. The tokens are instances of PhpParser\Token, to abstract over a polyfill
								     * implementation in earlier PHP versions.
								     *
								     * The token array is terminated by a sentinel token with token ID 0.
								     * The token array does not discard any tokens (i.e. whitespace and comments are included).
								     * The token position attributes are against this token array.
								     *
								     * The xdebug.scream ini setting is switched off while lexing and is always put back to its
								     * previous value before this method returns or an exception leaves it.
								     *
								     * @param string $code The source code to tokenize.
								     * @param ErrorHandler|null $errorHandler Error handler to use for lexing errors. Defaults to
								     *                                        ErrorHandler\Throwing.
								     * @return Token[] Tokens
								     */
								    public function tokenize(string $code, ?ErrorHandler $errorHandler = null): array {
								        if (null === $errorHandler) {
								            $errorHandler = new ErrorHandler\Throwing();
								        }

								        // ini_set() returns the previous value on success and false on failure (for example
								        // when the setting does not exist), so $scream doubles as a "needs restoring" flag.
								        $scream = ini_set('xdebug.scream', '0');

								        try {
								            // Diagnostics raised by the tokenizer are suppressed here; lexing problems are
								            // reported through $errorHandler by postprocessTokens() instead.
								            $tokens = @Token::tokenize($code);
								            $this->postprocessTokens($tokens, $errorHandler);
								        } finally {
								            // Restore in a finally block: the error handler may throw, and the modified ini
								            // setting must not leak into the rest of the process in that case.
								            if (false !== $scream) {
								                ini_set('xdebug.scream', $scream);
								            }
								        }

								        return $tokens;
								    }

								    /**
								     * Report a T_BAD_CHARACTER token to the error handler.
								     *
								     * The reported error spans exactly the offending token: start and end line are both the
								     * token's line, and start and end file position are both the token's position.
								     *
								     * @param Token $token The token holding the invalid character in its text.
								     * @param ErrorHandler $errorHandler Handler that receives the resulting Error.
								     */
								    private function handleInvalidCharacter(Token $token, ErrorHandler $errorHandler): void {
								        $chr = $token->text;
								        if ($chr === "\0") {
								            // PHP cuts error message after null byte, so need special case
								            $errorMsg = 'Unexpected null byte';
								        } else {
								            $errorMsg = sprintf(
								                'Unexpected character "%s" (ASCII %d)', $chr, ord($chr)
								            );
								        }

								        $errorHandler->handleError(new Error($errorMsg, [
								            'startLine' => $token->line,
								            'endLine' => $token->line,
								            'startFilePos' => $token->pos,
								            'endFilePos' => $token->pos,
								        ]));
								    }

								    /**
								     * Check whether a token is a block comment that is missing its closing delimiter.
								     *
								     * Only T_COMMENT / T_DOC_COMMENT tokens whose text starts with "/*" are considered; other
								     * comment styles never count as unterminated here.
								     *
								     * @param Token $token Token to inspect.
								     * @return bool True if the token is a block comment without a closing delimiter.
								     */
								    private function isUnterminatedComment(Token $token): bool {
								        if (!$token->is([\T_COMMENT, \T_DOC_COMMENT])
								            || substr($token->text, 0, 2) !== '/*'
								        ) {
								            return false;
								        }

								        // The shortest terminated block comment is four bytes long (opener plus closer). For
								        // the three-byte text "/*/" the last two bytes look like a closer, but the "*" already
								        // belongs to the opener, so anything shorter than four bytes is unterminated.
								        return \strlen($token->text) < 4
								            || substr($token->text, -2) !== '*/';
								    }

								    /**
								     * Report lexing errors and canonicalize the token array in place.
								     *
								     * @param list<Token> $tokens Token array to process; modified by reference (ampersand token
								     *                            IDs are rewritten and a sentinel token is appended).
								     * @param ErrorHandler $errorHandler Handler that receives errors for bad characters and for
								     *                                   an unterminated trailing comment.
								     */
								    protected function postprocessTokens(array &$tokens, ErrorHandler $errorHandler): void {
								        // This function reports errors (bad characters and unterminated comments) in the token
								        // array, and performs certain canonicalizations:
								        //  * Use PHP 8.1 T_AMPERSAND_NOT_FOLLOWED_BY_VAR_OR_VARARG and
								        //    T_AMPERSAND_FOLLOWED_BY_VAR_OR_VARARG tokens used to disambiguate intersection types.
								        //  * Add a sentinel token with ID 0.

								        $numTokens = \count($tokens);
								        if ($numTokens === 0) {
								            // Empty input edge case: Just add the sentinel token.
								            $tokens[] = new Token(0, "\0", 1, 0);
								            return;
								        }

								        for ($i = 0; $i < $numTokens; $i++) {
								            $token = $tokens[$i];
								            if ($token->id === \T_BAD_CHARACTER) {
								                $this->handleInvalidCharacter($token, $errorHandler);
								            }

								            if ($token->id === \ord('&')) {
								                // Look past any whitespace tokens to find what the ampersand is applied to.
								                $next = $i + 1;
								                while (isset($tokens[$next]) && $tokens[$next]->id === \T_WHITESPACE) {
								                    $next++;
								                }
								                $followedByVarOrVarArg = isset($tokens[$next]) &&
								                    $tokens[$next]->is([\T_VARIABLE, \T_ELLIPSIS]);
								                $token->id = $followedByVarOrVarArg
								                    ? \T_AMPERSAND_FOLLOWED_BY_VAR_OR_VARARG
								                    : \T_AMPERSAND_NOT_FOLLOWED_BY_VAR_OR_VARARG;
								            }
								        }

								        // Check for unterminated comment. Only the last token is inspected.
								        $lastToken = $tokens[$numTokens - 1];
								        if ($this->isUnterminatedComment($lastToken)) {
								            $errorHandler->handleError(new Error('Unterminated comment', [
								                'startLine' => $lastToken->line,
								                'endLine' => $lastToken->getEndLine(),
								                'startFilePos' => $lastToken->pos,
								                'endFilePos' => $lastToken->getEndPos(),
								            ]));
								        }

								        // Add sentinel token, positioned at the end of the last real token.
								        $tokens[] = new Token(0, "\0", $lastToken->getEndLine(), $lastToken->getEndPos());
								    }
								}
							 |