
update how concurrent blocks are calculated
lrq619 committed Oct 9, 2024
1 parent 4f5027d commit a94e5ba
Showing 1 changed file with 1 addition and 1 deletion.
2 changes: 1 addition & 1 deletion vllm/engine/llm_engine.py
@@ -838,7 +838,7 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]:
     # current_cache_usage = self.get_latest_metrics().gpu_cache_usage
     metrics = self.get_latest_metrics()
     num_using_gpu_blocks = self.cache_config.num_gpu_blocks - self.scheduler.block_manager.get_num_free_gpu_blocks()
-    num_waiting_blocks = self.scheduler.get_waiting_num_tokens()
+    num_waiting_blocks = self.scheduler.get_waiting_num_tokens() // self.cache_config.block_size
     num_concurrent_blocks = num_using_gpu_blocks + num_waiting_blocks
     concurrent_cache_usage = num_concurrent_blocks / self.cache_config.num_gpu_blocks
     liquid_request = None
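A minimal standalone sketch of the corrected calculation, with illustrative values rather than the actual vLLM scheduler and cache-config objects. The point of the commit is a unit fix: the waiting-request size is reported in tokens, so it must be divided by block_size before being added to the count of in-use GPU blocks.

# Hypothetical, self-contained sketch of the concurrent-cache-usage formula
# from the diff above; names and numbers are illustrative, not vLLM internals.

def concurrent_cache_usage(num_gpu_blocks: int,
                           num_free_gpu_blocks: int,
                           num_waiting_tokens: int,
                           block_size: int) -> float:
    """Fraction of GPU KV-cache blocks needed by running plus waiting requests."""
    num_using_gpu_blocks = num_gpu_blocks - num_free_gpu_blocks
    # Convert waiting tokens to blocks so both terms use the same unit.
    num_waiting_blocks = num_waiting_tokens // block_size
    num_concurrent_blocks = num_using_gpu_blocks + num_waiting_blocks
    return num_concurrent_blocks / num_gpu_blocks

if __name__ == "__main__":
    # Example: 1000 GPU blocks, 600 free, 512 waiting tokens, 16 tokens per block.
    # (400 + 32) / 1000 = 0.432
    print(concurrent_cache_usage(1000, 600, 512, 16))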
