autoprogrammer
/

dream_rcr

@@ -1,18 +1,5 @@
 # coding=utf-8
-# Copyright 2024 The Dream team, HKUNLP Group and the HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# You may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 import warnings
 import copy
 from dataclasses import dataclass
@@ -34,10 +21,8 @@ def top_p_logits(logits, top_p=None):
     sorted_logits, sorted_indices = torch.sort(logits, descending=True)
     cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
     sorted_indices_to_remove = cumulative_probs > top_p
-    # Shift the indices to the right to keep the first token above the threshold
     sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
     sorted_indices_to_remove[..., 0] = 0
     mask = torch.zeros_like(logits, dtype=torch.bool, device=logits.device)
     mask = mask.scatter_(-1, sorted_indices, sorted_indices_to_remove)
     logits = logits.masked_fill(mask, torch.finfo(logits.dtype).min)
@@ -47,10 +32,9 @@ def top_p_logits(logits, top_p=None):
 def top_k_logits(logits, top_k=None):
     if top_k is None:
         return logits
-    top_k = int(min(top_k, logits.size(-1)))  # Safety check
     if top_k <= 0:
         return logits
-    # Remove all tokens with a probability less than the last token of the top-k
     indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
     logits = logits.masked_fill(indices_to_remove, torch.finfo(logits.dtype).min)
     return logits
@@ -85,7 +69,7 @@ def sample_tokens(logits, temperature=0.0, top_p=None, top_k=None, margin_confid
         confidence = top1_probs - top2_probs
     if neg_entropy:
-        # 负熵（数值 ≤ 0；越接近 0 越大代表越确定）
         epsilon = 1e-10
         log_probs = torch.log(probs + epsilon)
         confidence = torch.sum(probs * log_probs, dim=-1)
@@ -116,12 +100,11 @@ class DreamGenerationConfig(GenerationConfig):
         self.alg: str = kwargs.pop("alg", 'origin')  # 'origin' | 'maskgit_plus' | 'topk_margin' | 'entropy'
         self.alg_temp: Optional[float] = kwargs.pop("alg_temp", None)
-        # === RCR 参数（新增，默认关闭，不影响原逻辑） ===
         self.rcr: bool = kwargs.pop("rcr", False)
-        # 仅在 rcr=True 时用于选择置信度算法；rcr=False 不读取它
         self.conf_alg: str = kwargs.pop("conf_alg", 'maskgit_plus')
-        # generate outputs
         self.num_return_sequences: int = kwargs.pop("num_return_sequences", 1)
         self.return_dict_in_generate: bool = kwargs.pop("return_dict_in_generate", False)
         self.output_history: bool = kwargs.pop("output_history", False)
@@ -169,85 +152,91 @@ class DreamGenerationMixin:
             attention_mask = attention_mask.repeat_interleave(expand_size, dim=0)
         return input_ids, attention_mask
-    # === 新版：RCR 核心（历史置信度） ===
     def _apply_rcr_logic(
         self,
         x: torch.Tensor,
         x0: torch.Tensor,
-        conf_now: torch.Tensor,
-        mask_index: torch.Tensor,
-        fixed_conf: torch.Tensor,
-        gen_mask: torch.Tensor,
-        init_mask_count: torch.Tensor,
         mask_token_id: int,
         step: int,
         total_steps: int,
         s: torch.Tensor,
         t: torch.Tensor,
     ):
         """
-        Running Confidence Remasking（历史置信度版）：
-          1) 在 mask 子集内以当步置信度 conf_now 选择 top-k_j 个位置“确认”（写 token）；
-          2) 更新历史置信度 fixed_conf = max(fixed_conf, conf_now)（仅对新选入位置）；
-          3) 按“累计允许确认配额” target_cum = init_mask_count * (1 - s/t) 若超额，
-             在已确认集合 gen_mask 内按 fixed_conf 最低回遮 over 个位置。
-        说明：
-          - conf_now 用 float32 维护，避免与 bfloat16 混写导致 dtype 报错；
-          - 对 entropy：conf_now = 负熵（≤0 且越接近 0 越大代表越确定），配合 topk(largest=True) 没问题。
         """
         device = x.device
         B, L = x.shape
-        # 计算“当步”选入规模（与 vanilla 同口径：平均剩余 mask * (1 - s/t)）
         avg_mask_now = (mask_index.sum().item() / max(1, mask_index.shape[0]))
         ratio = (1.0 - (s.item() / t.item())) if step < total_steps - 1 else 1.0
         number_transfer_tokens = int(avg_mask_now * ratio)
-        # 确保当步置信度是 float32
-        conf_now = conf_now.to(torch.float32)
-        # 仅在 mask 处有效的“全长”视图
-        full_conf_now = torch.full((B, L), float("-inf"), dtype=torch.float32, device=device)
         full_x0 = torch.full((B, L), mask_token_id, dtype=torch.long, device=device)
         full_conf_now[mask_index] = conf_now
         full_x0[mask_index] = x0
-        # 逐样本处理
         for j in range(B):
             masked_j = int(mask_index[j].sum().item())
             k_j = min(number_transfer_tokens, masked_j)
             if k_j > 0:
                 conf_row = full_conf_now[j]  # float32
-                # 选当步 top-k_j
                 _, sel_idx = torch.topk(conf_row, k=k_j, largest=True)
-                # 写 token & 标记确认
                 x[j, sel_idx] = full_x0[j, sel_idx]
                 gen_mask[j, sel_idx] = True
-                # 历史置信度取 running max
                 fixed_conf[j, sel_idx] = torch.maximum(fixed_conf[j, sel_idx], conf_row[sel_idx])
-            # 累计允许确认配额（以初始 mask 为基数）
             init_m = int(init_mask_count[j].item())
-            if step < total_steps - 1:
-                target_cum = int(init_m * (1.0 - (s.item() / t.item())))
-            else:
-                target_cum = init_m  # 最后一步允许全确认
             current_gen = int(gen_mask[j].sum().item())
             over = max(0, current_gen - target_cum)
             if over > 0:
-                # 在已确认集合里按历史置信度最低回遮
                 gen_idx = torch.where(gen_mask[j])[0]
                 if gen_idx.numel() > 0:
-                    hist_vals = fixed_conf[j, gen_idx]  # float32
-                    over = min(over, int(gen_idx.numel()))
-                    _, low_local = torch.topk(hist_vals, k=over, largest=False)
-                    low_global = gen_idx[low_local]
-                    # 回遮：恢复为 MASK，并撤销确认标记 & 清空历史置信度
-                    x[j, low_global] = mask_token_id
-                    gen_mask[j, low_global] = False
-                    fixed_conf[j, low_global] = float("-inf")
     def _validate_generated_length(self, generation_config, input_ids_length, has_default_max_length):
         if is_torchdynamo_compiling():
@@ -363,10 +352,7 @@ class DreamGenerationMixin:
             warnings.warn(
                 "You are calling .generate() with the `input_ids` being on a device type different"
                 f" than your model's device. `input_ids` is on {input_ids.device.type}, whereas the model"
-                f" is on {self.device.type}. You may experience unexpected behaviors or slower generation."
-                " Please make sure that you have put `input_ids` to the"
-                f" correct device by calling for example input_ids = input_ids.to('{self.device.type}') before"
-                " running `.generate()`.",
                 UserWarning,
             )
         if (
@@ -403,7 +389,6 @@ class DreamGenerationMixin:
         generation_tokens_hook_func,
         generation_logits_hook_func
     ) -> Union[DreamModelOutput, torch.LongTensor]:
-        # === 基本变量 ===
         output_history = generation_config.output_history
         return_dict_in_generate = generation_config.return_dict_in_generate
         max_length = generation_config.max_length
@@ -416,7 +401,7 @@ class DreamGenerationMixin:
         top_p = generation_config.top_p
         top_k = generation_config.top_k
-        # === RCR 控制变量 ===
         rcr = generation_config.rcr
         conf_alg = generation_config.conf_alg
@@ -439,27 +424,26 @@ class DreamGenerationMixin:
         timesteps = torch.linspace(1, eps, steps + 1, device=x.device)
-        # === RCR 缓冲（仅 rcr=True 时启用） ===
         if rcr:
-            init_mask_count = (x == mask_token_id).sum(dim=1)  # [B]
-            fixed_conf = torch.full(
-                x.shape, float("-inf"), dtype=torch.float32, device=x.device
-            )  # 历史置信度
-            gen_mask = torch.zeros_like(x, dtype=torch.bool)    # 已确认集合
         else:
             init_mask_count = None
             fixed_conf = None
             gen_mask = None
-        # hooks：允许用户中间控制
         x = generation_tokens_hook_func(None, x, None)
         for i in range(steps):
             mask_index = (x == mask_token_id)
             logits = self(x, attention_mask, tok_idx).logits
-            # 右移一位（Dream 原实现）
             logits = torch.cat([logits[:, :1], logits[:, :-1]], dim=1)
             logits = generation_logits_hook_func(i, x, logits)
             mask_logits = logits[mask_index]
@@ -467,7 +451,6 @@ class DreamGenerationMixin:
             s = timesteps[i + 1]
             if alg == 'origin':
-                # === 原版 origin：随机按比例转移（不涉及置信度） ===
                 p_transfer = 1 - s / t if i < steps - 1 else 1
                 x0 = torch.zeros_like(x[mask_index], device=self.device, dtype=torch.long) + mask_token_id
                 transfer_index_t_s = torch.rand(*x0.shape, device=self.device) < p_transfer
@@ -476,7 +459,6 @@ class DreamGenerationMixin:
                 )
                 x[mask_index] = x0.clone()
             else:
-                # === 置信度算法选择（vanilla 与 RCR 复用此处） ===
                 use_alg = conf_alg if rcr else alg
                 if use_alg == 'maskgit_plus':
                     confidence, x0 = sample_tokens(mask_logits, temperature=temperature, top_p=top_p, top_k=top_k)
@@ -492,23 +474,25 @@ class DreamGenerationMixin:
                     raise RuntimeError(f"Unknown alg/conf_alg: {use_alg}")
                 if rcr:
-                    # === 历史置信度版 RCR ===
                     self._apply_rcr_logic(
                         x=x,
                         x0=x0,
-                        conf_now=confidence,
                         mask_index=mask_index,
                         fixed_conf=fixed_conf,
                         gen_mask=gen_mask,
                         init_mask_count=init_mask_count,
                         mask_token_id=mask_token_id,
                         step=i,
                         total_steps=steps,
                         s=s, t=t,
                     )
                 else:
-                    # === 原版 Dream（vanilla）：本步 top-k，永久确认，不回遮 ===
-                    # number_transfer_tokens 基于“当前平均剩余 mask * (1 - s/t)”
                     avg_mask_now = (mask_index.sum().item() / max(1, mask_index.shape[0]))
                     ratio = (1.0 - (s.item() / t.item())) if i < steps - 1 else 1.0
                     number_transfer_tokens = int(avg_mask_now * ratio)
@@ -529,14 +513,10 @@ class DreamGenerationMixin:
                         x[row_indices, transfer_index] = x_[row_indices, transfer_index]
             x = generation_tokens_hook_func(i, x, logits)
             if histories is not None:
                 histories.append(x.clone())
         if return_dict_in_generate:
-            return DreamModelOutput(
-                sequences=x,
-                history=histories,
-            )
         else:
             return x

 # coding=utf-8
+# Copyright 2024 The Dream team, HKUNLP Group and...
 import warnings
 import copy
 from dataclasses import dataclass
     sorted_logits, sorted_indices = torch.sort(logits, descending=True)
     cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
     sorted_indices_to_remove = cumulative_probs > top_p
     sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
     sorted_indices_to_remove[..., 0] = 0
     mask = torch.zeros_like(logits, dtype=torch.bool, device=logits.device)
     mask = mask.scatter_(-1, sorted_indices, sorted_indices_to_remove)
     logits = logits.masked_fill(mask, torch.finfo(logits.dtype).min)
 def top_k_logits(logits, top_k=None):
     """Keep only the ``top_k`` largest logits along the last axis.

     Every entry strictly smaller than the k-th largest value of its row is
     replaced by the most negative finite value of ``logits.dtype``. Entries
     that tie with the k-th largest value are kept.

     Args:
         logits: Floating-point tensor; filtering is applied over the last dimension.
         top_k: Number of entries to keep. ``None`` or a value <= 0 disables
             filtering and returns ``logits`` unchanged.

     Returns:
         ``logits`` itself when filtering is disabled, otherwise a new tensor
         of the same shape with the filtered entries masked out.
     """
     if top_k is None:
         return logits
     # Clamp to the size of the last axis so torch.topk is never asked for
     # more entries than exist.
     top_k = int(min(top_k, logits.size(-1)))
     if top_k <= 0:
         return logits
     kth_largest = torch.topk(logits, top_k)[0][..., -1, None]
     return logits.masked_fill(logits < kth_largest, torch.finfo(logits.dtype).min)
         confidence = top1_probs - top2_probs
     if neg_entropy:
+        # 负熵（≤0；越接近 0 越“确定”）
         epsilon = 1e-10
         log_probs = torch.log(probs + epsilon)
         confidence = torch.sum(probs * log_probs, dim=-1)
         self.alg: str = kwargs.pop("alg", 'origin')  # 'origin' | 'maskgit_plus' | 'topk_margin' | 'entropy'
         self.alg_temp: Optional[float] = kwargs.pop("alg_temp", None)
+        # RCR
         self.rcr: bool = kwargs.pop("rcr", False)
         self.conf_alg: str = kwargs.pop("conf_alg", 'maskgit_plus')
+        # outputs
         self.num_return_sequences: int = kwargs.pop("num_return_sequences", 1)
         self.return_dict_in_generate: bool = kwargs.pop("return_dict_in_generate", False)
         self.output_history: bool = kwargs.pop("output_history", False)
             attention_mask = attention_mask.repeat_interleave(expand_size, dim=0)
         return input_ids, attention_mask
+    # =========================
+    # 历史置信度 RCR（贴近 vanilla）
+    # =========================
     def _apply_rcr_logic(
         self,
         x: torch.Tensor,
         x0: torch.Tensor,
+        conf_now: torch.Tensor,          # [M] 仅 mask 位置的置信度（已为 float32）
+        mask_index: torch.Tensor,        # [B, L] bool
+        fixed_conf: torch.Tensor,        # [B, L] float32（历史 max）
+        ema_conf: torch.Tensor,          # [B, L] float32（EMA）
+        gen_mask: torch.Tensor,          # [B, L] bool（已确认集合）
+        written_step: torch.Tensor,      # [B, L] int32（写入的步骤，-1=未写）
+        init_mask_count: torch.Tensor,   # [B] 初始 mask 数
         mask_token_id: int,
         step: int,
         total_steps: int,
         s: torch.Tensor,
         t: torch.Tensor,
+        ema_beta: float = 0.8            # EMA 平滑系数（越大越稳定）
     ):
         """
+        策略要点（接近 vanilla）：
+          1) 当步确认：沿用 vanilla 配额计算，按 conf_now（负熵/概率差等）选 top-k 写入；
+          2) 历史维护：fixed_conf 取历史 max；ema_conf 做滑动平均，写入步 recorded；
+          3) 超额回遮：若当前已确认数 > 目标累计配额，仅在 gen_mask 内、且不是“本步刚写”的位置，
+             选 EMA 最低的 over 个回遮（轻量、稳定）。
         """
         device = x.device
         B, L = x.shape
+        # 1) 配额（与 vanilla 一致）
         avg_mask_now = (mask_index.sum().item() / max(1, mask_index.shape[0]))
         ratio = (1.0 - (s.item() / t.item())) if step < total_steps - 1 else 1.0
         number_transfer_tokens = int(avg_mask_now * ratio)
+        # 把当步局部置信度/候选整到全长
+        full_conf_now = torch.full((B, L), -1e9, dtype=torch.float32, device=device)  # 用 -1e9 更稳妥
         full_x0 = torch.full((B, L), mask_token_id, dtype=torch.long, device=device)
         full_conf_now[mask_index] = conf_now
         full_x0[mask_index] = x0
+        # 2) 逐样本选择当步 top-k
         for j in range(B):
             masked_j = int(mask_index[j].sum().item())
             k_j = min(number_transfer_tokens, masked_j)
             if k_j > 0:
                 conf_row = full_conf_now[j]  # float32
                 _, sel_idx = torch.topk(conf_row, k=k_j, largest=True)
+                # 写入
                 x[j, sel_idx] = full_x0[j, sel_idx]
                 gen_mask[j, sel_idx] = True
+                # 历史 max & EMA（仅对当步写入位置更新）
                 fixed_conf[j, sel_idx] = torch.maximum(fixed_conf[j, sel_idx], conf_row[sel_idx])
+                ema_conf[j, sel_idx] = ema_beta * ema_conf[j, sel_idx] + (1 - ema_beta) * conf_row[sel_idx]
+                written_step[j, sel_idx] = step
+            # 3) 目标累计配额（与 vanilla 同口径）
             init_m = int(init_mask_count[j].item())
+            target_cum = init_m if step >= total_steps - 1 else int(init_m * (1.0 - (s.item() / t.item())))
             current_gen = int(gen_mask[j].sum().item())
             over = max(0, current_gen - target_cum)
             if over > 0:
+                # 只能从“非本步写入”的已确认里回遮，避免抖动
                 gen_idx = torch.where(gen_mask[j])[0]
                 if gen_idx.numel() > 0:
+                    # 排除刚写入的
+                    not_just_written = written_step[j, gen_idx] < step
+                    candidates = gen_idx[not_just_written]
+                    if candidates.numel() > 0:
+                        over = min(over, int(candidates.numel()))
+                        cand_ema = ema_conf[j, candidates]  # float32
+                        _, low_local = torch.topk(cand_ema, k=over, largest=False)
+                        low_global = candidates[low_local]
+                        # 回遮
+                        x[j, low_global] = mask_token_id
+                        gen_mask[j, low_global] = False
+                        # 适度清理 EMA，max 保留帮助后续稳定
+                        ema_conf[j, low_global] = 0.0
+                        written_step[j, low_global] = -1  # 重置写入步
+                        # fixed_conf 不清零，保留历史峰值作为“锚”信息
     def _validate_generated_length(self, generation_config, input_ids_length, has_default_max_length):
         if is_torchdynamo_compiling():
             warnings.warn(
                 "You are calling .generate() with the `input_ids` being on a device type different"
                 f" than your model's device. `input_ids` is on {input_ids.device.type}, whereas the model"
+                f" is on {self.device.type}. You may experience unexpected behaviors or slower generation.",
                 UserWarning,
             )
         if (
         generation_tokens_hook_func,
         generation_logits_hook_func
     ) -> Union[DreamModelOutput, torch.LongTensor]:
         output_history = generation_config.output_history
         return_dict_in_generate = generation_config.return_dict_in_generate
         max_length = generation_config.max_length
         top_p = generation_config.top_p
         top_k = generation_config.top_k
+        # RCR
         rcr = generation_config.rcr
         conf_alg = generation_config.conf_alg
         timesteps = torch.linspace(1, eps, steps + 1, device=x.device)
+        # ===== RCR 缓冲初始化（关键：float32，避免 dtype 冲突） =====
         if rcr:
+            init_mask_count = (x == mask_token_id).sum(dim=1)                  # [B]
+            fixed_conf = torch.full(x.shape, -1e9, dtype=torch.float32, device=x.device)  # 历史 max
+            ema_conf   = torch.zeros_like(fixed_conf, dtype=torch.float32)     # EMA
+            gen_mask   = torch.zeros_like(x, dtype=torch.bool)                 # 已确认集合
+            written_step = torch.full(x.shape, -1, dtype=torch.int32, device=x.device)    # 写入步
         else:
             init_mask_count = None
             fixed_conf = None
+            ema_conf = None
             gen_mask = None
+            written_step = None
         x = generation_tokens_hook_func(None, x, None)
         for i in range(steps):
             mask_index = (x == mask_token_id)
             logits = self(x, attention_mask, tok_idx).logits
             logits = torch.cat([logits[:, :1], logits[:, :-1]], dim=1)
             logits = generation_logits_hook_func(i, x, logits)
             mask_logits = logits[mask_index]
             s = timesteps[i + 1]
             if alg == 'origin':
                 p_transfer = 1 - s / t if i < steps - 1 else 1
                 x0 = torch.zeros_like(x[mask_index], device=self.device, dtype=torch.long) + mask_token_id
                 transfer_index_t_s = torch.rand(*x0.shape, device=self.device) < p_transfer
                 )
                 x[mask_index] = x0.clone()
             else:
                 use_alg = conf_alg if rcr else alg
                 if use_alg == 'maskgit_plus':
                     confidence, x0 = sample_tokens(mask_logits, temperature=temperature, top_p=top_p, top_k=top_k)
                     raise RuntimeError(f"Unknown alg/conf_alg: {use_alg}")
                 if rcr:
+                    # —— 贴近 vanilla 的历史置信度 RCR ——
                     self._apply_rcr_logic(
                         x=x,
                         x0=x0,
+                        conf_now=confidence.to(torch.float32),
                         mask_index=mask_index,
                         fixed_conf=fixed_conf,
+                        ema_conf=ema_conf,
                         gen_mask=gen_mask,
+                        written_step=written_step,
                         init_mask_count=init_mask_count,
                         mask_token_id=mask_token_id,
                         step=i,
                         total_steps=steps,
                         s=s, t=t,
+                        ema_beta=0.8,
                     )
                 else:
+                    # —— vanilla：本步 top-k 永久确认 ——
                     avg_mask_now = (mask_index.sum().item() / max(1, mask_index.shape[0]))
                     ratio = (1.0 - (s.item() / t.item())) if i < steps - 1 else 1.0
                     number_transfer_tokens = int(avg_mask_now * ratio)
                         x[row_indices, transfer_index] = x_[row_indices, transfer_index]
             x = generation_tokens_hook_func(i, x, logits)
             if histories is not None:
                 histories.append(x.clone())
         if return_dict_in_generate:
+            return DreamModelOutput(sequences=x, history=histories)
         else:
             return x