dn6 (HF Staff) committed
Commit fac52b8 · verified · 1 Parent(s): ad1e96f

Upload folder using huggingface_hub

Files changed (3)
  1. README.md +46 -0
  2. block.py +226 -0
  3. modular_config.json +7 -0
README.md ADDED
@@ -0,0 +1,46 @@
# Florence2 Image Annotator

This is a custom block designed to annotate images via text prompts using the [Florence2](https://huggingface.co/microsoft/Florence-2-large) model. The block can be used to generate inpainting masks or bounding box annotations.

## How to use

```python
import torch
from diffusers.modular_pipelines import ModularPipelineBlocks, SequentialPipelineBlocks
from diffusers.modular_pipelines.stable_diffusion_xl import INPAINT_BLOCKS
from diffusers.utils import load_image

# fetch the Florence2 image annotator block that will create our mask
image_annotator_block = ModularPipelineBlocks.from_pretrained("diffusers/florence2-image-annotator", trust_remote_code=True)

my_blocks = INPAINT_BLOCKS.copy()
# insert the annotation block before the image encoding step
my_blocks.insert("image_annotator", image_annotator_block, 1)

# compose the blocks into a sequential inpainting pipeline
blocks = SequentialPipelineBlocks.from_blocks_dict(my_blocks)

repo_id = "diffusers-internal-dev/modular-sdxl-inpainting"
pipe = blocks.init_pipeline(repo_id)
pipe.load_default_components(torch_dtype=torch.float16, device_map="cuda", trust_remote_code=True)

image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true")
image = image.resize((1024, 1024))

prompt = ["A red car"]
annotation_task = "<REFERRING_EXPRESSION_SEGMENTATION>"
annotation_prompt = ["the car"]

output = pipe(
    prompt=prompt,
    image=image,
    annotation_task=annotation_task,
    annotation_prompt=annotation_prompt,
    annotation_output_type="mask_image",
    num_inference_steps=35,
    guidance_scale=7.5,
    strength=0.95,
    output_type="pil",
)
```
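
The block can also be run on its own to fetch the raw Florence2 predictions instead of building an inpainting mask. The snippet below is a minimal sketch that is not part of the original README: it assumes `init_pipeline()` can be called without a modular repo id (the block declares default repos for its components) and that the predictions are exposed through the block's `annotations` output.

```python
import torch
from diffusers.modular_pipelines import ModularPipelineBlocks
from diffusers.utils import load_image

# load just the annotator block and turn it into a standalone pipeline
image_annotator_block = ModularPipelineBlocks.from_pretrained("diffusers/florence2-image-annotator", trust_remote_code=True)
annotator = image_annotator_block.init_pipeline()  # assumption: no modular repo id is needed for a single block
annotator.load_default_components(torch_dtype=torch.float16, device_map="cuda")

image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true")

# "<OD>" detects objects without a text prompt, so an empty annotation prompt is passed
output = annotator(
    image=image,
    annotation_task="<OD>",
    annotation_prompt="",
    annotation_output_type="annotation",
)
# the raw predictions are stored in the block's "annotations" output
```

Setting `annotation_output_type="bounding_box"` instead draws the detections onto a copy of the input image, and `"mask_overlay"` paints a white mask over it.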
block.py ADDED
@@ -0,0 +1,226 @@
from typing import List, Union

import numpy as np
import torch
from diffusers.modular_pipelines import (
    ComponentSpec,
    InputParam,
    ModularPipelineBlocks,
    OutputParam,
    PipelineState,
)
from PIL import Image, ImageDraw
from transformers import Florence2ForConditionalGeneration, AutoProcessor


class Florence2ImageAnnotatorBlock(ModularPipelineBlocks):
    @property
    def expected_components(self):
        return [
            ComponentSpec(
                name="image_annotator",
                type_hint=Florence2ForConditionalGeneration,
                repo="florence-community/Florence-2-base-ft",
            ),
            ComponentSpec(
                name="image_annotator_processor",
                type_hint=AutoProcessor,
                repo="florence-community/Florence-2-base-ft",
            ),
        ]

    @property
    def inputs(self) -> List[InputParam]:
        return [
            InputParam(
                "image",
                type_hint=Union[Image.Image, List[Image.Image]],
                required=True,
                description="Image(s) to annotate",
            ),
            InputParam(
                "annotation_task",
                type_hint=Union[str, List[str]],
                required=True,
                default="<REFERRING_EXPRESSION_SEGMENTATION>",
                description="""Annotation task to perform on the image.
                Supported tasks:

                <OD>
                <REFERRING_EXPRESSION_SEGMENTATION>
                <CAPTION>
                <DETAILED_CAPTION>
                <MORE_DETAILED_CAPTION>
                <DENSE_REGION_CAPTION>
                <CAPTION_TO_PHRASE_GROUNDING>
                <OPEN_VOCABULARY_DETECTION>
                """,
            ),
            InputParam(
                "annotation_prompt",
                type_hint=Union[str, List[str]],
                required=True,
                description="""Annotation prompt to provide more context to the task.
                Can be used to detect or segment out specific elements in the image.
                """,
            ),
            InputParam(
                "annotation_output_type",
                type_hint=str,
                required=True,
                default="mask_image",
                description="""Output type for the annotation predictions. Available options are
                annotation:
                    - raw annotation predictions from the model based on the task type
                mask_image:
                    - black and white mask image for the given image based on the task type
                mask_overlay:
                    - white mask overlaid on the original image
                bounding_box:
                    - bounding boxes drawn on the original image
                """,
            ),
            InputParam(
                "annotation_overlay",
                type_hint=bool,
                required=True,
                default=False,
                description="",
            ),
            InputParam(
                "fill",
                type_hint=str,
                default="white",
                description="Fill colour used when drawing mask polygons.",
            ),
        ]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        return [
            OutputParam(
                "mask_image",
                type_hint=Image,
                description="Inpainting mask for the input image(s)",
            ),
            OutputParam(
                "annotations",
                type_hint=dict,
                description="Annotation predictions for the input image(s)",
            ),
            OutputParam(
                "image",
                type_hint=Image,
                description="Annotated input image(s)",
            ),
        ]

    def get_annotations(self, components, images, prompts, task):
        # Florence2 expects the task token to be prepended to the text prompt
        task_prompts = [task + prompt for prompt in prompts]

        inputs = components.image_annotator_processor(
            text=task_prompts, images=images, return_tensors="pt"
        ).to(components.image_annotator.device, components.image_annotator.dtype)

        generated_ids = components.image_annotator.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=1024,
            early_stopping=False,
            do_sample=False,
            num_beams=3,
        )
        annotations = components.image_annotator_processor.batch_decode(
            generated_ids, skip_special_tokens=False
        )
        outputs = []
        for image, annotation in zip(images, annotations):
            outputs.append(
                components.image_annotator_processor.post_process_generation(
                    annotation, task=task, image_size=(image.width, image.height)
                )
            )
        return outputs

    def prepare_mask(self, images, annotations, overlay=False, fill="white"):
        masks = []
        for image, annotation in zip(images, annotations):
            mask_image = image.copy() if overlay else Image.new("L", image.size, 0)
            draw = ImageDraw.Draw(mask_image)

            for _, _annotation in annotation.items():
                if "polygons" in _annotation:
                    for polygon in _annotation["polygons"]:
                        polygon = np.array(polygon).reshape(-1, 2)
                        if len(polygon) < 3:
                            continue
                        polygon = polygon.reshape(-1).tolist()
                        draw.polygon(polygon, fill=fill)

                elif "bbox" in _annotation:
                    bbox = _annotation["bbox"]
                    draw.rectangle(bbox, fill="white")

            masks.append(mask_image)

        return masks

    def prepare_bounding_boxes(self, images, annotations):
        outputs = []
        for image, annotation in zip(images, annotations):
            image_copy = image.copy()
            draw = ImageDraw.Draw(image_copy)
            for _, _annotation in annotation.items():
                bbox = _annotation["bbox"]
                label = _annotation["label"]

                draw.rectangle(bbox, outline="red", width=3)
                draw.text((bbox[0], bbox[1] - 20), label, fill="red")

            outputs.append(image_copy)

        return outputs

    def prepare_inputs(self, images, prompts):
        prompts = prompts or ""

        if isinstance(images, Image.Image):
            images = [images]
        if isinstance(prompts, str):
            prompts = [prompts]

        if len(images) != len(prompts):
            raise ValueError("Number of images and annotation prompts must match.")

        return images, prompts

    @torch.no_grad()
    def __call__(self, components, state: PipelineState) -> PipelineState:
        block_state = self.get_block_state(state)
        images, annotation_task_prompt = self.prepare_inputs(
            block_state.image, block_state.annotation_prompt
        )
        task = block_state.annotation_task
        fill = block_state.fill

        annotations = self.get_annotations(
            components, images, annotation_task_prompt, task
        )
        block_state.annotations = annotations
        if block_state.annotation_output_type == "mask_image":
            block_state.mask_image = self.prepare_mask(images, annotations)
        else:
            block_state.mask_image = None

        if block_state.annotation_output_type == "mask_overlay":
            block_state.image = self.prepare_mask(
                images, annotations, overlay=True, fill=fill
            )

        elif block_state.annotation_output_type == "bounding_box":
            block_state.image = self.prepare_bounding_boxes(images, annotations)

        self.set_block_state(state, block_state)

        return components, state
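
To make the annotation format concrete, here is a small, self-contained sketch (not part of the repository) of the polygon rasterization that `prepare_mask` performs. The annotation dict and its coordinates are invented for illustration and only mimic the structure returned by the processor's `post_process_generation` for a segmentation task.

```python
import numpy as np
from PIL import Image, ImageDraw

image = Image.new("RGB", (256, 256), "gray")
# hypothetical post-processed annotation containing a single quadrilateral polygon
annotation = {
    "<REFERRING_EXPRESSION_SEGMENTATION>": {
        "polygons": [[[60, 60, 200, 70, 190, 200, 50, 180]]],
        "labels": [""],
    }
}

# same drawing logic as prepare_mask with overlay=False
mask_image = Image.new("L", image.size, 0)
draw = ImageDraw.Draw(mask_image)
for _, _annotation in annotation.items():
    if "polygons" in _annotation:
        for polygon in _annotation["polygons"]:
            polygon = np.array(polygon).reshape(-1, 2)
            if len(polygon) < 3:
                continue
            draw.polygon(polygon.reshape(-1).tolist(), fill="white")

mask_image.save("mask.png")  # a white quadrilateral on a black background
```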
modular_config.json ADDED
@@ -0,0 +1,7 @@
{
  "_class_name": "Florence2ImageAnnotatorBlock",
  "_diffusers_version": "0.35.1",
  "auto_map": {
    "ModularPipelineBlocks": "block.Florence2ImageAnnotatorBlock"
  }
}