import gradio as gr
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder


def difference_hash(input_data):
    """
    Computes a dHash-style difference hash for the given input data.

    Args:
        input_data (str): The input data as CSV-formatted text.

    Returns:
        str: The difference hash value as a hexadecimal string.
    """
    # Parse the CSV text: one row per line, one cell per comma-separated field.
    lines = input_data.splitlines()
    rows = [line.split(',') for line in lines]
    df = pd.DataFrame(rows).fillna('')

    # Label-encode every text column so the cells can be compared numerically.
    for col in df.select_dtypes(include=['object']).columns:
        encoder = LabelEncoder()
        df[col] = encoder.fit_transform(df[col])

    # Take row-to-row differences and keep one bit per difference:
    # 1 if the value did not decrease, 0 otherwise.
    values = df.values
    differences = np.diff(values, axis=0)
    hash_bits = [1 if diff >= 0 else 0 for diff in differences.flatten()]

    # Fewer than two rows produce no differences; fall back to a zero hash.
    if not hash_bits:
        return '0'

    # Pack the bits into an integer and keep at most 128 hex digits.
    hash_value = hex(int(''.join(str(bit) for bit in hash_bits), 2))[2:][:128]
    return hash_value
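

# Worked example (illustrative, assuming LabelEncoder's usual sorted-class
# ordering): for the CSV text "1,2\n3,1\n2,2" the label-encoded table is
# [[0, 1], [2, 0], [1, 1]], the row differences are [[2, -1], [-1, 1]],
# the sign bits are 1001, and the returned hash is "9".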


def hamming_distance(hash1, hash2):
    """
    Computes the Hamming distance between two hash values.

    Args:
        hash1 (str): The first hash value as a hexadecimal string.
        hash2 (str): The second hash value as a hexadecimal string.

    Returns:
        float: The Hamming distance between the two hash values, normalized
            by the larger bit count of the two (0.0 when they are identical).
    """
    int_hash1 = int(hash1, 16)
    int_hash2 = int(hash2, 16)

    # XOR leaves a 1 bit wherever the two hashes disagree.
    xor_value = int_hash1 ^ int_hash2
    hamming_dist = bin(xor_value).count('1')

    # Normalize by the larger population count of the two hashes.
    max_hamming_dist = max(bin(int_hash1).count('1'), bin(int_hash2).count('1'))
    if max_hamming_dist == 0:
        # Both hashes are zero, so they are identical.
        return 0.0

    return hamming_dist / max_hamming_dist
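

# Worked example (illustrative): for the hex hashes "f" (0b1111) and "e"
# (0b1110) the XOR is 0b0001, so the raw Hamming distance is 1 and the
# normalized distance is 1 / max(4, 3) = 0.25.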


def jaccard_similarity(text1, text2):
    """Computes the Jaccard similarity between the sets of lines of two texts."""
    lines1 = set(text1.splitlines())
    lines2 = set(text2.splitlines())
    intersection = len(lines1 & lines2)
    union = len(lines1 | lines2)
    if union == 0:
        # Two empty texts are treated as identical.
        return 1.0
    return intersection / union
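

# Worked example (illustrative): for the texts "a\nb\nc" and "b\nc\nd" the
# shared lines are {"b", "c"} and the union is {"a", "b", "c", "d"}, so the
# Jaccard similarity is 2 / 4 = 0.5.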


def read_file(file_path):
    """Reads a text file and returns its contents as a string."""
    with open(file_path, 'r') as file:
        return file.read()


def compare_datasets(ref_file, compare_files):
    """Compares each uploaded file against the reference file."""
    ref_text = read_file(ref_file.name)
    ref_hash = difference_hash(ref_text)

    outputs = []
    for file in compare_files:
        text = read_file(file.name)
        file_hash = difference_hash(text)

        hamming_dist = hamming_distance(ref_hash, file_hash)
        jaccard_sim = jaccard_similarity(ref_text, text)

        # Weight applied to the hash distance when combining the two scores.
        rate_of_change = 0.01

        # Scale the Jaccard similarity down by the weighted hash distance,
        # clamped to the [0, 1] range.
        combined_similarity = max(0, min(1, (1 - rate_of_change * hamming_dist) * jaccard_sim))

        print(f"For file {file.name}:")
        print("Combined Similarity:", combined_similarity)

        threshold = 0.1
        verdict = 'similar' if combined_similarity >= threshold else 'dissimilar'
        outputs.append(
            f"For file {file.name}:\n"
            f"Combined Similarity: {combined_similarity}\n"
            f"The datasets are {verdict}"
        )

    return "\n\n".join(outputs)
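

# Worked example (illustrative): with a normalized Hamming distance of 0.25 and
# a Jaccard similarity of 0.5, the combined score is (1 - 0.01 * 0.25) * 0.5
# = 0.49875, which is above the 0.1 threshold, so the files are reported as
# "similar".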


interface = gr.Interface(
    fn=compare_datasets,
    inputs=[
        gr.File(label="Reference File"),
        gr.File(label="Files to Compare", file_count="multiple"),
    ],
    outputs=gr.Textbox(label="Comparison Results"),
    title="Dataset Similarity Comparison",
    description="Compare datasets for similarity using a difference hash and Jaccard similarity.",
)

interface.launch(share=True)