[python] levenshtein
Viewer
*** This page was generated with the meta tag "noindex, nofollow". This happened because you selected this option before saving or the system detected it as spam. This means that this page will never get into the search engines and the search bot will not crawl it. There is nothing to worry about, you can still share it with anyone.
- import Levenshtein
- import xml.etree.ElementTree as ET
def compare_strings(string1, string2):
    """Return the similarity of two strings as a percentage.

    The score is ``(1 - distance / max_length) * 100`` where ``distance`` is
    the Levenshtein edit distance and ``max_length`` is the length of the
    longer string.

    Args:
        string1: First string to compare.
        string2: Second string to compare.

    Returns:
        A float percentage; identical strings (including two empty
        strings) score 100.0.
    """
    max_len = max(len(string1), len(string2))
    # Two empty strings are identical; returning early also avoids the
    # division by zero that the ratio below would otherwise trigger.
    if max_len == 0:
        return 100.0
    distance = Levenshtein.distance(string1, string2)
    # Normalise the edit distance by the longer string's length.
    similarity_ratio = 1 - (distance / max_len)
    return similarity_ratio * 100
def extract_text_from_xml(xml_string, excluded_tags=None):
    """Collect the stripped text of every element, grouped by tag name.

    Args:
        xml_string: XML document as a string.
        excluded_tags: Optional collection of tag names to leave out. Only
            the matching element itself is skipped; its descendants are
            still visited.

    Returns:
        A dict mapping each visited tag to the list of its non-empty,
        whitespace-stripped text fragments, in document order. A tag whose
        elements carry no text maps to an empty list.
    """
    root = ET.fromstring(xml_string)
    text = {}
    for elem in root.iter():
        if excluded_tags and elem.tag in excluded_tags:
            continue
        fragments = text.setdefault(elem.tag, [])
        # Indentation between child elements shows up as whitespace-only
        # text; strip first so such nodes do not add empty fragments.
        stripped = elem.text.strip() if elem.text else ''
        if stripped:
            fragments.append(stripped)
    return text
def read_file_content(file_path):
    """Return the full text of the file at ``file_path``, decoded as UTF-8."""
    with open(file_path, mode='r', encoding='utf-8') as source:
        contents = source.read()
    return contents
def comparaison(xml_file1, xml_file2, excluded_tags=None):
    """Compare two XML files tag by tag and return similarity percentages.

    Both files are read with ``read_file_content`` and parsed with
    ``extract_text_from_xml``. For every tag found in the first file, the
    text fragments are joined with spaces and scored against the same tag
    of the second file with ``compare_strings``. A tag missing from the
    second file scores 0.0.

    The ``name``, ``mail`` and ``affiliation`` scores are averaged into a
    single ``auteurs`` entry and removed. ``total`` is the mean of all
    remaining entries (including ``auteurs``).

    Args:
        xml_file1: Path of the first XML file; also reported as ``filename``.
        xml_file2: Path of the second XML file.
        excluded_tags: Optional collection of tag names to ignore.

    Returns:
        A dict with the keys ``filename``, ``preamble``, ``titre``,
        ``auteurs``, ``introduction``, ``abstract``, ``conclusion``,
        ``biblio`` and ``total``, in that order. Sections absent from the
        first file are reported as 0.0 instead of raising ``KeyError``;
        they are not counted in ``total``.
    """
    text1 = extract_text_from_xml(read_file_content(xml_file1), excluded_tags)
    text2 = extract_text_from_xml(read_file_content(xml_file2), excluded_tags)

    similarity_ratios = {}
    for tag, fragments in text1.items():
        if tag in text2:
            similarity_ratios[tag] = compare_strings(
                ' '.join(fragments), ' '.join(text2[tag])
            )
        else:
            similarity_ratios[tag] = 0.0

    # Fold the author-related fields into one 'auteurs' score. A field that
    # is missing from the first file counts as 0.0, the same value used for
    # tags missing from the second file.
    fields_to_average = ('name', 'mail', 'affiliation')
    author_scores = [similarity_ratios.pop(field, 0.0) for field in fields_to_average]
    similarity_ratios['auteurs'] = sum(author_scores) / len(fields_to_average)

    # Overall score: mean of every remaining per-tag score.
    similarity_ratios['total'] = sum(similarity_ratios.values()) / len(similarity_ratios)
    similarity_ratios['filename'] = xml_file1

    desired_order = ['filename', 'preamble', 'titre', 'auteurs', 'introduction',
                     'abstract', 'conclusion', 'biblio', 'total']
    # .get() keeps the report shape stable when a section is absent.
    return {key: similarity_ratios.get(key, 0.0) for key in desired_order}
Editor
You can edit this paste and save as new: