import collections
import random
import numpy as np
[docs]def absolute_proportions(proportions, count):
"""
Split a given integer into n parts according to len(proportions) so they sum up to count and
match the given proportions.
Args:
proportions (dict): Dict of proportions, with a identifier as key.
Returns:
dict: Dictionary with absolute proportions and same identifiers as key.
Example::
>>> absolute_proportions({'train': 0.5, 'test': 0.5}, 100)
{'train': 50, 'test': 50}
"""
# first create absolute values by flooring non-integer portions
relative_sum = sum(proportions.values())
abs_proportions = {
idx: int(count / relative_sum * prop_value)
for idx, prop_value in proportions.items()
}
# Now distribute the rest value randomly over the different parts
absolute_sum = sum(abs_proportions.values())
rest_value = count - absolute_sum
subset_keys = sorted(list(proportions.keys()))
for i in range(rest_value):
key = subset_keys[i % len(subset_keys)]
abs_proportions[key] += 1
return abs_proportions
[docs]def split_identifiers(identifiers, proportions, seed=None):
"""
Split the given identifiers by the given proportions.
This function is deterministic, given the same seed.
First the identifiers are sorted before shuffled using the given seed.
Args:
identifiers (list): List of identifiers (str).
proportions (dict): A dictionary containing the proportions with the identifier from the
input as key.
seed (int): Seed to use for random operations.
Returns:
dict: Dictionary containing a list of identifiers per part with the same key as the
proportions dict.
Example::
>>> split_identifiers(
>>> identifiers=['a', 'b', 'c', 'd'],
>>> proportions={'melvin' : 0.5, 'timmy' : 0.5}
>>> )
{'melvin' : ['a', 'c'], 'timmy' : ['b', 'd']}
"""
identifiers = sorted(identifiers)
# Shuffle to get random order
rand = random.Random()
rand.seed(a=seed)
rand.shuffle(identifiers)
abs_proportions = absolute_proportions(proportions, len(identifiers))
parts = {}
start_index = 0
for idx in sorted(abs_proportions.keys()):
proportion = abs_proportions[idx]
parts[idx] = identifiers[start_index:start_index + proportion]
start_index += proportion
return parts
[docs]def get_identifiers_splitted_by_weights(identifiers, proportions, seed=None):
"""
Divide the given identifiers based on the given proportions. But instead of randomly split
the identifiers it is based on category weights. Every identifier has a weight for any
number of categories. The target is, to split the identifiers in a way, so the sum of
category k within part x is proportional to the sum of category x over all parts
according to the given proportions. This is done by greedily insert the identifiers step by
step in a part which has free space (weight). If there are no fitting parts anymore, the one
with the least weight exceed is used.
This function is deterministic, given the same seed.
First the identifiers are sorted before shuffled using the given seed.
Args:
identifiers (dict): A dictionary containing the weights for each identifier (key). Per
item a dictionary of weights per category is given.
proportions (dict): Dict of proportions, with a identifier as key.
seed (int): Seed to use for random operations.
Returns:
dict: Dictionary containing a list of identifiers per part with the same key as the proportions dict.
Example::
>>> identifiers = {
>>> 'a': {'music': 2, 'speech': 1},
>>> 'b': {'music': 5, 'speech': 2},
>>> 'c': {'music': 2, 'speech': 4},
>>> 'd': {'music': 1, 'speech': 4},
>>> 'e': {'music': 3, 'speech': 4}
>>> }
>>> proportions = {
>>> "train" : 0.6,
>>> "dev" : 0.2,
>>> "test" : 0.2
>>> }
>>> get_identifiers_splitted_by_weights(identifiers, proportions)
{
'train': ['a', 'b', 'd'],
'dev': ['c'],
'test': ['e']
}
"""
identifier_keys = sorted(identifiers.keys())
# Shuffle to get random order
rand = random.Random()
rand.seed(a=seed)
rand.shuffle(identifier_keys)
# Get total weight per category
sum_per_category = collections.defaultdict(int)
for identifier, cat_weights in identifiers.items():
for category, weight in cat_weights.items():
sum_per_category[category] += weight
target_weights_per_part = collections.defaultdict(dict)
# Get target weight for each part and category
for category, total_weight in sum_per_category.items():
abs_proportions = absolute_proportions(proportions, total_weight)
for idx, proportion in abs_proportions.items():
target_weights_per_part[idx][category] = proportion
# Distribute items greedily
part_ids = sorted(list(proportions.keys()))
current_weights_per_part = {
idx: collections.defaultdict(int)
for idx in part_ids
}
result = collections.defaultdict(list)
for identifier in identifier_keys:
cat_weights = identifiers[identifier]
target_part = None
current_part = 0
weight_over_target = collections.defaultdict(int)
# Search for fitting part
while target_part is None and current_part < len(part_ids):
free_space = True
part_id = part_ids[current_part]
part_weights = current_weights_per_part[part_id]
for category, weight in cat_weights.items():
target_weight = target_weights_per_part[part_id][category]
current_weight = part_weights[category]
weight_diff = target_weight - current_weight - weight
weight_over_target[part_id] += weight_diff
if weight_diff < 0:
free_space = False
# If weight doesn't exceed target, place identifier in part
if free_space:
target_part = part_id
current_part += 1
# If not found fitting part, select the part with the least overweight
if target_part is None:
target_part = sorted(
weight_over_target.items(),
key=lambda x: x[1],
reverse=True
)[0][0]
result[target_part].append(identifier)
for category, weight in cat_weights.items():
current_weights_per_part[target_part][category] += weight
return result
[docs]def select_balanced_subset(items, select_count, categories, select_count_values=None, seed=None):
"""
Select items so the summed category weights are balanced.
Each item has a dictionary containing the category weights.
Items are selected until ``select_count`` is reached.
The value that is added to ``select_count`` for an item can be defined in the dictionary ``select_count_values``.
If this is not defined it is assumed to be 1, which means `select_count` items are selected.
Args:
items (dict): Dictionary containing items with category weights.
select_count (float): Value to reach for selected items.
categories (list): List of all categories.
select_count_values (dict): The select_count values to be used.
For example an utterance with multiple labels:
The category weights (label-lengths) are used for balance,
but the utterance-duration is used for reaching the select_count.
Returns:
list: List of item ids, containing ``number_of_items`` (or ``len(items)`` if smaller).
Example:
>>> items = {
>>> 'utt-1' : {'m': 1, 's': 0, 'n': 0},
>>> 'utt-2' : {'m': 0, 's': 2, 'n': 1},
>>> ...
>>> }
>>> select_balanced_subset(items, 5)
>>> ['utt-1', 'utt-3', 'utt-9', 'utt-33', 'utt-34']
"""
rand = random.Random()
rand.seed(seed)
if select_count_values is None:
select_count_values = {item_id: 1 for item_id in items.keys()}
if sum(select_count_values.values()) < select_count:
return list(items.keys())
available_item_ids = sorted(list(items.keys()))
weight_per_category = np.zeros(len(categories))
selected_item_ids = []
available_item_weights = []
current_select_count = 0
rand.shuffle(available_item_ids)
# Create dict with weights as vectors
for item_id in available_item_ids:
weights = items[item_id]
all_weights = np.zeros(len(categories))
for category, weight in weights.items():
all_weights[categories.index(category)] = float(weight)
available_item_weights.append(all_weights)
# Always add best next item
while current_select_count < select_count:
best_item_index = 0
best_item_id = None
best_item_dist = float('inf')
current_item_index = 0
while current_item_index < len(available_item_ids) and best_item_dist > 0:
item_id = available_item_ids[current_item_index]
item_weights = available_item_weights[current_item_index]
temp_total_weights = weight_per_category + item_weights
dist = temp_total_weights.var()
if dist < best_item_dist:
best_item_index = current_item_index
best_item_dist = dist
best_item_id = item_id
current_item_index += 1
weight_per_category += available_item_weights[best_item_index]
selected_item_ids.append(best_item_id)
del available_item_ids[best_item_index]
del available_item_weights[best_item_index]
current_select_count += select_count_values[best_item_id]
return selected_item_ids