# Source code for responsibly.we.utils

```python
import math

import gensim
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
from six import string_types
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score

WORD_EMBEDDING_MODEL_TYPES = (gensim.models.keyedvectors.KeyedVectors,
                              gensim.models.keyedvectors.BaseKeyedVectors,
                              gensim.models.fasttext.FastText,
                              gensim.models.word2vec.Word2Vec,
                              gensim.models.base_any2vec.BaseWordEmbeddingsModel,)  # pylint: disable=line-too-long

def round_to_extreme(value, digits=2):
    """Round the magnitude of `value` up to `digits` decimal places, keeping its sign."""
    place = 10**digits
    new_value = math.ceil(abs(value) * place) / place
    if value < 0:
        new_value = -new_value
    return new_value
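# Example (illustrative): the magnitude always rounds up, away from zero:
#   >>> round_to_extreme(0.1234)
#   0.13
#   >>> round_to_extreme(-0.1234)
#   -0.13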

def normalize(v):
    """Normalize a 1-D vector."""
    if v.ndim != 1:
        raise ValueError('v should be 1-D, {}-D was given'.format(
            v.ndim))
    norm = np.linalg.norm(v)
    if norm == 0:
        return v
    return v / norm
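# Example (illustrative): a 3-4-5 triangle scaled onto the unit circle:
#   >>> normalize(np.array([3.0, 4.0]))
#   array([0.6, 0.8])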

def cosine_similarity(v, u):
    """Calculate the cosine similarity between two vectors."""
    v_norm = np.linalg.norm(v)
    u_norm = np.linalg.norm(u)
    similarity = v @ u / (v_norm * u_norm)
    return similarity
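# Example (illustrative): two vectors at 45 degrees have cosine
# similarity cos(45 deg) of about 0.7071:
#   >>> cosine_similarity(np.array([1.0, 0.0]), np.array([1.0, 1.0]))
#   0.7071...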

def project_vector(v, u):
    """Project the vector v onto the direction u."""
    normalize_u = normalize(u)
    return (v @ normalize_u) * normalize_u

def reject_vector(v, u):
    """Reject the vector v from the direction u (the component of v orthogonal to u)."""
    return v - project_vector(v, u)

def project_reject_vector(v, u):
    """Project the vector v onto the direction u, and also return the rejection."""
    projected_vector = project_vector(v, u)
    rejected_vector = v - projected_vector
    return projected_vector, rejected_vector
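# The two parts decompose v exactly: projected + rejected == v
# (up to floating-point error). Illustrative check:
#   >>> proj, rej = project_reject_vector(np.array([3.0, 4.0]),
#   ...                                   np.array([1.0, 0.0]))
#   >>> proj, rej
#   (array([3., 0.]), array([0., 4.]))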

def project_params(u, v):
    """Project and reject the vector v onto the direction u, returning also the scalar projection."""
    normalize_u = normalize(u)
    projection = (v @ normalize_u)
    projected_vector = projection * normalize_u
    rejected_vector = v - projected_vector
    return projection, projected_vector, rejected_vector

def cosine_similarities_by_words(model, word, words):
    """Compute cosine similarities between a word and a set of other words."""

    assert isinstance(word, string_types), \
        'The argument `word` should be a string.'
    assert not isinstance(words, string_types), \
        'The argument `words` should not be a string.'

    vec = model[word]
    vecs = [model[w] for w in words]
    return model.cosine_similarities(vec, vecs)
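# Hypothetical usage, assuming `model` is a loaded gensim KeyedVectors
# whose vocabulary contains the words below:
#   >>> cosine_similarities_by_words(model, 'she', ['nurse', 'engineer'])
#   array([...], dtype=float32)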

def update_word_vector(model, word, new_vector):
    """Overwrite the vector of `word` in-place, keeping the normalized copy in sync."""
    model.vectors[model.vocab[word].index] = new_vector
    if model.vectors_norm is not None:
        model.vectors_norm[model.vocab[word].index] = normalize(new_vector)

def generate_one_word_forms(word):
    return [word.lower(), word.upper(), word.title()]

def generate_words_forms(words):
    return sum([generate_one_word_forms(word) for word in words], [])
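# Example (illustrative):
#   >>> generate_words_forms(['nurse'])
#   ['nurse', 'NURSE', 'Nurse']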

def take_two_sides_extreme_sorted(df, n_extreme,
                                  part_column=None,
                                  head_value='',
                                  tail_value=''):
    head_df = df.head(n_extreme)[:]
    tail_df = df.tail(n_extreme)[:]

    if part_column is not None:
        head_df[part_column] = head_value
        tail_df[part_column] = tail_value

    return (pd.concat([head_df, tail_df])
            .drop_duplicates()
            .reset_index(drop=True))
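# Illustrative example on a toy DataFrame (assumes `df` is already
# sorted, as the name implies):
#   >>> df = pd.DataFrame({'score': range(10)})
#   >>> take_two_sides_extreme_sorted(df, n_extreme=2)
#      score
#   0      0
#   1      1
#   2      8
#   3      9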

def assert_gensim_keyed_vectors(model):
    if not isinstance(model, WORD_EMBEDDING_MODEL_TYPES):
        type_names = (model_type.__name__
                      for model_type in WORD_EMBEDDING_MODEL_TYPES)
        raise TypeError('model should be one of the types'
                        ' ({}), not {}.'
                        .format(', '.join(type_names),
                                type(model)))

def most_similar(model, positive=None, negative=None,
                 topn=10, restrict_vocab=None, indexer=None,
                 unrestricted=True):
    """
    Find the top-N most similar words.

    Positive words contribute positively towards the similarity,
    negative words negatively.

    This function computes cosine similarity between a simple mean
    of the projection weight vectors of the given words and
    the vectors for each word in the model.
    The function corresponds to the `word-analogy` and `distance`
    scripts in the original word2vec implementation.

    Based on the Gensim implementation.

    :param model: Word embedding model of ``gensim.model.KeyedVectors``.
    :param list positive: List of words that contribute positively.
    :param list negative: List of words that contribute negatively.
    :param int topn: Number of top-N similar words to return.
    :param int restrict_vocab: Optional integer which limits the
                               range of vectors
                               which are searched for most-similar values.
                               For example, restrict_vocab=10000 would
                               only check the first 10000 word vectors
                               in the vocabulary order. (This may be
                               meaningful if you've sorted the vocabulary
                               by descending frequency.)
    :param bool unrestricted: Whether to allow words from the positive
                              or negative lists to appear in the results;
                              if False, they are filtered out.
    :return: Sequence of (word, similarity).
    """
    if topn is not None and topn < 1:
        return []

    if positive is None:
        positive = []
    if negative is None:
        negative = []

    model.init_sims()

    if (isinstance(positive, string_types)
            and not negative):
        # allow calls like most_similar('dog'),
        # as a shorthand for most_similar(['dog'])
        positive = [positive]

    if ((isinstance(positive, string_types) and negative)
            or (isinstance(negative, string_types) and positive)):
        raise ValueError('If positives and negatives are given, '
                         'both should be lists!')

    # default to 1.0 for positive and -1.0 for negative words
    positive = [
        (word, 1.0) if isinstance(word, string_types + (np.ndarray,))
        else word
        for word in positive
    ]
    negative = [
        (word, -1.0) if isinstance(word, string_types + (np.ndarray,))
        else word
        for word in negative
    ]

    # compute the weighted average of all words
    all_words, mean = set(), []
    for word, weight in positive + negative:
        if isinstance(word, np.ndarray):
            mean.append(weight * word)
        else:
            mean.append(weight * model.word_vec(word, use_norm=True))
            if word in model.vocab:
                all_words.add(model.vocab[word].index)

    if not mean:
        raise ValueError("Cannot compute similarity with no input.")
    mean = gensim.matutils.unitvec(np.array(mean)
                                   .mean(axis=0)).astype(float)

    if indexer is not None:
        return indexer.most_similar(mean, topn)

    limited = (model.vectors_norm if restrict_vocab is None
               else model.vectors_norm[:restrict_vocab])
    dists = limited @ mean

    if topn is None:
        return dists

    best = gensim.matutils.argsort(dists,
                                   topn=topn + len(all_words),
                                   reverse=True)

    # if not unrestricted, then ignore (don't return)
    # words from the input
    result = [(model.index2word[sim], float(dists[sim]))
              for sim in best
              if unrestricted or sim not in all_words]

    return result[:topn]
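# Hypothetical usage, assuming `model` is a loaded gensim KeyedVectors
# containing these words (the classic word2vec analogy); with
# unrestricted=False the input words themselves are filtered out:
#   >>> most_similar(model, positive=['king', 'woman'], negative=['man'],
#   ...              topn=3, unrestricted=False)
#   [('queen', ...), ...]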

def get_seed_vector(seed, bias_word_embedding):

    if seed == 'direction':
        positive_end = bias_word_embedding.positive_end
        negative_end = bias_word_embedding.negative_end
        bias_word_embedding._is_direction_identified()  # pylint: disable=protected-access
        seed_vector = bias_word_embedding.direction
    else:
        if seed == 'ends':
            positive_end = bias_word_embedding.positive_end
            negative_end = bias_word_embedding.negative_end
        else:
            positive_end, negative_end = seed

        seed_vector = normalize(bias_word_embedding.model[positive_end]
                                - bias_word_embedding.model[negative_end])

    return seed_vector, positive_end, negative_end
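# `seed` may be 'direction', 'ends', or an explicit pair of words.
# Hypothetical call, assuming `bias_we` is a responsibly
# BiasWordEmbedding with both words in its vocabulary:
#   >>> seed_vector, pos_end, neg_end = get_seed_vector(('she', 'he'),
#   ...                                                 bias_we)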

def plot_clustering_as_classification(X, y_true, random_state=1, ax=None):

    if ax is None:
        _, ax = plt.subplots(figsize=(10, 5))

    y_cluster = (KMeans(n_clusters=2, random_state=random_state)
                 .fit_predict(X))

    embedded_vectors = (TSNE(n_components=2, random_state=random_state)
                        .fit_transform(X))

    for y_value in np.unique(y_cluster):
        mask = y_cluster == y_value
        ax.scatter(embedded_vectors[mask, 0],
                   embedded_vectors[mask, 1],
                   label='Cluster {}'.format(y_value))

    ax.legend()

    acc = accuracy_score(y_true, y_cluster)

    return max(acc, 1 - acc)
```
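
A minimal usage sketch for `plot_clustering_as_classification`, assuming only that `X` is a 2-D array of vectors and `y_true` holds binary labels; the toy data below is illustrative, not part of the module:

```python
import numpy as np

# Toy data: two noisy point clouds standing in for word vectors.
rng = np.random.RandomState(0)
X = np.vstack([rng.normal(-1.0, 0.5, (50, 10)),
               rng.normal(+1.0, 0.5, (50, 10))])
y_true = np.array([0] * 50 + [1] * 50)

# The score is max(accuracy, 1 - accuracy), since KMeans assigns
# cluster labels arbitrarily.
score = plot_clustering_as_classification(X, y_true)
print('clustering-as-classification accuracy:', score)
```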