Saya berhasil mengonversi dua kolom menjadi matriks menggunakan perintah berikut.

# Read sheet "a" from `datab` into a DataFrame.
# NOTE(review): `datab` is not defined in this snippet — presumably a
# pandas.ExcelFile; confirm against the asker's setup.
dfb = datab.parse("a")

# Echo the frame (REPL-style) to show the input data.
dfb

    Name       Product
0   Mike       Apple,pear
1   John       Orange,Banana
2   Bob        Banana
3   Connie      Pear


# Asker's attempt: one-hot encode the raw Product strings, then take the
# per-Name maximum. Each whole string (e.g. "Apple,pear") is treated as a
# single category, so comma-separated products are NOT split into columns.
pd.get_dummies(dfb.Product).groupby(dfb.Name).apply(max)


    Apple,pear  Banana  Orange,Banana   Pear
Name                
Bob         0   1   0   0
Connie      0   0   0   1
John        0   0   1   0
Mike        1   0   0   0

Namun, matriks yang ingin saya miliki adalah sebagai berikut.

      Apple     Banana  Orange  Pear
Name                
Bob        0    1   0   0
Connie     0    0   0   1
John       0    1   1   0
Mike       1    0   0   1
3
sfhotmail 16 Agustus 2017, 08:01

2 jawaban

Jawaban Terbaik

Lihat Pengukuran Waktu di Bawah

Opsi 1

def pir0(dfb):
    """Build a Name x Product 0/1 indicator matrix via a matrix product.

    Parameters
    ----------
    dfb : pandas.DataFrame
        Frame with a ``Name`` column and a ``Product`` column whose values
        are comma-separated product names.

    Returns
    -------
    pandas.DataFrame
        One row per distinct name and one column per distinct title-cased
        product; a cell is 1 when the name has that product, otherwise 0.
    """
    # Request integer indicators explicitly: recent pandas returns booleans
    # from get_dummies by default, and the product below should be numeric.
    names = pd.get_dummies(dfb.Name, dtype=int)
    # Title-case first so "pear" and "Pear" end up in the same column.
    products = dfb.Product.str.title().str.get_dummies(',')
    # (names x rows) . (rows x products) counts how often each pair occurs.
    counts = names.T.dot(products)
    # A pair seen on several rows would give a count > 1; clamp to 0/1.
    return (counts > 0).astype(int)
pir0(dfb)

        Apple  Banana  Orange  Pear
Bob         0       1       0     0
Connie      0       0       0     1
John        0       1       1     0
Mike        1       0       0     1

Opsi 2

from cytoolz import concat

def pir1(dfb):
    """Build a Name x Product 0/1 indicator matrix with ``numpy.bincount``.

    Parameters
    ----------
    dfb : pandas.DataFrame
        Frame with a ``Name`` column and a ``Product`` column whose values
        are comma-separated product names.

    Returns
    -------
    pandas.DataFrame
        Rows are the distinct names and columns the distinct title-cased
        products, both in order of first appearance; cells are 0 or 1.
    """
    f0, u0 = pd.factorize(dfb.Name.values)
    per_row = [x.title().split(',') for x in dfb.Product.values.tolist()]
    lengths = [len(items) for items in per_row]
    # Flatten to an object ndarray: handing pd.factorize a plain list is
    # deprecated in recent pandas.
    flat = np.array(
        [item for items in per_row for item in items], dtype=object)
    f1, u1 = pd.factorize(flat)
    n, m = u0.size, u1.size

    # Repeat each row's name code once per product, then encode every
    # (name, product) pair as a single position in a flattened n x m grid.
    cell = f0.repeat(lengths) * m + f1
    counts = np.bincount(cell, minlength=n * m).reshape(n, m)
    # bincount counts repeated pairs; clamp so the result is a 0/1 matrix.
    return pd.DataFrame((counts > 0).astype(int), index=u0, columns=u1)

pir1(dfb)

        Apple  Pear  Orange  Banana
Mike        1     1       0       0
John        0     0       1       1
Bob         0     0       0       1
Connie      0     1       0       0

Opsi 3

def pir2(dfb):
    """Build a Name x Product 0/1 indicator matrix by index assignment.

    Parameters
    ----------
    dfb : pandas.DataFrame
        Frame with a ``Name`` column and a ``Product`` column whose values
        are comma-separated product names.

    Returns
    -------
    pandas.DataFrame
        Rows are the distinct names and columns the distinct title-cased
        products, both in order of first appearance; cells are 0 or 1.
    """
    f0, u0 = pd.factorize(dfb.Name.values)
    per_row = [x.title().split(',') for x in dfb.Product.values.tolist()]
    lengths = [len(items) for items in per_row]
    # Flatten to an object ndarray: handing pd.factorize a plain list is
    # deprecated in recent pandas.
    flat = np.array(
        [item for items in per_row for item in items], dtype=object)
    f1, u1 = pd.factorize(flat)
    n, m = u0.size, u1.size

    a = np.zeros((n, m), dtype=int)
    # Repeat each row's name code once per product so it lines up with f1,
    # then mark every (name, product) cell; repeated pairs simply stay 1.
    a[f0.repeat(lengths), f1] = 1

    return pd.DataFrame(a, index=u0, columns=u1)

pir2(dfb)

        Apple  Pear  Orange  Banana
Mike        1     1       0       0
John        0     0       1       1
Bob         0     0       0       1
Connie      0     1       0       0

Pengukuran Waktu
Kode di Bawah

# Benchmark table: one row per replication factor, one column per candidate
# function. dtype=float keeps the timings numeric instead of object-typed.
results = pd.DataFrame(
    index=pd.Index([10, 30, 100, 300, 1000, 3000, 10000, 30000]),
    columns='pir0 pir1 pir2 jez0 jez1 jez2'.split(),
    dtype=float,
)

for i in results.index:
    # Grow the input by stacking `i` copies of the sample frame.
    d = pd.concat([dfb] * i, ignore_index=True)
    for j in results.columns:
        stmt = '{}(d)'.format(j)
        # timeit runs in its own namespace, so pull the data and the
        # function under test in from the running script.
        setp = 'from __main__ import d, {}'.format(j)
        # DataFrame.set_value was removed from pandas; .at is the
        # label-based scalar setter that replaces it.
        results.at[i, j] = timeit(stmt, setp, number=20)

# Log-log plot of total time (20 runs) against the replication factor.
ax = results.plot(loglog=True)
ax.legend(ncol=2)

[Gambar: grafik log-log waktu eksekusi setiap fungsi terhadap ukuran data]

def pir0(dfb):
    """Return a 0/1 Name x Product matrix built from a dummy-matrix product.

    ``dfb`` needs a ``Name`` column and a ``Product`` column of
    comma-separated product names; products are title-cased before encoding.
    """
    name_flags = pd.get_dummies(dfb.Name)
    product_flags = dfb.Product.str.title().str.get_dummies(',')
    # Names-by-rows times rows-by-products: one total per (name, product).
    totals = name_flags.T.dot(product_flags)
    # Round-trip through bool so any positive total collapses to 1.
    as_bool = totals.astype(bool)
    return as_bool.astype(int)

from cytoolz import concat

def pir1(dfb):
    """Build a Name x Product 0/1 indicator matrix with ``numpy.bincount``.

    Parameters
    ----------
    dfb : pandas.DataFrame
        Frame with a ``Name`` column and a ``Product`` column whose values
        are comma-separated product names.

    Returns
    -------
    pandas.DataFrame
        Rows are the distinct names and columns the distinct title-cased
        products, both in order of first appearance; cells are 0 or 1.
    """
    f0, u0 = pd.factorize(dfb.Name.values)
    per_row = [x.title().split(',') for x in dfb.Product.values.tolist()]
    lengths = [len(items) for items in per_row]
    # Flatten to an object ndarray: handing pd.factorize a plain list is
    # deprecated in recent pandas.
    flat = np.array(
        [item for items in per_row for item in items], dtype=object)
    f1, u1 = pd.factorize(flat)
    n, m = u0.size, u1.size

    # Encode every (name, product) pair as one position in a flattened
    # n x m grid and count the hits per position.
    cell = f0.repeat(lengths) * m + f1
    counts = np.bincount(cell, minlength=n * m).reshape(n, m)
    # Collapse counts to 0/1 by round-tripping through bool.
    return pd.DataFrame(
        counts.astype(bool).astype(int), index=u0, columns=u1)

def pir2(dfb):
    """Build a Name x Product 0/1 indicator matrix by index assignment.

    Parameters
    ----------
    dfb : pandas.DataFrame
        Frame with a ``Name`` column and a ``Product`` column whose values
        are comma-separated product names.

    Returns
    -------
    pandas.DataFrame
        Rows are the distinct names and columns the distinct title-cased
        products, both in order of first appearance; cells are 0 or 1.
    """
    f0, u0 = pd.factorize(dfb.Name.values)
    per_row = [x.title().split(',') for x in dfb.Product.values.tolist()]
    lengths = [len(items) for items in per_row]
    # Flatten to an object ndarray: handing pd.factorize a plain list is
    # deprecated in recent pandas.
    flat = np.array(
        [item for items in per_row for item in items], dtype=object)
    f1, u1 = pd.factorize(flat)
    n, m = u0.size, u1.size

    a = np.zeros((n, m), dtype=int)
    # Repeat each row's name code once per product so it lines up with f1,
    # then mark every (name, product) cell; repeated pairs simply stay 1.
    a[f0.repeat(lengths), f1] = 1

    return pd.DataFrame(a, index=u0, columns=u1)

def jez0(dfb):
    """Return indicator columns for the comma-separated ``Product`` values.

    The frame is indexed by ``Name`` first, so the result carries the names
    as its index. Product strings are used as-is (no case normalisation).
    """
    by_name = dfb.set_index('Name')
    return by_name['Product'].str.get_dummies(sep=',')

def jez1(dfb):
    """Return per-Name product counts from a split-and-encode pipeline.

    Parameters
    ----------
    dfb : pandas.DataFrame
        Frame with a ``Name`` column and a ``Product`` column whose values
        are comma-separated product names.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Name`` with one column per distinct product string
        (used as-is, no case normalisation); each cell is the number of
        times that product appears in the row's list.
    """
    # One column per position in the comma-separated list.
    parts = dfb.set_index('Name').Product.str.split(',', expand=True)
    # Empty prefix/separator so the dummy columns are named by product only;
    # the same product can then appear once per position column.
    dummies = pd.get_dummies(parts, prefix='', prefix_sep='')
    # Merge the duplicate product columns. groupby(axis=1) is deprecated in
    # recent pandas, so group the transposed frame by label and flip back.
    return dummies.T.groupby(level=0).sum().T

def jez2(dfb):
    """Return a Name-indexed indicator frame via ``MultiLabelBinarizer``.

    Each ``Product`` string is split on commas and the resulting label lists
    are passed to the binarizer; its classes become the column labels.
    """
    binarizer = MultiLabelBinarizer()
    label_lists = dfb.Product.str.split(',')
    # Fit before reading `classes_`, matching the original evaluation order.
    encoded = binarizer.fit_transform(label_lists)
    return pd.DataFrame(encoded, index=dfb.Name, columns=binarizer.classes_)
2
piRSquared 16 Agustus 2017, 07:14