@@ -3683,7 +3683,7 @@ zio_ddt_child_write_done(zio_t *zio)
36833683 ddt_phys_variant_t v = DDT_PHYS_VARIANT (ddt , p );
36843684 ddt_univ_phys_t * ddp = dde -> dde_phys ;
36853685
3686- ddt_enter ( ddt );
3686+ mutex_enter ( & dde -> dde_io -> dde_io_lock );
36873687
36883688 /* we're the lead, so once we're done there's no one else outstanding */
36893689 if (dde -> dde_io -> dde_lead_zio [p ] == zio )
@@ -3700,21 +3700,24 @@ zio_ddt_child_write_done(zio_t *zio)
37003700 ddt_phys_unextend (ddp , orig , v );
37013701 ddt_phys_clear (orig , v );
37023702
3703+ mutex_exit (& dde -> dde_io -> dde_io_lock );
3704+
3705+ /*
3706+ * Undo the optimistic refcount increments that were done in
3707+ * zio_ddt_write() for all non-DDT-child parents. Since errors
3708+ * are rare, taking the global lock here is acceptable.
3709+ */
3710+ ddt_enter (ddt );
3711+ zio_t * pio ;
3712+ zl = NULL ;
3713+ while ((pio = zio_walk_parents (zio , & zl )) != NULL ) {
3714+ if (!(pio -> io_flags & ZIO_FLAG_DDT_CHILD ))
3715+ ddt_phys_decref (ddp , v );
3716+ }
37033717 ddt_exit (ddt );
37043718 return ;
37053719 }
37063720
3707- /*
3708- * Add references for all dedup writes that were waiting on the
3709- * physical one, skipping any other physical writes that are waiting.
3710- */
3711- zio_t * pio ;
3712- zl = NULL ;
3713- while ((pio = zio_walk_parents (zio , & zl )) != NULL ) {
3714- if (!(pio -> io_flags & ZIO_FLAG_DDT_CHILD ))
3715- ddt_phys_addref (ddp , v );
3716- }
3717-
37183721 /*
37193722 * We've successfully added new DVAs to the entry. Clear the saved
37203723 * state or, if there's still outstanding IO, remember it so we can
@@ -3725,7 +3728,7 @@ zio_ddt_child_write_done(zio_t *zio)
37253728 else
37263729 ddt_phys_copy (orig , ddp , v );
37273730
3728- ddt_exit ( ddt );
3731+ mutex_exit ( & dde -> dde_io -> dde_io_lock );
37293732}
37303733
37313734static void
@@ -3753,7 +3756,7 @@ zio_ddt_child_write_ready(zio_t *zio)
37533756 if (zio -> io_error != 0 )
37543757 return ;
37553758
3756- ddt_enter ( ddt );
3759+ mutex_enter ( & dde -> dde_io -> dde_io_lock );
37573760
37583761 ddt_phys_extend (dde -> dde_phys , v , zio -> io_bp );
37593762
@@ -3764,7 +3767,7 @@ zio_ddt_child_write_ready(zio_t *zio)
37643767 ddt_bp_fill (dde -> dde_phys , v , pio -> io_bp , zio -> io_txg );
37653768 }
37663769
3767- ddt_exit ( ddt );
3770+ mutex_exit ( & dde -> dde_io -> dde_io_lock );
37683771}
37693772
37703773static zio_t *
@@ -3799,11 +3802,11 @@ zio_ddt_write(zio_t *zio)
37993802 dde = ddt_lookup (ddt , bp , B_FALSE );
38003803 if (dde == NULL ) {
38013804 /* DDT size is over its quota so no new entries */
3805+ ddt_exit (ddt );
38023806 zp -> zp_dedup = B_FALSE ;
38033807 BP_SET_DEDUP (bp , B_FALSE );
38043808 if (zio -> io_bp_override == NULL )
38053809 zio -> io_pipeline = ZIO_WRITE_PIPELINE ;
3806- ddt_exit (ddt );
38073810 return (zio );
38083811 }
38093812
@@ -3814,6 +3817,7 @@ zio_ddt_write(zio_t *zio)
38143817 * we can't resolve it, so just convert to an ordinary write.
38153818 * (And automatically e-mail a paper to Nature?)
38163819 */
3820+ ddt_exit (ddt );
38173821 if (!(zio_checksum_table [zp -> zp_checksum ].ci_flags &
38183822 ZCHECKSUM_FLAG_DEDUP )) {
38193823 zp -> zp_checksum = spa_dedup_checksum (spa );
@@ -3826,7 +3830,6 @@ zio_ddt_write(zio_t *zio)
38263830 }
38273831 ASSERT (!BP_GET_DEDUP (bp ));
38283832 zio -> io_pipeline = ZIO_WRITE_PIPELINE ;
3829- ddt_exit (ddt );
38303833 return (zio );
38313834 }
38323835
@@ -3877,10 +3880,15 @@ zio_ddt_write(zio_t *zio)
38773880 uint8_t parent_dvas = 0 ;
38783881
38793882 /*
3880- * What we do next depends on whether or not there's IO outstanding that
3881- * will update this entry.
3883+ * What we do next depends on whether or not there's IO outstanding
3884+ * that will update this entry. If dde_io exists, we need to hold
3885+ * its lock to safely check and use dde_lead_zio.
38823886 */
3883- if (dde -> dde_io == NULL || dde -> dde_io -> dde_lead_zio [p ] == NULL ) {
3887+ ddt_entry_io_t * dde_io = dde -> dde_io ;
3888+ if (dde_io != NULL )
3889+ mutex_enter (& dde_io -> dde_io_lock );
3890+
3891+ if (dde_io == NULL || dde_io -> dde_lead_zio [p ] == NULL ) {
38843892 /*
38853893 * No IO outstanding, so we only need to worry about ourselves.
38863894 */
@@ -3895,6 +3903,8 @@ zio_ddt_write(zio_t *zio)
38953903 * block and leave.
38963904 */
38973905 if (have_dvas == 0 ) {
3906+ if (dde_io != NULL )
3907+ mutex_exit (& dde_io -> dde_io_lock );
38983908 ASSERT (BP_GET_BIRTH (bp ) == txg );
38993909 ASSERT (BP_EQUAL (bp , zio -> io_bp_override ));
39003910 ddt_phys_extend (ddp , v , bp );
@@ -3923,6 +3933,9 @@ zio_ddt_write(zio_t *zio)
39233933 * then we can just use them as-is.
39243934 */
39253935 if (have_dvas >= need_dvas ) {
3936+ if (dde_io != NULL )
3937+ mutex_exit (& dde_io -> dde_io_lock );
3938+
39263939 /*
39273940 * For rewrite operations, try preserving the original
39283941 * logical birth time. If the result matches the
@@ -3934,8 +3947,8 @@ zio_ddt_write(zio_t *zio)
39343947 ddt_bp_fill (ddp , v , bp , orig_logical_birth );
39353948 if (BP_EQUAL (bp , & zio -> io_bp_orig )) {
39363949 /* We can skip accounting. */
3937- zio -> io_flags |= ZIO_FLAG_NOPWRITE ;
39383950 ddt_exit (ddt );
3951+ zio -> io_flags |= ZIO_FLAG_NOPWRITE ;
39393952 return (zio );
39403953 }
39413954 }
@@ -3997,7 +4010,16 @@ zio_ddt_write(zio_t *zio)
39974010 * missed out.
39984011 */
39994012 ddt_bp_fill (ddp , v , bp , txg );
4000- zio_add_child (zio , dde -> dde_io -> dde_lead_zio [p ]);
4013+ piggyback :
4014+ zio_add_child (zio , dde_io -> dde_lead_zio [p ]);
4015+
4016+ /*
4017+ * Optimistically increment refcount for this parent.
4018+ * If the write fails, zio_ddt_child_write_done() will
4019+ * decrement for all non-DDT-child parents.
4020+ */
4021+ ddt_phys_addref (ddp , v );
4022+ mutex_exit (& dde_io -> dde_io_lock );
40014023 ddt_exit (ddt );
40024024 return (zio );
40034025 }
@@ -4023,11 +4045,8 @@ zio_ddt_write(zio_t *zio)
40234045 ASSERT (pio );
40244046 parent_dvas = pio -> io_prop .zp_copies ;
40254047
4026- if (parent_dvas >= need_dvas ) {
4027- zio_add_child (zio , dde -> dde_io -> dde_lead_zio [p ]);
4028- ddt_exit (ddt );
4029- return (zio );
4030- }
4048+ if (parent_dvas >= need_dvas )
4049+ goto piggyback ;
40314050
40324051 /*
40334052 * Still not enough, so we will need to issue to get the
@@ -4037,10 +4056,12 @@ zio_ddt_write(zio_t *zio)
40374056 }
40384057
40394058 if (is_ganged ) {
4059+ if (dde_io != NULL )
4060+ mutex_exit (& dde_io -> dde_io_lock );
4061+ ddt_exit (ddt );
40404062 zp -> zp_dedup = B_FALSE ;
40414063 BP_SET_DEDUP (bp , B_FALSE );
40424064 zio -> io_pipeline = ZIO_WRITE_PIPELINE ;
4043- ddt_exit (ddt );
40444065 return (zio );
40454066 }
40464067
@@ -4070,21 +4091,45 @@ zio_ddt_write(zio_t *zio)
40704091 * We are the new lead zio, because our parent has the highest
40714092 * zp_copies that has been requested for this entry so far.
40724093 */
4073- ddt_alloc_entry_io (dde );
4074- if (dde -> dde_io -> dde_lead_zio [p ] == NULL ) {
4094+ if (dde_io == NULL ) {
4095+ /*
4096+ * New dde_io. No lock needed since no other thread can have
4097+ * a reference yet.
4098+ */
4099+ ddt_alloc_entry_io (dde );
4100+ dde_io = dde -> dde_io ;
40754101 /*
40764102 * First time out, take a copy of the stable entry to revert
40774103 * to if there's an error (see zio_ddt_child_write_done())
40784104 */
4079- ddt_phys_copy (& dde -> dde_io -> dde_orig_phys , dde -> dde_phys , v );
4105+ ddt_phys_copy (& dde_io -> dde_orig_phys , dde -> dde_phys , v );
4106+ dde_io -> dde_lead_zio [p ] = cio ;
40804107 } else {
4081- /*
4082- * Make the existing chain our child, because it cannot
4083- * complete until we have.
4084- */
4085- zio_add_child (cio , dde -> dde_io -> dde_lead_zio [p ]);
4108+ if (dde_io -> dde_lead_zio [p ] == NULL ) {
4109+ /*
4110+ * First time out, take a copy of the stable entry
4111+ * to revert to if there's an error (see
4112+ * zio_ddt_child_write_done())
4113+ */
4114+ ddt_phys_copy (& dde_io -> dde_orig_phys , dde -> dde_phys ,
4115+ v );
4116+ } else {
4117+ /*
4118+ * Make the existing chain our child, because it
4119+ * cannot complete until we have.
4120+ */
4121+ zio_add_child (cio , dde_io -> dde_lead_zio [p ]);
4122+ }
4123+ dde_io -> dde_lead_zio [p ] = cio ;
4124+ mutex_exit (& dde_io -> dde_io_lock );
40864125 }
4087- dde -> dde_io -> dde_lead_zio [p ] = cio ;
4126+
4127+ /*
4128+ * Optimistically increment the refcount for this dedup write.
4129+ * If the write fails, zio_ddt_child_write_done() will decrement
4130+ * for all non-DDT-child parents.
4131+ */
4132+ ddt_phys_addref (ddp , v );
40884133
40894134 ddt_exit (ddt );
40904135
0 commit comments